In [None]:
import os
from pyspark.sql.types import *
from pyspark.sql.functions import col, lit, to_date

In [None]:
%run ees_env_var

In [None]:
# Registry of the Bronze datasets handled by this notebook, keyed by dataset name.
# Every entry starts as None and is replaced by a DataFrame in the load step.
dfs = dict.fromkeys([
    'ees_daily_22_23',
    'ees_daily_23_24',
    'ees_daily',
    'ees_weekly',
    'ees_ytd',
])

# NOTE(review): env_debug is not defined in this cell - presumably set by `%run ees_env_var`.
if env_debug:
    print(f"{dfs.keys()}\n")  # DEBUG

In [None]:
# Function: check_path_exists_databricks
# Description:
#     Reports whether a file or directory path exists in the Databricks filesystem.
#     The check is done by attempting to list the path with `dbutils.fs.ls`: a successful
#     listing means the path exists, any error raised by the attempt means it does not.
#
# Parameters:
#     path_to_check (str): The file or directory path in the Databricks filesystem to check for existence.
#
# Returns:
#     bool: `True` if the path could be listed, otherwise `False`.
#
# Example usage:
#     exists = check_path_exists_databricks("/mnt/data/my_folder")
#
# Notes:
#     - This function relies on the Databricks-provided `dbutils` global. When `dbutils` is not
#       defined (the notebook is running somewhere else), the resulting NameError is caught like
#       any other error and the function returns `False`.
#     - Only `Exception` subclasses are caught, so KeyboardInterrupt and SystemExit still
#       propagate and a running cell can be interrupted.
#     - `False` therefore means "could not be listed", which also covers permission or
#       connectivity problems, not only a missing path.

def check_path_exists_databricks(path_to_check):
    try:
        dbutils.fs.ls(path_to_check)  # For Databricks
    except Exception:
        # Deliberately broad: any failure to list (missing path, no access, no dbutils)
        # is reported as "does not exist" rather than raised to the caller.
        return False
    return True

In [None]:
# Function: check_path_exists_datalake
# Description:
#     Reports whether a path exists in Azure Data Lake / OneLake storage, using the Hadoop
#     FileSystem API through the active Spark session. The filesystem is resolved from the
#     path itself (its scheme and authority), so fully qualified locations that are not on
#     the session's default filesystem can be checked as well. Paths without a scheme are
#     resolved against the default filesystem.
#
# Parameters:
#     path_to_check (str): The file or directory path in the Azure Data Lake or OneLake storage to check.
#
# Returns:
#     bool: `True` if the filesystem reports that the path exists, otherwise `False`.
#
# Example usage:
#     exists = check_path_exists_datalake("abfss://container@account.dfs.core.windows.net/folder")
#
# Notes:
#     - Requires a `spark` session in scope with access to the storage system; if `spark` is
#       not defined, the resulting NameError is caught and the function returns `False`.
#     - Any exception raised during the check (e.g. permission issues, malformed path) is
#       caught and reported as `False`, so `False` means "could not be confirmed to exist".

def check_path_exists_datalake(path_to_check):
    try:
        hadoop_conf = spark._jsc.hadoopConfiguration()
        path = spark._jvm.org.apache.hadoop.fs.Path(path_to_check)

        # Ask the path for its own filesystem. FileSystem.get(conf) would return only the
        # default filesystem, which rejects paths belonging to another scheme/account.
        fs = path.getFileSystem(hadoop_conf)
        return bool(fs.exists(path))
    except Exception:
        # Best-effort check: every failure is reported as "does not exist".
        return False

In [None]:
# Function: add_missing_columns
# Description:
#     Makes sure every column named in a schema is present in a DataFrame. Each schema field
#     whose name is not already among the DataFrame's columns is appended as a column of
#     nulls, cast to the data type the schema declares for it. Existing columns are left
#     untouched (no renaming, dropping or casting happens here).
#
# Parameters:
#     df (DataFrame): The input DataFrame, which may lack some of the schema's columns.
#     defined_schema (StructType): The expected structure; each field supplies a column
#                                  name and a data type.
#
# Returns:
#     DataFrame: The input DataFrame with one extra null-valued column per missing schema
#                field. If nothing is missing, the input DataFrame itself is returned.
#
# Example usage:
#     df = add_missing_columns(df, defined_schema)
#
# Notes:
#     - The name comparison against `df.columns` is an exact (case-sensitive) string match.
#     - Useful when source files carry a varying subset of the expected columns.

def add_missing_columns(df, defined_schema):
    for expected in defined_schema:
        if expected.name in df.columns:
            continue  # already present - nothing to add

        # Typed null placeholder so the new column carries the schema's data type
        null_placeholder = lit(None).cast(expected.dataType)
        df = df.withColumn(expected.name, null_placeholder)

    return df

In [None]:
# Function: conform_to_schema
# Description:
#     Reshapes a DataFrame so that it matches a given schema. In a single projection it:
#     1. Keeps only the columns that are named in the defined schema (extra columns are dropped).
#     2. Casts each kept column to the data type the schema declares for it.
#     Columns come out in schema order.
#
# Parameters:
#     df (DataFrame): The input DataFrame that needs to be conformed to the defined schema.
#     defined_schema (StructType): The schema that defines the expected column names and data types.
#
# Returns:
#     DataFrame: A new DataFrame containing only schema columns, in schema order, cast to
#                the schema's data types.
#
# Example usage:
#     df = conform_to_schema(df, defined_schema)
#
# Notes:
#     - Schema columns that are absent from the DataFrame are skipped rather than created;
#       call `add_missing_columns` first if they should appear as null columns.
#     - The name comparison against `df.columns` is an exact (case-sensitive) string match.
#     - Columns not defined in the schema are dropped, so make sure the schema reflects the
#       desired structure to avoid unintended data loss.
#     - All casts are issued through one `select` instead of one `withColumn` call per
#       field, which keeps the query plan to a single projection.

def conform_to_schema(df, defined_schema):
    available = set(df.columns)

    # One projection: drop extras, order by schema, cast to the declared type.
    # alias() pins the output name to the schema's field name.
    projected = [
        col(field.name).cast(field.dataType).alias(field.name)
        for field in defined_schema
        if field.name in available
    ]

    return df.select(projected)

In [None]:
# Target schema applied to every dataset loaded by this notebook.
# All fields are nullable: one integer period, one date, string descriptors,
# and float percentage measures.
defined_schema = (
    StructType()
    .add("time_period", IntegerType(), True)
    .add("time_identifier", StringType(), True)
    .add("region_name", StringType(), True)
    .add("la_name", StringType(), True)
    .add("old_la_code", StringType(), True)
    .add("attendance_date", DateType(), True)
    .add("school_type", StringType(), True)
    .add("attendance_perc", FloatType(), True)
    .add("authorised_absence_perc", FloatType(), True)
    .add("unauthorised_absence_perc", FloatType(), True)
    .add("illness_perc", FloatType(), True)
    .add("appointments_perc", FloatType(), True)
    .add("unauth_hol_perc", FloatType(), True)
    .add("unauth_oth_perc", FloatType(), True)
    .add("unauth_late_registers_closed_perc", FloatType(), True)
    .add("unauth_not_yet_perc", FloatType(), True)
    .add("auth_religious_perc", FloatType(), True)
    .add("auth_study_perc", FloatType(), True)
    .add("auth_grt_perc", FloatType(), True)
    .add("auth_holiday_perc", FloatType(), True)
    .add("auth_excluded_perc", FloatType(), True)
    .add("auth_other_perc", FloatType(), True)
    .add("pa_perc", FloatType(), True)
)

In [None]:
# Load every registered Bronze dataset into a DataFrame and bring it in line with
# defined_schema (parse text dates, add absent columns, drop extras, cast data types).
for key in dfs:
    print(f"Loading [{key}] ...")

    # Bronze source for this dataset: CSV with a header row, column types inferred by Spark
    bronze_path = os.path.join(env_paths["bronze"], key)
    df = spark.read.csv(bronze_path, header=True, inferSchema=True)

    # Transforms go here
    # An attendance_date column that was inferred as text is parsed as dd/MM/yyyy
    date_is_text = (
        "attendance_date" in df.columns
        and isinstance(df.schema["attendance_date"].dataType, StringType)
    )
    if date_is_text:
        print(f"\tTransforming [{key}].[attendance_date]")
        parsed_date = to_date(col("attendance_date"), "dd/MM/yyyy")
        df = df.withColumn("attendance_date", parsed_date)

    # Add the schema columns the file lacks, then drop extras and cast to the schema types
    df = add_missing_columns(df, defined_schema)
    df = conform_to_schema(df, defined_schema)

    # Store the conformed DataFrame back in the registry
    dfs[key] = df
    print(f"\trow count:\t{dfs[key].count()}")

In [None]:
# Fold the 22/23 and 23/24 daily datasets into the main daily dataset
dfs['ees_daily'] = (
    dfs['ees_daily']
    .unionByName(dfs['ees_daily_22_23'], allowMissingColumns=True)
    .unionByName(dfs['ees_daily_23_24'], allowMissingColumns=True)
)

# The per-season frames are now part of 'ees_daily'; remove them from the registry
for merged_key in ('ees_daily_22_23', 'ees_daily_23_24'):
    del dfs[merged_key]

# Report the row count of each remaining dataset
for key in dfs:
    print(f"{key}:\t{dfs[key].count()}")


In [None]:
# Write each dataset to its Silver Delta location ("fact_<key>"), merging with any
# data already stored there and removing duplicate rows.
for key in dfs:
    print(f"{key}:\t{dfs[key].count()}")

    delta_path = os.path.join(env_paths['silver'], "fact_" + key)
    if env_debug:
        print(f"\t{delta_path}")  # DEBUG

    # The target counts as existing if either existence check finds it
    target_missing = not (
        check_path_exists_databricks(delta_path) or check_path_exists_datalake(delta_path)
    )

    if target_missing:
        # First load for this dataset: only de-duplicate the incoming rows
        ees_combined = dfs[key].dropDuplicates()
        print(f"\tNo Previous Data")
    else:
        ees_existing = spark.read.format("delta").load(delta_path)

        # Print row count before union
        print(f"\tExisting:\t{ees_existing.count()}")

        # Stored rows + incoming rows, with exact duplicate rows removed
        ees_merged = ees_existing.unionByName(dfs[key], allowMissingColumns=True)
        ees_combined = ees_merged.dropDuplicates()

    # Replace the Delta table contents with the combined result
    print(f"\tWriting:\t{ees_combined.count()}")
    delta_writer = ees_combined.write.format("delta").mode("overwrite")
    delta_writer.option("mergeSchema", "true").save(delta_path)