In [None]:
%run oeai_py

In [None]:
# Create the OEAI helper instance used by the rest of this notebook.
# The constructor argument selects the platform: "Synapse" or "Fabric".
# NOTE(review): OEAI is not defined in this notebook; presumably it is brought
# into scope by the `%run oeai_py` cell above - confirm.
oeai = OEAI("Fabric")

In [None]:
# CHANGE VALUES FOR YOUR KEY VAULT
# Both values are passed to every oeai.get_secret() call below.
keyvault = "INSERT_YOUR_KEYVAULT_NAME_HERE" # Fabric requires full URL eg "https://key_vault_name.vault.azure.net/"
keyvault_linked_service = "INSERT_YOUR_LINKED_SERVICE_NAME_HERE"  # Not required for Fabric.

# OEA environment paths, each looked up as a named secret through oeai.get_secret().
# - bronze_path is concatenated directly with the endpoint directory name later in
#   this notebook (f"{bronze_path}{json_dir}/"), so its value needs to end with a
#   path separator.
# - silver_path and wonde_silver_path are joined with an explicit "/" later on.
bronze_path = oeai.get_secret(spark, "msgraph-rp-bronze", keyvault_linked_service, keyvault)
silver_path = oeai.get_secret(spark, "msgraph-silver", keyvault_linked_service, keyvault)
wonde_silver_path = oeai.get_secret(spark, "wonde-silver", keyvault_linked_service, keyvault)

In [None]:
# Define the mapping between JSON files and desired Delta table names.
# - Keys are the endpoint directory names read from under bronze_path; the key list
#   also drives which directories the load cell reads.
# - Values are the Delta table names written under silver_path.
# - An entry whose value is an empty string is skipped by the upsert cell.
delta_table_name_mapping = {
    "readingprogress": "fact_graph_ReadingProgress",
    "users": "dim_graph_User",
    "classes": "dim_graph_Class",
    "assignments": "dim_graph_Assignment",
}

In [None]:
# Per-endpoint column instructions consumed by apply_column_mappings().
# The outer key is the endpoint name (same keys as delta_table_name_mapping).
# Inside each endpoint dict:
#   "<column>": "drop"                 -> the column is dropped
#   "<column>": {"new_name": "<new>"}  -> the column is renamed to <new>; when the
#                                         source column is absent a null <new>
#                                         column is added instead
#   "add_columns": {"<name>": default} -> a literal column is added for each entry
#                                         (via withColumn, so an existing column of
#                                         that name is replaced)
# The "" -> "drop" entries use an empty column name.
# NOTE(review): they look like placeholders showing where real drops would go and
# presumably match no column - confirm.
column_mappings = {
    "readingprogress": {
        # Columns to drop (placeholder entry only)
        "": "drop", 
        # Renames - the "id" rename is disabled for this endpoint; a later cell builds
        # external_id from submissionId and submissionDateTime instead.
        #"id": {"new_name": "external_id"}, 
        # Columns to add, each with an empty-string default
        "add_columns": {
            "organisationkey": "",  
            "studentkey": "",  
            "readingprogresskey": "",  
            "external_id": "",
        }
    },
    "users": {
        # Columns to drop (placeholder entry only)
        "": "drop", 
        # Renames - "id" becomes external_id, the key used by the merge condition
        "id": {"new_name": "external_id"}, 
        # Columns to add, each with an empty-string default
        "add_columns": {
            "organisationkey": "",  
            "studentkey": "",  
            "userkey": "",  
        }
    },
    "classes": {
        # Columns to drop (placeholder entry only)
        "": "drop", 
        # Renames - "id" becomes external_id, the key used by the merge condition
        "id": {"new_name": "external_id"}, 
        # Columns to add, each with an empty-string default
        "add_columns": {
            "organisationkey": "",  
            "classkey": "",  
        }
    },
    "assignments": {
        # Columns to drop (placeholder entry only)
        "": "drop", 
        # Renames - "id" becomes external_id, the key used by the merge condition
        "id": {"new_name": "external_id"}, 
        # Columns to add, each with an empty-string default
        "add_columns": {
            "organisationkey": "",  
            "assignmentkey": "",  
        }
    },
}

In [None]:
# Load the bronze JSON for every endpoint into one DataFrame per endpoint.
#
# Each endpoint directory under bronze_path is read with a single
# spark.read.json() call, so all of the individual schools' JSON files found in
# that directory end up in one DataFrame.
#
# Results:
#   json_dfs / temp_dfs - endpoint name -> DataFrame (both names refer to the
#                         same dict, as before)
#   all_columns         - endpoint name -> set of column names that were read
#
# Every endpoint name is visited exactly once, so there is never an earlier
# DataFrame for the same endpoint to union with; the previous schema-alignment /
# union branch could not be reached and has been removed.
json_dfs = {}
temp_dfs = {}
all_columns = {}

json_dirs = list(delta_table_name_mapping.keys())

for json_dir in json_dirs:
    # bronze_path is expected to already end with a path separator.
    json_dir_path = f"{bronze_path}{json_dir}/"
    try:
        temp_df = spark.read.json(json_dir_path)
    except AnalysisException:
        # Raised by Spark when the directory cannot be resolved/read; the endpoint
        # is simply left out of json_dfs.
        print(f"Path does not exist: {json_dir_path}, skipping...")
        continue
    except Exception as e:
        print(f"An unexpected error occurred while processing {json_dir_path}: {e}")
        continue

    # Record the columns seen for this endpoint and keep the DataFrame.
    all_columns.setdefault(json_dir, set()).update(temp_df.columns)
    temp_dfs[json_dir] = temp_df

# Expose the loaded DataFrames under the name used by the following cells.
json_dfs = temp_dfs

In [None]:
def apply_column_mappings(df, mappings):
    """
    Apply drop, rename and add-column instructions to a DataFrame.

    The steps run in this order: drops, then renames, then additions.

    Args:
        df (DataFrame): The DataFrame to be modified.
        mappings (dict): Mapping instructions. Recognised entries:
            * ``"<column>": "drop"`` - drop ``<column>``.
            * ``"<column>": {"new_name": "<new>"}`` - rename ``<column>`` to
              ``<new>``. If ``<column>`` is absent and ``<new>`` does not exist
              yet, ``<new>`` is added as a null column. If ``<new>`` already
              exists it is left untouched, so applying the same mappings twice
              does not wipe a column that was renamed the first time.
            * ``"add_columns": {"<name>": default}`` - add ``<name>`` as a
              literal column holding ``default`` (replacing any existing column
              of that name).

    Returns:
        DataFrame: The modified DataFrame after applying the mappings.
    """
    # Drop columns
    drop_cols = [name for name, action in mappings.items() if action == "drop"]
    if drop_cols:
        df = df.drop(*drop_cols)

    # Rename columns or add new ones if they don't exist
    rename_mappings = {
        name: details['new_name']
        for name, details in mappings.items()
        if isinstance(details, dict) and 'new_name' in details
    }
    # Source columns are checked against the columns present before any rename,
    # so one rename never feeds another within the same call.
    columns_before_rename = df.columns
    for old_col, new_col in rename_mappings.items():
        if old_col in columns_before_rename:
            df = df.withColumnRenamed(old_col, new_col)
        elif new_col not in df.columns:
            # Source column missing and no target yet: add an empty target column.
            df = df.withColumn(new_col, lit(None))
        # Otherwise the target column already exists (e.g. the mappings were
        # applied before); keep its data rather than overwriting it with nulls.

    # Add new columns with default values
    add_columns = mappings.get("add_columns", {})
    for new_col, default_value in add_columns.items():
        df = df.withColumn(new_col, lit(default_value))

    return df

In [None]:
# Run every loaded DataFrame that has an entry in column_mappings through
# apply_column_mappings() and store the result back under the same endpoint name.
for json_name in list(json_dfs):
    df = json_dfs[json_name]
    if json_name not in column_mappings:
        continue
    df = apply_column_mappings(df, column_mappings[json_name])
    json_dfs[json_name] = df

In [None]:
# Build the merge key for the reading-progress rows.
#
# external_id is the concatenation of submissionId and submissionDateTime. The
# step is skipped when the endpoint was not loaded or holds no rows. Any error
# raised while building the column (for example a missing source column) is
# reported and the DataFrame is left unchanged.
try:
    if 'readingprogress' not in json_dfs:
        print("No 'readingprogress' DataFrame was loaded, skipping the operation.")
    elif json_dfs['readingprogress'].take(1):
        # take(1) fetches at most one row: enough to tell "empty" from "not empty"
        # without counting the whole dataset.
        df = json_dfs['readingprogress']
        df = df.withColumn("external_id", concat(col("submissionId"), col("submissionDateTime")))
        json_dfs['readingprogress'] = df
    else:
        print("DataFrame is empty, skipping the operation.")
except Exception as e:
    print(f"An error occurred: {e}")

In [None]:
def add_missing_columns(df_to_adjust, df_reference):
    """
    Add to ``df_to_adjust`` every column of ``df_reference`` that it lacks.

    Each added column is null, cast to the data type the column has in
    ``df_reference``. Columns are added in the order they appear in
    ``df_reference`` so the resulting column order is the same on every run.

    Args:
        df_to_adjust (DataFrame): DataFrame that may be missing columns.
        df_reference (DataFrame): DataFrame whose columns and types are the target.

    Returns:
        DataFrame: ``df_to_adjust`` with the missing columns appended; the same
        object is returned when nothing is missing.
    """
    present = set(df_to_adjust.columns)
    reference_schema = df_reference.schema
    for column in df_reference.columns:
        if column in present:
            continue
        present.add(column)  # a name repeated in the reference is only added once
        df_to_adjust = df_to_adjust.withColumn(
            column, lit(None).cast(reference_schema[column].dataType)
        )
    return df_to_adjust

In [None]:
# Process each DataFrame and upsert it to the silver_path.
#
# For every endpoint that has a non-blank Delta table name and at least one row:
#   * dim_graph_User only: look up studentkey by e-mail in the Wonde
#     dim_StudentExtended table and drop users for whom no student was found.
#   * Table already exists, source has no new columns: merge on external_id.
#   * Table already exists, source has new columns: copy the target (plus the new
#     null columns) to "<table>_new", merge into that copy, then overwrite the
#     original table with the copy.
#   * Table does not exist: generate a UUID per row and create the table.
#
# NOTE(review): only the create path generates uuid() values. Rows inserted by
# either merge take whatever the source DataFrame holds in the UUID column (null
# when add_missing_columns() had to add it) - confirm this is intended.
for json_name, df in json_dfs.items():

    # Only endpoints with a non-blank Delta table name are written.
    if json_name not in delta_table_name_mapping or delta_table_name_mapping[json_name] == "":
        continue

    # take(1) fetches at most one row: enough to skip empty inputs without
    # counting the whole dataset.
    if not df.take(1):
        continue

    # Get the Delta table name from the mapping
    delta_table_name = delta_table_name_mapping[json_name]
    silver_table_path = f"{silver_path}/{delta_table_name}"
    uuid_column_name = oeai.get_uuid_column_name(delta_table_name)

    if delta_table_name == "dim_graph_User":

        dim_student_df = spark.read.format("delta").load(f"{wonde_silver_path}/dim_StudentExtended").select("Email", "organisationkey", "studentkey")
        # Rename the 'studentkey' column from dim_student_df to avoid ambiguity
        dim_student_df = dim_student_df.withColumnRenamed("studentkey", "dim_studentkey")

        # Left join on the e-mail address, compared lower-cased and trimmed
        df_studjoined = df.alias("source").join(
            dim_student_df.alias("dim"),
            (trim(lower(col("source.mail"))) == trim(lower(col("dim.Email")))),
            "left"
        )

        # Use when() to decide which studentkey to keep: the looked-up key when
        # the join found one, otherwise the value already on the source row
        df_both_keys = df_studjoined.withColumn("studentkey",
                                when(col("dim.dim_studentkey").isNull(), col("source.studentkey"))
                                .otherwise(col("dim.dim_studentkey"))
                                ) \
                    .drop("dim.dim_studentkey") \
                    .select("source.*", "studentkey")

        df = df_both_keys
        # if the studentkey lookup has failed to find a student record then remove the related record for referential integrity
        df = df.filter((col("studentkey").isNotNull()) & (col("studentkey") != ""))

    # Set the update columns to update everything other than organisationkey and the UUID column
    update_columns = {
        column_name: f"source.{column_name}"
        for column_name in df.columns
        if column_name not in ['organisationkey', uuid_column_name]
    }

    if DeltaTable.isDeltaTable(spark, silver_table_path):
        delta_table = DeltaTable.forPath(spark, silver_table_path)
        target_df = delta_table.toDF()

        # Identify columns in source not in target, in source column order so the
        # evolved table schema is the same on every run
        target_columns = set(target_df.columns)
        new_columns = [column_name for column_name in df.columns if column_name not in target_columns]

        if new_columns:
            # Add new columns with nulls to the target DataFrame
            for new_col in new_columns:
                target_df = target_df.withColumn(new_col, lit(None).cast(df.schema[new_col].dataType))

            # Create a new Delta table with the updated schema from the target DataFrame
            new_table_path = silver_table_path + "_new"

            if DeltaTable.isDeltaTable(spark, new_table_path):
                print(f"Table at {new_table_path} exists. Deleting...")
                deltaTable = DeltaTable.forPath(spark, new_table_path)
                deltaTable.delete()

            target_df.write.format("delta").mode("overwrite").option("mergeSchema", "true").save(new_table_path)

            # Prepare the new Delta table for merging
            new_delta_table = DeltaTable.forPath(spark, new_table_path)

            # Adjust the source DataFrame to match the target schema, including any new columns
            df_adjusted = add_missing_columns(df, target_df)

            # Remove duplicates based on 'external_id', exactly as the plain merge
            # path below does; a merge fails when several source rows match the
            # same target row
            df_adjusted = df_adjusted.dropDuplicates(["external_id"])

            # Perform the merge operation
            new_delta_table.alias("target").merge(
                df_adjusted.alias("source"),
                "target.external_id = source.external_id"
            ).whenMatchedUpdate(set=update_columns
            ).whenNotMatchedInsertAll().execute()

            # Overwrite the old table with the new table's data
            spark.read.format("delta").load(new_table_path).write.format("delta").option("overwriteSchema", "true").mode("overwrite").save(silver_table_path)

            # Consider cleaning up the new_table_path if necessary

        else:
            df_adjusted = add_missing_columns(df, target_df)

            # Remove duplicates based on 'external_id' - note this is due to scholarpack holding multiple records for students that have left and rejoined with the same email address
            df_adjusted = df_adjusted.dropDuplicates(["external_id"])

            # Perform the merge operation with the adjusted source DataFrame
            delta_table.alias("target").merge(
                df_adjusted.alias("source"),
                "target.external_id = source.external_id"
            ).whenMatchedUpdate(set=update_columns
            ).whenNotMatchedInsertAll().execute()

    else:
        # If the table does not exist, create it by writing the current DataFrame
        # First, generate a UUID for all records in the new UUID column
        df = df.withColumn(uuid_column_name, expr("uuid()"))

        df.write.format("delta").mode("overwrite").save(silver_table_path)
         