In [1]:
%run oeai_py

StatementMeta(, , -1, Finished, Available)

In [None]:
# Instantiate the OEAI helper class (brought into scope by the `%run oeai_py` cell above).
# It is used below for secret lookup, schema alignment and column mapping.
# NOTE(review): the earlier comment said the platform ("Synapse" or "Fabric") is set here, but
# no argument is passed - confirm that OEAI()'s default platform is the intended one.
oeai = OEAI()

In [None]:
# CHANGE VALUES FOR YOUR KEY VAULT
keyvault = "INSERT_KV_NAME"  # change for your KV name
keyvault_linked_service = "INSERT_LS_NAME"

# Synapse OEA environment paths, all read from Key Vault so nothing sensitive lives in the notebook.
bronze_path = oeai.get_secret(spark, "bromcom-bronze", keyvault_linked_service, keyvault)
silver_path = oeai.get_secret(spark, "bromcom-silver", keyvault_linked_service, keyvault)
silver_ref_path = oeai.get_secret(spark, "oeai-silver", keyvault_linked_service, keyvault)
gold_path = oeai.get_secret(spark, "gold-path", keyvault_linked_service, keyvault)

# "bromcom-ids" is a comma-separated list of school ids; each id names one bronze sub-directory.
# Whitespace around an id and empty entries (e.g. a trailing comma) are discarded, otherwise they
# would turn into bronze paths such as " 1234/" or "" that never match a real directory.
school_ids_secret = oeai.get_secret(spark, "bromcom-ids", keyvault_linked_service, keyvault)
subdirectories = [school_id.strip() for school_id in school_ids_secret.split(",") if school_id.strip()]

# API credentials.
# NOTE(review): neither value is referenced in the cells visible here - confirm they are still needed.
appid = oeai.get_secret(spark, "bromcom-appid", keyvault_linked_service, keyvault)
token = oeai.get_secret(spark, "bromcom-appsecret", keyvault_linked_service, keyvault)

In [None]:
# Bronze JSON endpoint -> Silver Delta table name.
# Order matters to the cells below: they walk this mapping in order, so the two dimension
# tables that other tables look keys up in (organisation, student) are listed first.
delta_table_name_mapping = dict(
    [
        ("Schools.json", "dim_Organisation"),
        ("Students.json", "dim_Student"),
        ("StudentFlatView.json", "dim_StudentExtended"),
        # AttendanceSessions feeds the summary table; Attendances feeds the per-session table.
        ("AttendanceSessions.json", "fact_AttendanceSummary"),
        ("Attendances.json", "fact_AttendanceSession"),
    ]
)

In [None]:
def _column_mapping(drops, renames, adds):
    """Assemble the rule dict for one JSON endpoint.

    drops   -- source column names mapped to the marker string "drop"
    renames -- (source name, new name) pairs, stored as {"new_name": new name}
    adds    -- {column name: default value}, stored under the "add_columns" key

    Keys are inserted in that order (drops, renames, then "add_columns").
    """
    rules = {source_name: "drop" for source_name in drops}
    for source_name, new_name in renames:
        rules[source_name] = {"new_name": new_name}
    rules["add_columns"] = dict(adds)
    return rules


# Per-endpoint column rules, consumed by oeai.apply_column_mappings in a later cell.
column_mappings = {
    "Schools.json": _column_mapping(
        drops=[
            "postCode", "nationalityDescription", "town", "locality", "street",
            "flatNameNumber", "buildingNameNumber", "administrativeArea",
            "telephoneNumber", "faxNumber", "emailAddress", "leaid", "leaName",
            "councilTaxReference", "webSite", "headTitle", "headTeacherName",
            "currentSchool", "isPartOfMat", "dateJoinedMAT",
        ],
        renames=[
            ("schoolID", "external_id"),
            ("schoolName", "Organisation_Name"),
            ("establishmentNumber", "Establishment_Number"),
            ("uniqueReferenceNumber", "URN"),
            ("leaNumber", "LA_Code"),
        ],
        adds={
            "organisationkey": "",
            "addresskey": "",
            "UKPRN": "",
            "Organisation_Status": "Active",
            "last_updated": "",
        },
    ),
    "Students.json": _column_mapping(
        drops=[
            "admissionNumber", "legalFullName", "salutation", "ethnicityID", "bloodGroup",
            # NOTE(review): an empty column name is listed as a drop - confirm it is intentional.
            "",
            "upn", "startDate", "endDate", "preferredFullName", "formerLastName",
            "nhsNumber", "ethnicity", "nationality", "photoFileName", "prefix",
            "giftedAndTalented", "englishProficiencyDescription", "birthCertificateSeen",
            "honours", "formerUPN", "uci", "uln", "schoolUniqueReferenceNumber",
            "examNumber", "empty", "yearGroup", "tutorGroup", "yearGroupID", "tutorGroupID",
            "connexionsAgreementStateCode", "connexionsAgreement",
            "numberOfPositiveEvents", "numberOfNegativeEvents", "nCyearActual",
            "isPregnantStudent", "isStudentPaidUniformAllowance",
            "familyStructureDescription", "nationalCurriculumDescription", "inYear",
            "laInYearText", "homeLA", "recoupmentLA", "twoYearOldFundTypeDescription",
            "adoptedFromCareTypeName", "adoptedFromCareTypeDescription",
            "pupilPremiumFlag", "everFSM6Flag", "chronologicalYeargroup", "mailingPoint",
            "thirtyHourCode", "hasDisabilityLivingAllowance", "hasFreeEarlyEducation",
        ],
        renames=[
            ("studentID", "student_id"),
            ("dateOfBirth", "Date_Of_Birth"),
            ("preferredFirstName", "Forename"),
            ("gender", "Gender"),
            ("firstName", "Legal_Forename"),
            ("lastName", "Legal_Surname"),
            ("middleName", "Middle_Names"),
            # NOTE(review): lower-case "l" here, while StudentFlatView.json lists
            # "preferredLastName" - confirm this matches the Students.json column name.
            ("preferredlastName", "Surname"),
        ],
        adds={
            "organisationkey": "",
            "studentkey": "",
            "last_updated": "",
        },
    ),
    "AttendanceSessions.json": _column_mapping(
        drops=[
            "created_at", "year", "establishmentNumber", "lea", "leaDescription",
            "schoolName", "attendanceStartDate",
        ],
        renames=[
            ("attendanceMarks", "Attendance_Mark_String"),
            ("studentID", "student_id"),
            ("authorised", "Authorised_Absences"),
            ("possible", "Possible_marks"),
            ("attended", "Present"),
            ("unauthorised", "Unauthorised_Absences"),
        ],
        adds={
            "organisationkey": "",
            "attendancesummarykey": "",
            "studentkey": "",
            "approved_education_activity": "",
            "last_updated": "",
            "id": "",
            "late_after_registration": "",
            "late_before_registration": "",
            "missing_marks": "",
            "unexplained_absences": "",
            "updated_at": "",
        },
    ),
    "StudentFlatView.json": _column_mapping(
        drops=[
            "yssaName", "yssaDescription", "usualMealDescription", "uniqueLearnerNumber",
            "unauthorisedAttendancePercentageClass", "unauthorisedAttendancePercentageAMPM",
            "unauthorisedAttendanceCountClass", "unauthorisedAttendanceCountAMPM",
            "uci", "tutorNames", "tutorGroupName", "travelModeDescription",
            "totalNumberOfExcludedDays", "totalMinutesLateClass", "totalMinutesLateAMPM",
            "systemDate", "surgeryName", "surgeryDescription", "studentTelephone",
            "studentSiblingListOnRoll", "studentSiblingList", "studentMainTown",
            "studentMainStreet", "studentMainPostCode", "studentMainFlatNameNumber",
            "studentMainBuildingNameNumber", "studentMainAdministrativeArea",
            "studentMainAddress", "admissionNumber", "adoptedFromCareTypeDescription",
            "adoptedFromCareTypeName", "ageYears", "ageYearsMonths",
            "allAbsencesAttendanceCountAMPM", "allAbsencesAttendanceCountClass",
            "allAbsencesAttendancePercentageClass", "allAbsencesPercentageAMPM",
            "attainmentLevelCode", "attainmentLevelDescription", "attendanceModeDescription",
            "attendancePercentageClassWithEA", "attendancePercentageClassnoEA",
            "authorisedAttendanceCountAMPM", "authorisedAttendanceCountClass",
            "authorisedAttendancePercentageAMPM", "authorisedAttendancePercentageClass",
            "birthCertificateSeen", "birthCountryDescription", "birthCountryName",
            "boarderStatusCode", "boarderStatusDescription", "carePlan", "carePlanStartDate",
            "careTypeName", "caringAuthority", "chronologicalYeargroup",
            "contact1Details", "contact1Email", "contact1Name",
            "contact1ParentalResponsibility", "contact1Relationship", "contact1Telephone",
            "contact2Details", "contact2Email", "contact2Name",
            "contact2ParentalResponsibility", "contact2Relationship", "contact2Telephone",
            "contact3Details", "contact3Email", "contact3Name",
            "contact3ParentalResponsibility", "contact3Relationship", "contact3Telephone",
            "countryArrivalDate", "dateOfBirth", "dateOfEntry", "daysToBirthday",
            "disabilityDescription", "disabilityName", "doctorFirstName", "doctorLastname",
            "dualRegisteredWith",
            "educationalActivityAttendanceCountAMPM", "educationalActivityAttendanceCountClass",
            "educationalActivityAttendancePercentageAMPM",
            "educationalActivityAttendancePercentageClass",
            "emergencyConsent", "examNumber", "firstName", "formerUPN", "genderCode",
            "genderDescription", "gypsyCodeName", "hasContactWithCorrespondence",
            "heSheCapitalised", "headOfHouse", "headOfYear", "heshe", "himHer", "hisHer",
            "hisHerCapitalised", "homeLanguage", "houseName",
            "isDataProcessingConsentGranted", "isDeprivationPupilPremium",
            "isEarlyYearPupilPremium", "isMobile",
            "kS1MathsScaledScore", "kS1ReadingScaledScore",
            "kS2MathsScaledScore", "kS2ReadingScaledScore",
            "lastName", "lateAttendanceCountAMPM", "lateAttendanceCountClass",
            "leavingReasonDescription", "legalFullName", "mainTutor", "mainTutorStaffCode",
            "medicalConditionTypesList", "medicalConditionsList", "medicalConditionsNotesList",
            "middleName", "monthofBirthName", "monthofBirthNumber",
            "nationalityDescription", "nationalityName", "notes", "numberOfExcludedSessions",
            "onReportDescription", "onReportReasonDescription", "parentalAddressee",
            "parentalSalutation", "personalEducationPlan", "phonicsScreeningCheckMark",
            "phonicsScreeningCheckOutcome", "plannedLearningHoursYear",
            "possiblesAttendanceCountAMPM", "possiblesAttendanceCountClass",
            "preferredFirstName", "preferredFullName", "preferredLastName",
            "presentAttendanceCountClass", "presentEACountAMPM", "presentPercentageWithEA",
            "previousPhaseSchool", "previousSchool", "privateNotes",
            "provisionDescription", "provisionName", "provisionStartDate",
            # "provisionEndDate" is deliberately not dropped (its entry was commented out).
            "recoupmentLA", "refugeeAsylumSeekerTypeName", "religiousAffiliationDescription",
            "reviewDate", "route", "secondLanguage", "serviceChildrenInEducationDescription",
            "sonDaughterCapitalised", "sondaughter",
            "sourceOfServiceChildrenInEducationCode",
            "sourceOfServiceChildrenInEducationDescription",
            "standardAdmissionTime", "standardLeavingTime", "studentContactCount",
            "studentContactList", "studentEmail", "serviceChildrenInEducationCode",
            "disadvantagedStudentsFlag", "enrolmentEndDate", "enrolmentStartDate",
            "enrolmentStateDescription", "fsmStartDate", "hasEverLeft",
            "isEligibleForFreeMeal", "isLeaver", "isLookedAfterPremium",
            "nonQualificationPlannedHours", "onReportStartDate", "provisionReviewDate",
            "qualificationPlannedHours", "senNeedList", "senPrimaryNeedDescription",
            "status", "studentMainLocality",
        ],
        renames=[
            ("yearGroup", "Current_Year"),
            ("upn", "UPN"),
            # NOTE(review): the other endpoints use "studentID" and the de-duplication step
            # refers to "student_ID" - confirm the spelling/casing of this source column.
            ("Student_ID", "student_id"),
            ("destinationSchool", "Leaver_Destination"),
            ("dateOfLeaving", "Leaving_Date"),
            ("ethnicityCode", "Ethnicity_Code"),
            ("ethnicityDescription", "Ethnicity"),
            ("firstLanguage", "First_Language"),
            ("ealFlag", "English_As_Additional_Language"),
            ("enrolmentStateName", "Enrolment_Status"),
            ("fsmIsActive", "Free_School_Meals"),
            ("isServiceChildPremium", "Service_Child_Indicator"),
            ("everFSM6Flag", "Free_School_Meals_6"),
            ("inCareFlag", "In_LEA_Care"),
            ("gntFlag", "Gifted_And_Talented_Status"),
            ("premiumPupilFlag", "Pupil_Premium_Indicator"),
            ("senPrimaryNeedCode", "SEN_Status"),
            ("isAdoptedFromCarePremium", "Ever_In_Care"),
        ],
        adds={
            "organisationkey": "",
            "studentextendedkey": "",
            "studentkey": "",
        },
    ),
    "Attendances.json": _column_mapping(
        drops=[
            "attendanceCommentInstant", "calendarEndDate", "calendarID", "calendarModelID",
            "calendarName", "collectionID", "recordedOn", "minute", "markSubcode",
            "markMeaningName", "locationID", "exportMark", "markMeaningDescription",
        ],
        renames=[
            ("mark", "Mark"),
            ("attendanceComment", "Comment"),
            ("calendarStartDate", "Date"),
            ("recordedBy", "staff_id"),
            ("attendanceID", "external_id"),
            ("studentID", "student_id"),
        ],
        adds={
            "organisationkey": "",
            "attendancesessionkey": "",
            "studentkey": "",
            "session": "",
        },
    ),
}

In [None]:
# Compile every school's bronze JSON export into one DataFrame per endpoint.
#
# For each school sub-directory and each mapped endpoint the JSON is read, tagged with the
# school's id in a "school_id" column, schema-aligned with what has been collected so far and
# unioned in. The result is json_dfs: {endpoint file name -> combined DataFrame}.
json_dfs = {}
temp_dfs = {}     # endpoint -> DataFrame accumulated so far
all_columns = {}  # endpoint -> every column name seen in any school's export

# Only endpoints that have a Delta table mapping are loaded. The list does not depend on the
# school, so it is built once rather than on every pass of the outer loop.
json_dirs = list(delta_table_name_mapping.keys())

for subdir in subdirectories:
    school_dir = f"{bronze_path}{subdir}/"

    for json_dir in json_dirs:
        json_dir_path = f"{school_dir}{json_dir}/"
        try:
            temp_df = spark.read.json(json_dir_path)
            temp_df = temp_df.withColumn("school_id", lit(subdir))
            # Remember every column seen for this endpoint so later schools can be padded to match
            all_columns.setdefault(json_dir, set()).update(temp_df.columns)

            if json_dir in temp_dfs:
                # Pad both sides so the accumulated frame and the new frame share one column set
                existing_columns = all_columns[json_dir]
                temp_df = oeai.add_missing_columns(temp_df, existing_columns)
                existing_df = oeai.add_missing_columns(temp_dfs[json_dir], temp_df.columns)

                # Best-effort union: a school whose frame cannot be aligned is reported and left
                # out so that one bad export does not stop the whole load.
                try:
                    temp_df = oeai.match_column_types(existing_df, temp_df)
                    temp_dfs[json_dir] = existing_df[sorted(existing_df.columns)].unionByName(temp_df[sorted(temp_df.columns)])
                except Exception as e:
                    # Name the path so it is clear whose rows are missing from the result
                    print(f"An unexpected error occurred while combining {json_dir_path}, rows not added: {e}")

            else:
                # First school seen for this endpoint
                temp_dfs[json_dir] = temp_df

        except AnalysisException as e:
            # Usually a missing path, but other analysis failures are caught here too, so the
            # underlying message is shown instead of always claiming the path does not exist.
            print(f"Could not read {json_dir_path} (path may not exist), skipping... {e}")
            continue

        except Exception as e:
            print(f"An unexpected error occurred while processing {json_dir_path}: {e}")
            continue

if "StudentFlatView.json" in temp_dfs:
    # The Bromcom endpoint supplies duplicate records that differ only in QualificationHours,
    # so keep one row per student per school.
    temp_dfs["StudentFlatView.json"] = temp_dfs["StudentFlatView.json"].dropDuplicates(["school_id", "student_ID"])
else:
    # Handle the case where the DataFrame does not exist
    print("DataFrame 'StudentFlatView.json' does not exist in temp_dfs")

# Publish the accumulated frames under the name the following cells use
json_dfs = temp_dfs

In [None]:
# Apply each endpoint's drop / rename / add rules. Endpoints without an entry in
# column_mappings are left untouched. Values are replaced in place (no keys are added or
# removed), so json_dfs keeps its order.
for json_name, df in list(json_dfs.items()):
    if json_name not in column_mappings:
        continue
    json_dfs[json_name] = oeai.apply_column_mappings(df, column_mappings[json_name])

In [None]:
# Shape the attendance-session frame to match the Silver schema:
#   * "session" becomes "AM" when the timestamp's hour is below 12, otherwise "PM"
#   * the timestamp in "Date" is split into "Time" (HH:mm:ss) and "Date" (yyyy-MM-dd)
#   * the attendance-code flags are joined on from the reference lookup
if "Attendances.json" in json_dfs:
    df_attendancesession = (
        json_dfs['Attendances.json']
        # "Hour" is a scratch column, needed only to derive the session
        .withColumn("Hour", hour("Date"))
        .withColumn("session", when(col("Hour") < 12, "AM").otherwise("PM"))
        .drop("Hour")
        # "Time" must be derived before "Date" is overwritten with its date-only form
        .withColumn("Time", date_format(col("Date"), "HH:mm:ss"))
        .withColumn("Date", date_format(col("Date"), "yyyy-MM-dd"))
    )

    # Read the attendance codes lookup table
    attendancecodes_path = silver_ref_path + "dim_AttendanceCodes.csv"
    df_codes = spark.read.csv(attendancecodes_path, header=True, inferSchema=True)

    # NOTE(review): this is an inner join, so attendance rows whose Mark is null or absent from
    # the lookup are dropped from the result - confirm that is wanted rather than a left join.
    df_joined = df_attendancesession.join(
        df_codes.select('Mark', 'is_present', 'is_aea', 'is_auth', 'is_unauth', 'is_nr', 'is_poss', 'is_attend'),
        on='Mark',
        how='inner'
    )

    json_dfs['Attendances.json'] = df_joined
    json_dfs['Attendances.json'].printSchema()
else:
    # Handle the case where the DataFrame does not exist
    print("'Attendances.json' does not exist in json_dfs")

In [None]:
# Upsert every mapped DataFrame into its Silver Delta table.
#
# * Tables are written in dependency order: dim_Organisation first, dim_Student second, then
#   everything else, because later tables look their organisationkey / studentkey up in those two.
# * Every merge matches on the "unique_key" column.
#   NOTE(review): that column is not created in this notebook's visible cells; presumably
#   oeai.apply_column_mappings provides it - confirm.
# * Existing rows keep their organisationkey and their surrogate UUID; other columns are updated.
# * The source frame always carries a freshly generated surrogate UUID. The matched-update
#   clause never copies it, so it only takes effect for newly inserted rows.

unique_key_column = "unique_key"
dim_organisation_path = f"{silver_path}/dim_Organisation"
dim_student_path = f"{silver_path}/dim_Student"

# Lower number = written earlier; tables not listed here come last, in mapping order.
_WRITE_PRIORITY = {"dim_Organisation": 0, "dim_Student": 1}


def _fill_organisationkey(frame, dim_match_column):
    """Return `frame` with organisationkey looked up from dim_Organisation.

    frame.school_id is matched against dim_Organisation.<dim_match_column> with a left join.
    Rows without a matching organisation receive a newly generated UUID instead.
    """
    dim_org_df = spark.read.format("delta").load(dim_organisation_path).select(dim_match_column, "organisationkey")
    joined = frame.alias("source").join(
        dim_org_df.alias("dim"),
        col("source.school_id") == col(f"dim.{dim_match_column}"),
        "left"
    )
    # The dimension's key is aliased so it cannot be confused with the frame's own column
    with_dim_key = joined.select("source.*", col("dim.organisationkey").alias("dim_organisationkey"))
    return with_dim_key.withColumn(
        "organisationkey",
        when(col("dim_organisationkey").isNull(), expr("uuid()")).otherwise(col("dim_organisationkey"))
    ).drop("dim_organisationkey")


def _attach_dimension_keys(frame):
    """Return `frame` with organisationkey and studentkey taken from the dimension tables.

    organisationkey comes from dim_Organisation (matched on school_id). studentkey comes from
    dim_Student (matched on student_id and organisationkey, trimmed and case-insensitive).
    Both are left joins, so unmatched rows end up with null keys.
    """
    # First the organisationkey: discard the placeholder column and take the dimension's value
    dim_org_df = spark.read.format("delta").load(dim_organisation_path).select("school_id", "organisationkey")
    frame = frame.drop("organisationkey")
    with_orgkey = frame.alias("source").join(
        dim_org_df.alias("dim"),
        col("source.school_id") == col("dim.school_id"),
        "left"
    ).select(
        *[col(f"source.{column_name}") for column_name in frame.columns],
        col("dim.organisationkey")
    )

    # Then the studentkey: dimension columns are renamed up front to avoid ambiguous references
    dim_student_df = (
        spark.read.format("delta").load(dim_student_path)
        .select("student_id", "organisationkey", "studentkey")
        .withColumnRenamed("studentkey", "dim_studentkey")
        .withColumnRenamed("student_id", "dim_student_id")
        .withColumnRenamed("organisationkey", "dim_organisationkey")
    )
    with_student = with_orgkey.alias("source").join(
        dim_student_df.alias("dim"),
        (trim(lower(col("source.student_id"))) == trim(lower(col("dim.dim_student_id")))) &
        (trim(lower(col("source.organisationkey"))) == trim(lower(col("dim.dim_organisationkey")))),
        "left"
    )
    # Swap the placeholder studentkey for the dimension's one and discard the join helpers
    with_student = with_student.drop(col("source.studentkey"))
    with_student = with_student.drop(col("dim.dim_student_id"))
    with_student = with_student.drop(col("dim.dim_organisationkey"))
    return with_student.withColumnRenamed("dim_studentkey", "studentkey")


def _merge_preserving_keys(delta_table, frame, uuid_column_name):
    """Merge `frame` into `delta_table` on the unique key.

    Matched rows are updated in every column except organisationkey and the surrogate UUID
    column; unmatched rows are inserted as they are.
    """
    update_columns = {
        column_name: f"source.{column_name}"
        for column_name in frame.columns
        if column_name not in ['organisationkey', uuid_column_name]
    }
    delta_table.alias("target").merge(
        frame.alias("source"),
        f"target.{unique_key_column} = source.{unique_key_column}"
    ).whenMatchedUpdate(set=update_columns).whenNotMatchedInsertAll().execute()


# Only endpoints that were actually loaded and have a non-empty table name are written.
# sorted() is stable, so tables with equal priority keep their order from the mapping.
tables_to_write = sorted(
    (
        (json_name, table_name)
        for json_name, table_name in delta_table_name_mapping.items()
        if table_name != "" and json_name in json_dfs
    ),
    key=lambda pair: _WRITE_PRIORITY.get(pair[1], len(_WRITE_PRIORITY)),
)

for json_name, delta_table_name in tables_to_write:
    df = json_dfs[json_name]
    silver_table_path = f"{silver_path}/{delta_table_name}"
    uuid_column_name = oeai.get_uuid_column_name(delta_table_name)

    if delta_table_name == "dim_Organisation":
        if DeltaTable.isDeltaTable(spark, silver_table_path):
            delta_table = DeltaTable.forPath(spark, silver_table_path)

            # Existing keys, renamed so they cannot clash with the source's placeholder column
            target_df = delta_table.toDF().select(unique_key_column, col("organisationkey").alias("target_organisationkey"))
            source_df = df.alias("source")

            # Left join: organisations already in the table reuse their key, new ones get a UUID
            df_with_keys = source_df.join(
                target_df,
                source_df[unique_key_column] == target_df[unique_key_column],
                how="left"
            ).select(
                *[source_df[column_name].alias(column_name) for column_name in source_df.columns if column_name != "organisationkey"],
                coalesce(col("target_organisationkey"), expr("uuid()")).alias("organisationkey")
            )

            delta_table.alias("target").merge(
                df_with_keys.alias("source"),
                f"target.{unique_key_column} = source.{unique_key_column}"
            ).whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()
        else:
            # New table: every organisation needs a freshly generated key
            df = df.withColumn("organisationkey", expr("uuid()"))
            df.write.format("delta").mode("overwrite").save(silver_table_path)

    elif delta_table_name == "dim_Student":
        # Resolve keys before choosing merge vs. create, so students inserted by a merge get a
        # studentkey and an organisationkey just like students written to a brand-new table.
        df = df.withColumn(uuid_column_name, expr("uuid()"))
        df = _fill_organisationkey(df, "school_id")

        if DeltaTable.isDeltaTable(spark, silver_table_path):
            delta_table = DeltaTable.forPath(spark, silver_table_path)
            try:
                _merge_preserving_keys(delta_table, df, uuid_column_name)
            except Exception as e:
                # Best-effort: report the failure and carry on with the remaining tables
                print(delta_table_name)
                df.printSchema()
                print(e)
        else:
            df.write.format("delta").mode("overwrite").save(silver_table_path)

    else:
        # Any table that carries a studentkey gets both keys from the dimension tables
        if 'studentkey' in df.columns:
            df = _attach_dimension_keys(df)

        # Fresh surrogate key for rows that turn out to be new (see the header comment)
        df = df.withColumn(uuid_column_name, expr("uuid()"))

        # Tables without a studentkey have not had their organisationkey resolved yet
        if 'studentkey' not in df.columns:
            df = _fill_organisationkey(df, "external_id")

        if DeltaTable.isDeltaTable(spark, silver_table_path):
            delta_table = DeltaTable.forPath(spark, silver_table_path)
            try:
                _merge_preserving_keys(delta_table, df, uuid_column_name)
            except Exception as e:
                # Best-effort: report the failure and carry on with the remaining tables
                print("Error ",delta_table_name)
                df.printSchema()
                print(e)
        else:
            # If the table does not exist, create it by writing the current DataFrame
            df.write.format("delta").mode("overwrite").save(silver_table_path)