In [None]:
%run oeai_py

In [None]:
# Create the OEAI helper instance used by the cells below (the OEAI class is presumably
# brought into scope by the `%run oeai_py` cell above).
# NOTE(review): no platform argument is passed here, although this comment previously said
# the platform ("Synapse" or "Fabric") is set at this point — presumably OEAI() detects or
# defaults it; confirm in oeai_py.
oeai = OEAI()

In [None]:
# CHANGE VALUES FOR YOUR KEY VAULT
keyvault = "INSERT_YOUR_KEYVAULT_NAME_HERE"
keyvault_linked_service = "INSERT_YOUR_KEYVAULT_LINKED_SERVICE_NAME_HERE"  # not required for Fabric

# Synapse OEA environment paths, each read from a Key Vault secret via oeai.get_secret
bronze_path = oeai.get_secret(spark, "wonde-bronze", keyvault_linked_service, keyvault)
silver_path = oeai.get_secret(spark, "wonde-silver", keyvault_linked_service, keyvault)
gold_path = oeai.get_secret(spark, "gold-path", keyvault_linked_service, keyvault)

# The "school-ids" secret holds a comma-separated list; each item becomes one subdirectory name.
school_ids_secret = oeai.get_secret(spark, "school-ids", keyvault_linked_service, keyvault)
# Strip surrounding whitespace and drop empty items so that a secret such as "a, b," yields
# ["a", "b"] rather than ["a", " b", ""] (which would produce bogus subdirectory names).
subdirectories = [
    school_id.strip()
    for school_id in school_ids_secret.split(",")
    if school_id.strip()
]

In [None]:
# Mapping from source JSON file name to the Delta table name desired for it.
# Pairs are listed in the order they should appear in the resulting dict.
# An empty string means no table name is assigned to that file here.
# NOTE(review): presumably the consumer of this mapping treats "" specially — confirm.
# Commented-out pairs are files that are currently left out of the mapping.
delta_table_name_mapping = dict(
    [
        # ("schools.json", "dim_Organisation"),
        ("students.json", "dim_Student"),
        ("students_extended.json", "dim_StudentExtended"),
        ("students_education.json", ""),
        ("students_contact_details.json", "dim_Address"),
        ("students_leaver.json", ""),
        ("students_leaver_extended.json", ""),
        ("students_leaver_education.json", ""),
        ("attendance_summaries.json", "fact_AttendanceSummary"),
        # ("attendance_session.json", "fact_AttendanceSession"),
        ("attendance_codes.json", ""),
        ("behaviours_students.json", "fact_Behaviour"),
        ("exclusions.json", "fact_Exclusion"),
        ("achievements_students.json", "fact_Achievement"),
        # ("subjects.json", "dim_Subject"),
        # ("classes.json", ""),
        ("groups.json", "dim_Group"),
        ("group_membership.json", "dim_GroupMembership"),
        ("aspects.json", "dim_Assessment"),
        ("resultsets.json", "dim_ResultSet"),
        ("results.json", "fact_Result"),
    ]
)

In [None]:
column_mappings = {
    "schools.json": {
        # drops
        "timezone": "drop", 
        "mis": "drop",
        "address_address_line_1": "drop",
        "address_address_line_2": "drop",
        "address_address_town": "drop",
        "address_address_postcode": "drop",
        "address_address_country_code": "drop",
        "address_address_country_name": "drop",
        "extended_allows_writeback": "drop",
        "extended_has_timetables": "drop",
        "extended_has_lesson_attendance": "drop",
        "extended_audit_approved_at_date": "drop",
        "extended_audit_approved_at_timezone_type": "drop",
        "extended_audit_approved_at_timezone": "drop",
        "region_code": "drop",
        "region_domain": "drop",
        "region_school_url": "drop",
        "region_identifiers_la_code": "drop",
        "region_identifiers_establishment_number": "drop",
        "region_identifiers_urn": "drop",
        "school_id": "drop",
        # Renames
        "id": {"new_name": "external_id"}, 
        "name": {"new_name": "Organisation_Name"},  
        "establishment_number": {"new_name": "Establishment_Number"},  
        "urn": {"new_name": "URN"},
        "la_code": {"new_name": "LA_Code"},
        "phase_of_education": {"new_name": "Organisation_Type"},
        # adds
        "add_columns": {
            "organisationkey": "",  
            "addresskey": "",  
            "UKPRN": "",
            "Organisation_Status": "Active",
            "last_updated": "",
        }
    },
    "students.json": {
        # drops
        "created_at": "drop",
        "created_at_date": "drop",
        "created_at_timezone": "drop",
        "created_at_timezone_type": "drop",
        "date_of_birth_timezone": "drop",
        "date_of_birth_timezone_type": "drop",
        "initials": "drop",
        "restored_at_date": "drop",
        "restored_at_timezone": "drop",
        "restored_at_timezone_type": "drop",
        "updated_at_date": "drop",
        "updated_at_timezone": "drop",
        "updated_at_timezone_type": "drop",
        "upi": "drop",
        # Renames
        "date_of_birth_date": {"new_name": "Date_Of_Birth"}, 
        "forename": {"new_name": "Forename"}, 
        "gender": {"new_name": "Gender"}, 
        "id": {"new_name": "student_id"}, 
        "legal_forename": {"new_name": "Legal_Forename"}, 
        "legal_surname": {"new_name": "Legal_Surname"}, 
        "middle_names": {"new_name": "Middle_Names"}, 
        "surname": {"new_name": "Surname"}, 
        # adds
        "add_columns": {
            "organisationkey": "",
            "studentkey": "",
        }
    },
    "students_extended.json": {
        # drops
        "created_at": "drop",
        "created_at_date": "drop",
        "date_of_birth_timezone": "drop",
        "date_of_birth_timezone_type": "drop",
        "date_of_birth_date": "drop",
        "forename": "drop",
        "gender": "drop",
        "initials": "drop",
        "legal_forename": "drop",
        "legal_surname": "drop",
        "middle_names": "drop",
        "restored_at_date": "drop",
        "restored_at_timezone": "drop",
        "restored_at_timezone_type": "drop",
        "surname": "drop",
        "updated_at": "drop",
        "upi": "drop",
        "mis_id": "drop",
        "extended_details_data_fsm_review_date": "drop",
        "extended_details_data_premium_pupil_notes": "drop",
        "extended_details_data_boarding_status": "drop",
        "extended_details_data_application_status": "drop",
        "extended_details_data_birth_place": "drop",
        "extended_details_data_custody_details": "drop",
        "extended_details_data_dietary_needs": "drop",
        "extended_details_data_general_notes": "drop",
        "extended_details_data_marital_status": "drop",
        "extended_details_data_in_care_details": "drop",
        "extended_details_data_national_insurance_number": "drop",
        "extended_details_data_paramedical_support": "drop",
        "extended_details_data_permanent_resident": "drop",
        "extended_details_data_religion": "drop",
        "extended_details_data_responsible_care_authority": "drop",
        "extended_details_data_youth_support_services_agreement": "drop",
        "extended_details_data_custody_alert": "drop",
        "extended_details_data_next_of_kin": "drop",
        # Renames
        "extended_details_data_english_as_additional_language": {"new_name": "English_As_Additional_Language"}, 
        "extended_details_data_english_as_additional_language_status": {"new_name": "English_As_Additional_Language_Status"}, 
        "extended_details_data_enrolment_status": {"new_name": "Enrolment_Status"}, 
        "extended_details_data_ethnicity": {"new_name": "Ethnicity"}, 
        "extended_details_data_ethnicity_code": {"new_name": "Ethnicity_Code"}, 
        "extended_details_data_ever_in_care": {"new_name": "Ever_In_Care"}, 
        "extended_details_data_first_language": {"new_name": "First_Language"}, 
        "extended_details_data_free_school_meals": {"new_name": "Free_School_Meals"}, 
        "extended_details_data_free_school_meals_6": {"new_name": "Free_School_Meals_6"}, 
        "extended_details_data_gifted_and_talented_status": {"new_name": "Gifted_And_Talented_Status"}, 
        "extended_details_data_in_lea_care": {"new_name": "In_LEA_Care"}, 
        "extended_details_data_premium_pupil_indicator": {"new_name": "Pupil_Premium_Indicator"}, 
        "extended_details_data_sen_status": {"new_name": "SEN_Status"}, 
        "extended_details_data_service_children_indicator": {"new_name": "Service_Child_Indicator"}, 
        "extended_details_data_child_in_need": {"new_name": "Child_In_Need"}, 
        "extended_details_data_child_protection_plan": {"new_name": "Child_Protection_Plan"}, 
        "extended_details_data_nationality": {"new_name": "Nationality"}, 
        "extended_details_data_premium_pupil_eligible": {"new_name": "Pupil_Premium_Eligible"}, 
        "id": {"new_name": "student_id"},    
        "extended_details_data_leaver_destination": {"new_name": "Leaver_Destination"}, 
        # adds
        "add_columns": {
            "organisationkey": "",
            "studentextendedkey": "",
            "studentkey": "",
        }
    },
    "students_education.json": {
        # drops
        "created_at": "drop",
        "date_of_birth_date": "drop",
        "date_of_birth_timezone": "drop",
        "date_of_birth_timezone_type": "drop",
        "education_details_data_admission_date_timezone": "drop",
        "education_details_data_admission_date_timezone_type": "drop",
        "education_details_data_learner_number": "drop",
        "education_details_data_part_time": "drop",
        "education_details_data_leaving_date_timezone": "drop",
        "education_details_data_leaving_date_timezone_type": "drop",
        "education_details_data_former_upn": "drop",
        "education_details_data_local_upn": "drop",
        "education_details_data_part_time_rate": "drop",
        "forename": "drop",
        "gender": "drop",
        "initials": "drop",
        "legal_forename": "drop",
        "legal_surname": "drop",
        "middle_names": "drop",
        "mis_id": "drop",
        "restored_at_date": "drop",
        "restored_at_timezone": "drop",
        "restored_at_timezone_type": "drop",
        "surname": "drop",
        "updated_at": "drop",
        "upi": "drop",
        # Renames
        "education_details_data_admission_number": {"new_name": "Admission_Number"}, 
        "education_details_data_admission_date_date": {"new_name": "Admission_Date"}, 
        "education_details_data_current_nc_year": {"new_name": "Current_Year"}, 
        "education_details_data_upn": {"new_name": "UPN"}, 
        "education_details_data_leaving_date_date": {"new_name": "Leaving_Date"}, 
        "id": {"new_name": "student_id"}, 
        # adds
        "add_columns": {
            "organisationkey": "",
            "studenteducationkey": "",
            "studentkey": "",
        }
    },    
    "students_leaver.json": {
        # drops
        "created_at": "drop",
        "created_at_date": "drop",
        "created_at_timezone": "drop",
        "created_at_timezone_type": "drop",
        "date_of_birth_timezone": "drop",
        "date_of_birth_timezone_type": "drop",
        "initials": "drop",
        "restored_at_date": "drop",
        "restored_at_timezone": "drop",
        "restored_at_timezone_type": "drop",
        "updated_at_date": "drop",
        "updated_at_timezone": "drop",
        "updated_at_timezone_type": "drop",
        "upi": "drop",
        # Renames
        "date_of_birth_date": {"new_name": "Date_Of_Birth"}, 
        "forename": {"new_name": "Forename"}, 
        "gender": {"new_name": "Gender"}, 
        "id": {"new_name": "student_id"}, 
        "legal_forename": {"new_name": "Legal_Forename"}, 
        "legal_surname": {"new_name": "Legal_Surname"}, 
        "middle_names": {"new_name": "Middle_Names"}, 
        "surname": {"new_name": "Surname"}, 
        # adds
        "add_columns": {
            "organisationkey": "",
            "studentkey": "",
        }
    },
    "students_leaver_extended.json": {
        # drops
        "created_at": "drop",
        "created_at_date": "drop",
        "date_of_birth_timezone": "drop",
        "date_of_birth_timezone_type": "drop",
        "date_of_birth_date": "drop",
        "forename": "drop",
        "gender": "drop",
        "initials": "drop",
        "legal_forename": "drop",
        "legal_surname": "drop",
        "middle_names": "drop",
        "restored_at_date": "drop",
        "restored_at_timezone": "drop",
        "restored_at_timezone_type": "drop",
        "surname": "drop",
        "updated_at": "drop",
        "upi": "drop",
        "mis_id": "drop",
        "extended_details_data_fsm_review_date": "drop",
        "extended_details_data_premium_pupil_notes": "drop",
        "extended_details_data_boarding_status": "drop",
        "extended_details_data_application_status": "drop",
        "extended_details_data_birth_place": "drop",
        "extended_details_data_custody_details": "drop",
        "extended_details_data_dietary_needs": "drop",
        "extended_details_data_general_notes": "drop",
        "extended_details_data_marital_status": "drop",
        "extended_details_data_in_care_details": "drop",
        "extended_details_data_national_insurance_number": "drop",
        "extended_details_data_paramedical_support": "drop",
        "extended_details_data_permanent_resident": "drop",
        "extended_details_data_religion": "drop",
        "extended_details_data_responsible_care_authority": "drop",
        "extended_details_data_youth_support_services_agreement": "drop",
        "extended_details_data_custody_alert": "drop",
        "extended_details_data_next_of_kin": "drop",
        # Renames
        "extended_details_data_english_as_additional_language": {"new_name": "English_As_Additional_Language"}, 
        "extended_details_data_english_as_additional_language_status": {"new_name": "English_As_Additional_Language_Status"}, 
        "extended_details_data_enrolment_status": {"new_name": "Enrolment_Status"}, 
        "extended_details_data_ethnicity": {"new_name": "Ethnicity"}, 
        "extended_details_data_ethnicity_code": {"new_name": "Ethnicity_Code"}, 
        "extended_details_data_ever_in_care": {"new_name": "Ever_In_Care"}, 
        "extended_details_data_first_language": {"new_name": "First_Language"}, 
        "extended_details_data_free_school_meals": {"new_name": "Free_School_Meals"}, 
        "extended_details_data_free_school_meals_6": {"new_name": "Free_School_Meals_6"}, 
        "extended_details_data_gifted_and_talented_status": {"new_name": "Gifted_And_Talented_Status"}, 
        "extended_details_data_in_lea_care": {"new_name": "In_LEA_Care"}, 
        "extended_details_data_premium_pupil_indicator": {"new_name": "Pupil_Premium_Indicator"}, 
        "extended_details_data_sen_status": {"new_name": "SEN_Status"}, 
        "extended_details_data_service_children_indicator": {"new_name": "Service_Child_Indicator"}, 
        "extended_details_data_child_in_need": {"new_name": "Child_In_Need"}, 
        "extended_details_data_child_protection_plan": {"new_name": "Child_Protection_Plan"}, 
        "extended_details_data_nationality": {"new_name": "Nationality"}, 
        "extended_details_data_premium_pupil_eligible": {"new_name": "Pupil_Premium_Eligible"}, 
        "id": {"new_name": "student_id"},    
        "extended_details_data_leaver_destination": {"new_name": "Leaver_Destination"}, 
        # adds
        "add_columns": {
            "organisationkey": "",
            "studentextendedkey": "",
            "studentkey": "",
        }
    },
    "students_leaver_education.json": {
        # drops
        "created_at": "drop",
        "date_of_birth_date": "drop",
        "date_of_birth_timezone": "drop",
        "date_of_birth_timezone_type": "drop",
        "education_details_data_admission_date_timezone": "drop",
        "education_details_data_admission_date_timezone_type": "drop",
        "education_details_data_learner_number": "drop",
        "education_details_data_part_time": "drop",
        "education_details_data_leaving_date_timezone": "drop",
        "education_details_data_leaving_date_timezone_type": "drop",
        "education_details_data_former_upn": "drop",
        "education_details_data_local_upn": "drop",
        "education_details_data_part_time_rate": "drop",
        "forename": "drop",
        "gender": "drop",
        "initials": "drop",
        "legal_forename": "drop",
        "legal_surname": "drop",
        "middle_names": "drop",
        "mis_id": "drop",
        "restored_at_date": "drop",
        "restored_at_timezone": "drop",
        "restored_at_timezone_type": "drop",
        "surname": "drop",
        "updated_at": "drop",
        "upi": "drop",
        # Renames
        "education_details_data_admission_number": {"new_name": "Admission_Number"}, 
        "education_details_data_admission_date_date": {"new_name": "Admission_Date"}, 
        "education_details_data_current_nc_year": {"new_name": "Current_Year"}, 
        "education_details_data_upn": {"new_name": "UPN"}, 
        "education_details_data_leaving_date_date": {"new_name": "Leaving_Date"}, 
        "id": {"new_name": "student_id"}, 
        # adds
        "add_columns": {
            "organisationkey": "",
            "studenteducationkey": "",
            "studentkey": "",
        }
    },    
    "students_contact_details.json": {
        # drops
        "restored_at_date": "drop",
        "restored_at_timezone": "drop",
        "restored_at_timezone_type": "drop",
        "contact_details_data_salutation": "drop",
        "contact_details_data_addresses_postal_county": "drop",
        "contact_details_data_addresses_postal_town": "drop",     
        "contact_details_data_addresses_postal_country": "drop",
        "contact_details_data_addresses_postal_district": "drop",
        "contact_details_data_addresses_postal_street": "drop",
        "contact_details_data_addresses_postal_apartment": "drop",
        "contact_details_data_addresses_postal_house_name": "drop",
        "contact_details_data_addresses_postal_house_number": "drop",
        "contact_details_data_addresses_work_postcode": "drop",
        "contact_details_data_addresses_work_country": "drop",
        "contact_details_data_addresses_work_county": "drop",
        "contact_details_data_addresses_work_town": "drop",
        "contact_details_data_addresses_work_district": "drop",
        "contact_details_data_addresses_work_street": "drop",
        "contact_details_data_addresses_work_apartment": "drop",
        "contact_details_data_addresses_work_house_name": "drop",
        "contact_details_data_addresses_work_house_number": "drop",
        "contact_details_data_emails_work": "drop",
        "contact_details_data_emails_home": "drop",
        "contact_details_data_emails_primary": "drop",
        "contact_details_data_phones_mobile": "drop",
        "contact_details_data_phones_work": "drop",
        "contact_details_data_phones_home": "drop",
        "contact_details_data_phones_primary": "drop",
        "contact_details_data_phones_phone": "drop",
        "updated_at": "drop",
        "created_at": "drop",
        "restored_at": "drop",
        "date_of_birth_timezone": "drop",
        "date_of_birth_timezone_type": "drop",
        "date_of_birth_date": "drop",
        "gender_identity": "drop",
        "gender": "drop",
        "legal_forename": "drop",
        "legal_surname": "drop",
        "middle_names": "drop",
        "forename": "drop",
        "surname": "drop",
        "title": "drop",
        "initials": "drop",
        "upi": "drop",
        # Renames
        "id": {"new_name": "student_id"}, 
        "contact_details_data_addresses_postal_postcode": {"new_name": "Postcode_Postal"}, 
        "contact_details_data_addresses_home_postcode": {"new_name": "Postcode_Home"}, 
        "contact_details_data_emails_email": {"new_name": "Email"}, 
        "contact_details_data_addresses_home_uprn": {"new_name": "UPRN"}, 
        "contact_details_data_addresses_home_country": {"new_name": "Country"},
        "contact_details_data_addresses_home_county": {"new_name": "County"},
        "contact_details_data_addresses_home_town": {"new_name": "Town"},
        "contact_details_data_addresses_home_district": {"new_name": "District"},
        "contact_details_data_addresses_home_street": {"new_name": "Street"},
        "contact_details_data_addresses_home_apartment": {"new_name": "Apartment"},
        "contact_details_data_addresses_home_house_name": {"new_name": "House_Name"},
        "contact_details_data_addresses_home_house_number": {"new_name": "House_Number"},
        # adds
        "add_columns": {
            "organisationkey": "",
            "addresskey": "",
            "studentkey": "",
            "Address_Type": "Primary",
        }
    },    
    "attendance_summaries.json": {
        # drops
        "created_at": "drop",
        "attendance_codes_#": "drop",
        "attendance_codes_/": "drop",
        "attendance_codes_B": "drop",
        "attendance_codes_C": "drop",
        "attendance_codes_D": "drop",
        "attendance_codes_E": "drop",
        "attendance_codes_F": "drop",
        "attendance_codes_G": "drop",
        "attendance_codes_H": "drop",
        "attendance_codes_I": "drop",
        "attendance_codes_J": "drop",
        "attendance_codes_L": "drop",
        "attendance_codes_M": "drop",
        "attendance_codes_N": "drop",
        "attendance_codes_O": "drop",
        "attendance_codes_P": "drop",
        "attendance_codes_R": "drop",
        "attendance_codes_S": "drop",
        "attendance_codes_T": "drop",
        "attendance_codes_U": "drop",
        "attendance_codes_V": "drop",
        "attendance_codes_W": "drop",
        "attendance_codes_X": "drop",
        "attendance_codes_Y": "drop",
        "attendance_codes_Z": "drop",
        "attendance_codes_\\": "drop",
        # Renames
        "approved_education_activity": {"new_name": "Approved_Education_Activity"}, 
        "attendance_not_required": {"new_name": "Attendance_Not_Required"}, 
        "authorised_absences": {"new_name": "Authorised_Absences"}, 
        "id": {"new_name": "external_id"}, 
        "late_after_registration": {"new_name": "Late_After_Registration"}, 
        "late_before_registration": {"new_name": "Late_Before_Registration"}, 
        "missing_marks": {"new_name": "Missing_marks"}, 
        "possible_marks": {"new_name": "Possible_marks"}, 
        "present": {"new_name": "Present"}, 
        "unauthorized_absences": {"new_name": "Unauthorised_Absences"}, 
        "unexplained_absences": {"new_name": "Unexplained_Absences"}, 
        "updated_at": {"new_name": "last_updated"}, 
        # adds
        "add_columns": {
            "organisationkey": "",
            "attendancesummarykey": "",
            "studentkey": "",
            "Attendance_Mark_String": "",
        }
    },
    "behaviours_students.json": {
        # drops
        "created_at_timezone": "drop",
        "created_at_timezone_type": "drop",
        "incident_date_timezone": "drop",
        "incident_date_timezone_type": "drop",
        "recorded_date_timezone": "drop",
        "recorded_date_timezone_type": "drop",
        "student_data_created_at_date": "drop",
        "student_data_created_at_timezone": "drop",
        "student_data_created_at_timezone_type": "drop",
        "student_data_date_of_birth_date": "drop",
        "student_data_date_of_birth_timezone": "drop",
        "student_data_date_of_birth_timezone_type": "drop",
        "student_data_forename": "drop",
        "student_data_gender": "drop",
        "student_data_initials": "drop",
        "student_data_legal_forename": "drop",
        "student_data_legal_surname": "drop",
        "student_data_middle_names": "drop",
        "student_data_restored_at_date": "drop",
        "student_data_restored_at_timezone": "drop",
        "student_data_restored_at_timezone_type": "drop",
        "student_data_surname": "drop",
        "student_data_updated_at_timezone": "drop",
        "student_data_updated_at_timezone_type": "drop",
        "student_data_meta_action": "drop",
        "action": "drop",
        "action_date_date": "drop",
        "action_date_timezone": "drop",
        "action_date_timezone_type": "drop",
        "updated_at_timezone": "drop",
        "updated_at_timezone_type": "drop",
        "created_at_date": "drop",
        "mis_id": "drop",
        "parents_notified": "drop",
        "recorded_date_date": "drop",
        "student_data_meta_points": "drop",
        "student_data_meta_role": "drop",
        "student_data_mis_id": "drop",
        "student_data_updated_at_date": "drop",
        "student_data_upi": "drop",
        # Renames
        "student_data_id": {"new_name": "student_id"}, 
        "updated_at_date": {"new_name": "last_updated"}, 
        "class": {"new_name": "Class"}, 
        "comment": {"new_name": "Comment"}, 
        "id": {"new_name": "external_id"}, 
        "incident_date_date": {"new_name": "Incident_Date"}, 
        "location": {"new_name": "Location"}, 
        "total_points": {"new_name": "Total_Points"}, 
        "points": {"new_name": "Points"}, 
        "subject": {"new_name": "Subject"}, 
        "type": {"new_name": "Type"}, 
        "status": {"new_name": "Status"}, 
        # adds
        "add_columns": {
            "organisationkey": "",
            "behaviourkey": "",
            "studentkey": "",
        }
    },
    "exclusions.json": {
        # drops
        "created_at": "drop",
        "discipline_committee_date_date": "drop",
        "discipline_committee_date_timezone": "drop",
        "end_date_timezone": "drop",
        "end_date_timezone_type": "drop",
        "mis_id": "drop",
        "start_date_timezone": "drop",
        "start_date_timezone_type": "drop",
        "student_data_created_at": "drop",
        "student_data_date_of_birth_date": "drop",
        "student_data_date_of_birth_timezone": "drop",
        "student_data_date_of_birth_timezone_type": "drop",
        "student_data_forename": "drop",
        "student_data_gender": "drop",
        "student_data_initials": "drop",
        "student_data_legal_forename": "drop",
        "student_data_legal_surname": "drop",
        "student_data_mis_id": "drop",
        "student_data_surname": "drop",
        "student_data_updated_at": "drop",
        "student_data_upi": "drop",
        "student_data_restored_at_date": "drop",
        "student_data_restored_at_timezone": "drop",
        "student_data_restored_at_timezone_type": "drop",
        "discipline_committee_date_timezone_type": "drop",
        "discipline_committee_representation_made": "drop",
        "student_data_middle_names": "drop",
        "end_date": "drop",
        "appeal_result_date_date": "drop",
        "appeal_result_date_timezone": "drop",
        "appeal_result_date_timezone_type": "drop",
        "appeal_reinstatement_date": "drop",
        "appeal_reinstatement_date_date": "drop",
        "appeal_reinstatement_date_timezone": "drop",
        "appeal_reinstatement_date_timezone_type": "drop",
        "discipline_committee_date": "drop",
        "discipline_committee_reinstatement_date": "drop",
        "discipline_committee_reinstatement_date_date": "drop",
        "discipline_committee_reinstatement_date_timezone": "drop",
        "discipline_committee_reinstatement_date_timezone_type": "drop",
        "discipline_committee_result": "drop",
        "student_data_gender_identity": "drop",
        "student_data_restored_at": "drop",
        "student_data_title": "drop",
        #"": "drop",
        # Renames
        "academic_year": {"new_name": "Academic_Year"}, 
        "agencies_involved": {"new_name": "Agencies_Involved"}, 
        "appeal_received": {"new_name": "Appeal_Received"}, 
        "appeal_result": {"new_name": "Appeal_Result"},
        "appeal_result_date": {"new_name": "Appeal_Result_Date"},
        "comments": {"new_name": "Comments"}, 
        "end_date_date": {"new_name": "End_Date"}, 
        "end_session": {"new_name": "End_Session"}, 
        "id": {"new_name": "external_id"}, 
        "days": {"new_name": "Days"}, 
        "reason": {"new_name": "Reason"}, 
        "reason_code": {"new_name": "Reason_Code"}, 
        "sessions": {"new_name": "Sessions"}, 
        "start_date_date": {"new_name": "Start_Date"}, 
        "start_session": {"new_name": "Start_Session"}, 
        "student_data_id": {"new_name": "student_id"},  
        "term": {"new_name": "Term"}, 
        "type": {"new_name": "Type"}, 
        "type_code": {"new_name": "Type_Code"}, 
        "updated_at": {"new_name": "last_updated"}, 
        # adds
        "add_columns": {
            "organisationkey": "",
            "exclusionkey": "",
            "studentkey": "",
        }
    },
    "achievements_students.json": {
		# drops
		"achievement_date_timezone": "drop",
		"achievement_date_timezone_type": "drop",
		"created_at_date": "drop",
		"created_at_timezone": "drop",
		"created_at_timezone_type": "drop",
		"mis_id": "drop",
		"parents_notified": "drop",
		"recorded_date_date": "drop",
		"recorded_date_timezone": "drop",
		"recorded_date_timezone_type": "drop",
		"student_data_created_at_date": "drop",
		"student_data_created_at_timezone": "drop",
		"student_data_created_at_timezone_type": "drop",
		"student_data_date_of_birth_date": "drop",
		"student_data_date_of_birth_timezone": "drop",
		"student_data_date_of_birth_timezone_type": "drop",
		"student_data_forename": "drop",
		"student_data_gender": "drop",
		"student_data_initials": "drop",
		"student_data_legal_forename": "drop",
		"student_data_legal_surname": "drop",
		"student_data_meta_points": "drop",
		"student_data_middle_names": "drop",
		"student_data_mis_id": "drop",
		"student_data_restored_at_date": "drop",
		"student_data_restored_at_timezone": "drop",
		"student_data_restored_at_timezone_type": "drop",
		"student_data_surname": "drop",
		"student_data_updated_at_date": "drop",
		"student_data_updated_at_timezone": "drop",
		"student_data_updated_at_timezone_type": "drop",
		"student_data_upi": "drop",
		"updated_at_timezone": "drop",
		"updated_at_timezone_type": "drop",
		# Renames
		"achievement_date_date": {"new_name": "Achievement_Date"}, 
		"class": {"new_name": "Class"}, 
		"comment": {"new_name": "Comment"}, 
		"id": {"new_name": "external_id"}, 
		"points": {"new_name": "Points"}, 
		"student_data_id": {"new_name": "student_id"}, 
		"subject": {"new_name": "Subject"}, 
		"total_points": {"new_name": "Total_Points"}, 
		"type": {"new_name": "Type"}, 
		"updated_at_date": {"new_name": "last_updated"}, 
		# adds
		"add_columns": {
		"organisationkey": "",
		"achievementkey": "",
        "studentkey": "",
		}
	},
    "subjects.json": {
        # drops
        "created_at":"drop",
        "updated_at":"drop",
        "mis_id":"drop",
        # Renames
        "code": {"new_name": "Subject_Code"},
        "id": {"new_name": "external_id"}, 
        "name": {"new_name": "Subject_Name"}, 
        "subject": {"new_name": "Subject"}, 
        "active": {"new_name": "Active"}, 
        # adds
        "add_columns": {
            "organisationkey": "",
            "subjectkey":""
        }
    },
    "classes.json": {
        #drop
        "mis_id":"drop",
        "created_at":"drop",
        "updated_at":"drop",
        "restored_at_date":"drop",
        "restored_at_timezone":"drop",
        "restored_at_timezone_type":"drop",

        # Renames
        "name": {"new_name": "Group_Name"}, 
        "description": {"new_name": "Group_Description"}, 
        "id": {"new_name": "external_id"},
        "subject": {"new_name": "Subject_Id"},
        # adds
        "add_columns": {
            "organisationkey": "",
             "studentgroupkey": "",
             "Group_Type":"",
             "Group_Code":""
        }
    },
    "groups.json": {
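        # drops
        # NOTE(review): "groups.json" is defined again further down in this dict; in a Python
        # dict literal the later duplicate key wins, so this entry is silently overridden.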
        "mis_id":"drop",
        "created_at":"drop",
        "updated_at":"drop",
        "restored_at_date":"drop",
        "restored_at_timezone":"drop",
        "restored_at_timezone_type":"drop",

        # Renames
        "name": {"new_name": "Group_Name"}, 
        "code": {"new_name": "Group_Code"}, 
        "id": {"new_name": "external_id"},
        "description": {"new_name": "Group_Description"}, 
        "type": {"new_name": "Group_Type"},
        # adds
        "add_columns": {
            "organisationkey": "",
             "studentgroupkey": "",
             "Subject_Id":""
        }
    },
    "attendance_session.json": {
        # drops
        "date_timezone": "drop",
        "date_timezone_type": "drop",
        # Renames
        "comment": {"new_name": "Comment"}, 
        "date_date": {"new_name": "Date"}, 
        "employee": {"new_name": "staff_id"}, 
        "id": {"new_name": "external_id"}, 
        "session": {"new_name": "Session"}, 
        "student": {"new_name": "student_id"}, 
        # adds
        "add_columns": {
            "organisationkey": "",
            "attendancesessionkey": "",
            "studentkey": "",
        }
    },
    "attendance_codes.json": {
        # drops
        # Renames
        "code": {"new_name": "Mark"}, 
        "description": {"new_name": "Description"}, 
        "id": {"new_name": "attendance_code"}, 
        "type": {"new_name": "Type"}, 
        # adds
        "add_columns": {
            "organisationkey": "",
            "attendancecodekey": "",
        }
    },
    "groups.json": {
        # drops
        "created_at": "drop",
        "division": "drop",
        "meta": "drop",
        "notes": "drop",
        "restored_at": "drop",
        "restored_at_date": "drop",
        "restored_at_timezone": "drop",
        "restored_at_timezone_type": "drop",
        "updated_at": "drop",
        # Renames
        "code": {"new_name": "Group_Code"}, 
        "name": {"new_name": "Group_Name"}, 
        "type": {"new_name": "Group_Type"}, 
        "description": {"new_name": "Group_Description"}, 
        "id": {"new_name": "Group_ID"}, 
        # adds
        "add_columns": {
            "organisationkey": "",
            "groupkey": "",
        }
    },
    "group_membership.json": {
         # drops
        "created_at_date": "drop",
        "created_at_timezone": "drop",
        "created_at_timezone_type": "drop",
        "division": "drop",
        "meta": "drop",
        "notes": "drop",
        "restored_at": "drop",
        "restored_at_date": "drop",
        "restored_at_timezone": "drop",
        "restored_at_timezone_type": "drop",
        "updated_at": "drop",
        "updated_at_timezone": "drop",
        "updated_at_timezone_type": "drop",
        "student_data_forename": "drop",
        "student_data_gender": "drop",
        "student_data_gender_identity": "drop",
        "student_data_initials": "drop",
        "student_data_legal_forename": "drop",
        "student_data_legal_surname": "drop",
        "student_data_middle_names": "drop",
        "student_data_restored_at": "drop",
        "student_data_surname": "drop",
        "student_data_timezone": "drop",
        "student_data_timezone_type": "drop",
        "student_data_title": "drop",
        "student_data_upi": "drop",
        "updated_at_date": "drop",
        # Renames
        "code": {"new_name": "Group_Code"}, 
        "name": {"new_name": "Group_Name"}, 
        "type": {"new_name": "Group_Type"}, 
        "description": {"new_name": "Group_Description"}, 
        "id": {"new_name": "Group_ID"}, 
        "student_data_id": {"new_name": "student_id"}, 
        "student_data_date": {"new_name": "Student_Date"}, 
        "student_data_mis_id": {"new_name": "student_mis_id"}, 
        # adds
        "add_columns": {
            "organisationkey": "",
            "groupmembershipkey": "",
            "groupkey": "",
            "studentkey": "",
        }
    },
}

In [None]:
# Aggregate every school's bronze JSON into a single DataFrame per endpoint.
#
# For each school sub-directory and each endpoint listed in delta_table_name_mapping,
# the endpoint's JSON is read, tagged with the school's id and unioned onto whatever has
# already been collected for that endpoint.
#
#   json_dfs    - endpoint file name -> aggregated DataFrame (the output of this cell)
#   temp_dfs    - working dictionary; json_dfs is bound to this same object at the end
#   all_columns - endpoint file name -> set of every column name seen so far
json_dfs = {}
temp_dfs = {}
all_columns = {}

# Only endpoints present in the mapping are considered. The list does not depend on the
# school, so it is built once rather than on every iteration.
json_dirs = list(delta_table_name_mapping.keys())

for subdir in subdirectories:
    school_dir = f"{bronze_path}{subdir}/"

    for json_dir in json_dirs:
        json_dir_path = f"{school_dir}{json_dir}/"

        # The read has its own try block so that "Path does not exist" is only ever
        # reported for a failed read, never for a later schema-alignment or union error.
        try:
            temp_df = spark.read.json(json_dir_path)
        except AnalysisException:
            print(f"Path does not exist: {json_dir_path}, skipping...")
            continue
        except Exception as e:
            print(f"An unexpected error occurred while processing {json_dir_path}: {e}")
            continue

        try:
            temp_df = temp_df.withColumn("school_id", lit(subdir))

            # Update the set of columns seen for this endpoint
            all_columns.setdefault(json_dir, set()).update(temp_df.columns)

            if json_dir not in temp_dfs:
                # First school with data for this endpoint: nothing to union with yet
                temp_dfs[json_dir] = temp_df
                continue

            # Give both sides the same column set, then the same column types, before the
            # union. NOTE(review): the exact behaviour of oeai.add_missing_columns and
            # oeai.match_column_types is defined in oeai_py and is not visible here.
            temp_df = oeai.add_missing_columns(temp_df, all_columns[json_dir])
            existing_df = oeai.add_missing_columns(temp_dfs[json_dir], temp_df.columns)
            temp_df = oeai.match_column_types(existing_df, temp_df)
            temp_dfs[json_dir] = existing_df[sorted(existing_df.columns)].unionByName(
                temp_df[sorted(temp_df.columns)]
            )
        except Exception as e:
            # This school's rows for this endpoint are left out of the aggregate; the path
            # is included so the gap can be traced.
            print(f"An unexpected error occurred while processing {json_dir_path}: {e}")

# Assign the final json_dfs outside the loops
json_dfs = temp_dfs

In [None]:
def apply_column_mappings(df, mappings):
    """
    Apply the drop / rename / add instructions for one endpoint to a DataFrame.

    Args:
        df (DataFrame): The DataFrame to be modified.
        mappings (dict): Mapping instructions. A key whose value is the string "drop" is a
            column to drop; a key whose value is a dict containing 'new_name' is a column
            to rename; the special key "add_columns" holds a dict of
            {new column name: default value}.

    Returns:
        DataFrame: The modified DataFrame after applying the mappings.
    """
    # Drop columns
    drop_cols = [name for name, action in mappings.items() if action == "drop"]
    df = df.drop(*drop_cols)

    # Rename columns, or create the renamed column when the source column is absent
    rename_mappings = {
        name: details['new_name']
        for name, details in mappings.items()
        if isinstance(details, dict) and 'new_name' in details
    }
    # Snapshot taken before any rename: whether a source column "exists" is decided
    # against the columns the frame had after the drops.
    existing_columns = df.columns
    for old_col, new_col in rename_mappings.items():
        if old_col in existing_columns:
            df = df.withColumnRenamed(old_col, new_col)
        elif new_col not in df.columns:
            # Source column missing: create the target as an all-null placeholder.
            # The guard above matters: withColumn replaces an existing column of the same
            # name, so without it a target column that already holds data would be
            # overwritten with nulls.
            # NOTE(review): lit(None) is left uncast here, as in the original code.
            df = df.withColumn(new_col, lit(None))

    # Add new columns with default values (an existing column of the same name is replaced)
    for new_col, default_value in mappings.get("add_columns", {}).items():
        df = df.withColumn(new_col, lit(default_value))

    return df

In [None]:
# Assessment, Results and Resultset column processing

def _refine_endpoint_columns(json_name, column_spec, blank_columns=()):
    """
    Narrow json_dfs[json_name] to a fixed list of columns, renaming where requested.

    Args:
        json_name (str): Key of the endpoint in json_dfs.
        column_spec (list): (source column, output name) pairs in output order. An output
            name of None keeps the source column's own name.
        blank_columns (iterable): Names of columns to append after the selection, each
            filled with the empty string.

    Returns:
        None. json_dfs[json_name] is replaced only when the whole selection succeeds;
        on any failure a message is printed and the entry is left untouched.
    """
    source_df = json_dfs.get(json_name)
    if source_df is None:
        # The endpoint was never loaded (no school had data for it). Looking it up with
        # json_dfs[...] would raise KeyError and abort the remaining refinements.
        print(f"{json_name} not found in json_dfs, skipping")
        return

    try:
        refined_df = source_df.select(*[
            source_df[source] if output is None else source_df[source].alias(output)
            for source, output in column_spec
        ])
        for blank_name in blank_columns:
            refined_df = refined_df.withColumn(blank_name, lit(""))
        json_dfs[json_name] = refined_df
    except AnalysisException as e:
        # Handle the case where a column does not exist or other AnalysisExceptions
        print(f"AnalysisException: {e}")
        print("Table is empty, skipping")
    except Exception as e:
        # Handle other exceptions
        print(f"Exception: {e}")
        print("Table is empty, skipping")


# Refine columns in aspects json file ('id' becomes 'aspect_id')
_refine_endpoint_columns('aspects.json', [
    ("created_at_date", None),
    ("description", None),
    ("id", "aspect_id"),
    ("max_value", None),
    ("min_value", None),
    ("mis_id", None),
    ("name", None),
    ("school_id", None),
    ("type", None),
    ("unique_key", None),
    ("updated_at_date", None),
])

# Refine columns in results json file and add the blank key columns
_refine_endpoint_columns('results.json', [
    ("aspect", "aspect_id"),
    ("collection_date_date", None),
    ("created_at_date", None),
    ("grade_value", None),
    ("id", "result_id"),
    ("mis_id", None),
    ("result", None),
    ("result_date_date", "result_date"),
    ("resultset", "resultset_id"),
    ("school_id", None),
    ("student", "student_id"),
    ("unique_key", None),
    ("updated_at_date", None),
], blank_columns=("organisationkey", "studentkey"))

# Refine columns in resultsets json file ('id' becomes 'resultset_id')
_refine_endpoint_columns('resultsets.json', [
    ("created_at_date", None),
    ("end_date", None),
    ("end_date_date", None),
    ("external_id", None),
    ("id", "resultset_id"),
    ("locked", None),
    ("mis_id", None),
    ("module", None),
    ("name", None),
    ("school_id", None),
    ("source", None),
    ("start_date", None),
    ("start_date_date", None),
    ("supplier", None),
    ("unique_key", None),
    ("updated_at_date", None),
])

In [None]:
# Run every aggregated endpoint that has an entry in column_mappings through
# apply_column_mappings and store the result back under the same key.
# The keys are copied first so the dictionary is not being iterated while it is updated.
for json_name in list(json_dfs.keys()):
    if json_name not in column_mappings:
        continue
    json_dfs[json_name] = apply_column_mappings(json_dfs[json_name], column_mappings[json_name])

In [None]:
# List of jobs next to get the dimensions in the correct schema.

'''
***********************************************
  STUDENTS
**********************************************
'''
try:
    if json_dfs['students.json'].count() == 0:
        print("DataFrame is empty, skipping the operation.")
    else:
        # Step 1: add UPN and Current_Year from the education endpoint to students.json
        try:
            df_student = json_dfs['students.json']
            df_student_education = json_dfs['students_education.json']

            education_identifiers = df_student_education.select('unique_key', 'UPN', 'Current_Year')
            df_joined = df_student.join(education_identifiers, on='unique_key', how='inner')
            json_dfs['students.json'] = df_joined

        except Exception as e:
            print(f"An error occurred: {e}")

        # Step 2: add admission / leaving details, then contact details, to students_extended.json
        try:
            df_student_extended = json_dfs['students_extended.json']
            df_student_education = json_dfs['students_education.json']
            df_student_contact_details = json_dfs['students_contact_details.json']

            education_dates = df_student_education.select(
                'unique_key', 'Admission_Date', 'Leaving_Date', 'Admission_Number'
            )
            df_joined = df_student_extended.join(education_dates, on='unique_key', how='inner')

            if 'Leaving_Date' not in df_joined.columns:
                df_joined = df_joined.withColumn('Leaving_Date', lit(None))

            # Stored before the contact-details join so that a failure there still leaves
            # the education-enriched frame in json_dfs.
            json_dfs['students_extended.json'] = df_joined

            try:
                contact_fields = df_student_contact_details.select(
                    'unique_key', 'Postcode_Postal', 'Postcode_Home', 'Email'
                )
                df_joined = df_joined.join(contact_fields, on='unique_key', how='left')

                # Make sure both postcode columns exist, as null columns if necessary
                for postcode_column in ('Postcode', 'Postcode_Home'):
                    if postcode_column not in df_joined.columns:
                        df_joined = df_joined.withColumn(postcode_column, lit(None))

                json_dfs['students_extended.json'] = df_joined

            except Exception as e:
                print(f"An error occurred: {e}")

        except Exception as e:
            print(f"An error occurred: {e}")

    # Year_Group lookup. This runs whether or not the students frame was empty.
    try:
        df_student = json_dfs['students.json']
        df_groupmembership = json_dfs['group_membership.json']

        # A membership row matches a student on school and student id, and only
        # memberships whose Group_Type is "YEAR" are considered.
        year_membership_match = (
            (col("student.school_id") == col("groupmembership.school_id"))
            & (col("student.student_id") == col("groupmembership.student_id"))
            & (col("groupmembership.Group_Type") == "YEAR")
        )

        # Left join keeps every student; Group_Name is exposed as Year_Group
        df_joined = (
            df_student.alias("student")
            .join(df_groupmembership.alias("groupmembership"), year_membership_match, "left")
            .select("student.*", col("groupmembership.Group_Name").alias("Year_Group"))
        )

        json_dfs['students.json'] = df_joined

    except Exception as e:
        print(f"An error occurred: {e}")

except Exception as e:
    print(f"An error occurred: {e}")

'''
***********************************************
  STUDENTS_LEAVER
**********************************************
'''
try:
    if json_dfs['students_leaver.json'].count() == 0:
        print("DataFrame is empty, skipping the operation.")
    else:
        # Step 1: add UPN and Current_Year from the leaver education endpoint
        try:
            df_student = json_dfs['students_leaver.json']
            df_student_education = json_dfs['students_leaver_education.json']

            leaver_identifiers = df_student_education.select('unique_key', 'UPN', 'Current_Year')
            df_joined = df_student.join(leaver_identifiers, on='unique_key', how='inner')
            json_dfs['students_leaver.json'] = df_joined

        except Exception as e:
            print(f"An error occurred: {e}")

        # Step 2: add admission / leaving details to the extended leaver records
        try:
            df_student_leaver_extended = json_dfs['students_leaver_extended.json']
            df_student_leaver_education = json_dfs['students_leaver_education.json']

            leaver_dates = df_student_leaver_education.select(
                'unique_key', 'Admission_Date', 'Leaving_Date', 'Admission_Number'
            )
            df_joined = df_student_leaver_extended.join(leaver_dates, on='unique_key', how='inner')

            if 'Leaving_Date' not in df_joined.columns:
                df_joined = df_joined.withColumn('Leaving_Date', lit(None))

            json_dfs['students_leaver_extended.json'] = df_joined

        except Exception as e:
            print(f"An error occurred: {e}")

except Exception as e:
    print(f"An error occurred: {e}")




'''
***********************************************
  ATTENDANCE SUMMARY
***********************************************
'''
try:
    from pyspark.sql import functions as F
    if json_dfs['attendance_summaries.json'].count() == 0:
        print("DataFrame is empty, skipping the operation.")
    else:
        try:
            df_attendance_summary = json_dfs['attendance_summaries.json']

            # Fallback for Possible_marks: the sum of the five mark counts, each treated
            # as 0 when null.
            calculated_sum = coalesce(F.col('Authorised_Absences'), F.lit(0))
            for mark_column in (
                'Unauthorised_Absences',
                'Attendance_Not_Required',
                'Present',
                'Approved_Education_Activity',
            ):
                calculated_sum = calculated_sum + coalesce(F.col(mark_column), F.lit(0))

            # Use the fallback wherever Possible_marks is null or holds the text 'null' / 'None'
            possible_marks = col("Possible_marks")
            needs_fallback = (
                (possible_marks.isNull())
                | (possible_marks == "null")
                | (possible_marks == "None")
            )
            df_attendance_summary = df_attendance_summary.withColumn(
                "Possible_marks",
                when(needs_fallback, calculated_sum).otherwise(possible_marks)
            )

            # Second pass: fill remaining nulls with 0 and replace the text 'null' with 0
            df_attendance_summary = df_attendance_summary.fillna({'Possible_marks': 0})
            df_attendance_summary = df_attendance_summary.withColumn(
                "Possible_marks",
                when(df_attendance_summary["Possible_marks"] == "null", 0)
                .otherwise(df_attendance_summary["Possible_marks"])
            )

            # Convert the count columns used below to integer
            for int_column in ("Present", "Approved_Education_Activity", "Possible_marks"):
                df_attendance_summary = df_attendance_summary.withColumn(
                    int_column, col(int_column).cast("int")
                )

            # Ratios of each mark count to Possible_marks, stored as DecimalType(10, 4)
            # (four decimal places; the values are fractions, not 0-100 percentages).
            ratio_columns = [
                ("Percentage_Attendance", col("Present") + col("Approved_Education_Activity")),
                ("Percentage_Authorised_Absence", col("Authorised_Absences")),
                ("Percentage_Unauthorised_Absence", col("Unauthorised_Absences")),
                ("Percentage_Unexplained_Absence", col("Unexplained_Absences")),
            ]
            for ratio_name, numerator in ratio_columns:
                df_attendance_summary = df_attendance_summary.withColumn(
                    ratio_name,
                    (numerator / col("Possible_marks")).cast(DecimalType(10, 4))
                )

            attendance = col("Percentage_Attendance")

            # 0/1 absence flags: below 90% and below 50% attendance respectively
            for flag_name, threshold in (("Is_Persistently_Absent", 0.9), ("Is_Severely_Absent", 0.5)):
                df_attendance_summary = df_attendance_summary.withColumn(
                    flag_name,
                    when(attendance < threshold, 1).otherwise(0)
                )

            # Attendance bands: (flag column, bin label, inclusive lower bound, exclusive upper bound).
            # None means the band is open on that side.
            attendance_bands = [
                ("under_50", "under 50%", None, 0.5),
                ("50_to_70", "50% to 70%", 0.5, 0.7),
                ("70_to_80", "70% to 80%", 0.7, 0.8),
                ("80_to_90", "80% to 90%", 0.8, 0.9),
                ("90_to_92", "90% to 92%", 0.9, 0.92),
                ("92_to_95", "92% to 95%", 0.92, 0.95),
                ("95_to_98", "95% to 98%", 0.95, 0.98),
                ("above_98", "above 98%", 0.98, None),
            ]

            band_conditions = []
            for band_flag, band_label, lower, upper in attendance_bands:
                if lower is None:
                    in_band = attendance < upper
                elif upper is None:
                    in_band = attendance >= lower
                else:
                    in_band = (attendance >= lower) & (attendance < upper)
                band_conditions.append((band_flag, band_label, in_band))

            # One 0/1 column per band
            for band_flag, _, in_band in band_conditions:
                df_attendance_summary = df_attendance_summary.withColumn(
                    band_flag,
                    when(in_band, 1).otherwise(0)
                )

            # Create a single 'Attendance_Bin' column holding the label of the matching band;
            # rows matching no band (e.g. a null ratio) get "Unspecified".
            attendance_bin = None
            for _, band_label, in_band in band_conditions:
                if attendance_bin is None:
                    attendance_bin = when(in_band, band_label)
                else:
                    attendance_bin = attendance_bin.when(in_band, band_label)
            df_attendance_summary = df_attendance_summary.withColumn(
                "Attendance_Bin",
                attendance_bin.otherwise("Unspecified")
            )

            json_dfs['attendance_summaries.json'] = df_attendance_summary
        except Exception as e:
            print(f"An error occurred: {e}")
except Exception as e:
    print(f"An error occurred: {e}")


'''
**************************************************
    dim_Address
**************************************************
'''

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
try:

    def filter_address(*args):
        """
        Join the usable address parts into one comma-separated string.

        Parts that are falsy (None, empty string) or equal to the text "None" are left
        out. Every remaining part is converted with str() before joining, so a part that
        arrives as a number does not make ', '.join raise TypeError.
        """
        return ', '.join(str(part) for part in args if part and part != "None")

    # Register UDF
    filter_address_udf = udf(filter_address, StringType())

    if json_dfs['students_contact_details.json'].count() > 0:
        try:
            df = json_dfs['students_contact_details.json']

            # Apply UDF to create the full address column.
            # The UDF only runs when the frame is evaluated, so an error raised inside it
            # would surface at a later action rather than in this try block.
            df = df.withColumn("Address_Block", filter_address_udf(
                col("House_Name"),
                col("House_Number"),
                col("Apartment"),
                col("Street"),
                col("District"),
                col("Town"),
                col("County"),
                col("Country"),
                col("Postcode_Home")
            ))

            # Make sure a Postcode column exists, as a null column if necessary
            if 'Postcode' not in df.columns:
                df = df.withColumn('Postcode', lit(None))

            json_dfs['students_contact_details.json'] = df
        except Exception as e:
            print(f"An error occurred: {e}")
    else:
        print("DataFrame is empty, skipping the operation.")
except Exception as e:
    print(f"An error occurred: {e}")


'''
**************************************************
    attendance_session
**************************************************
'''
from pyspark.sql.functions import to_date
try:
    if json_dfs['attendance_session.json'].count() == 0:
        print("DataFrame is empty, skipping the operation.")
    else:
        try:
            # df_joined is a name shared with earlier sections of this cell; it is reset
            # here before being rebuilt.
            df_joined = []
            df_session = json_dfs['attendance_session.json']
            df_codes = json_dfs['attendance_codes.json']

            # Bring in the Mark for each session's attendance code. The join columns must
            # exist in both frames; the inner join drops sessions without a matching code.
            code_marks = df_codes.select('attendance_code', 'school_id', 'Mark')
            df_joined = df_session.join(
                code_marks,
                on=['attendance_code', 'school_id'],
                how='inner'
            )

            # Columns removed from the result (school_id, student_id and unique_key are kept)
            df_joined = df_joined.drop(
                'attendance_code',
                'Comment',
                'staff_id',
                'external_id',
                'organisationkey',
            )

            # Parse the timestamp text in Date down to a date
            df_joined = df_joined.withColumn("Date", to_date(col("Date"), "yyyy-MM-dd HH:mm:ss.SSSSSS"))

            json_dfs['attendance_session.json'] = df_joined
        except Exception as e:
            print(f"An error occurred: {e}")
except Exception as e:
    print(f"An error occurred: {e}")

In [None]:
'''
***  student union with leavers ***
'''
from pyspark.sql.functions import col

# Union each current-student frame with its leaver counterpart:
#   students.json          + students_leaver.json
#   students_extended.json + students_leaver_extended.json
#
# Columns present on only one side are added to the other as typed nulls, both sides are
# put in the same (sorted) column order, the frames are unioned by name and the result is
# de-duplicated on unique_key.
# NOTE(review): dropDuplicates gives no control over which of several rows sharing a
# unique_key is kept - confirm that is acceptable for students present in both frames.
#
# The loop variable is deliberately not called `col`: doing so would rebind the imported
# pyspark `col` function to a string for every later cell.
for current_name, leaver_name in (
    ('students.json', 'students_leaver.json'),
    ('students_extended.json', 'students_leaver_extended.json'),
):
    current_df = json_dfs.get(current_name)
    if current_df is None:
        # Nothing to union onto; skip this pair instead of raising KeyError
        print(f"{current_name} not found in json_dfs, skipping the union with {leaver_name}.")
        continue

    leaver_df = json_dfs.get(leaver_name)
    if leaver_df is None:
        # No leavers were loaded: keep the current students, still sorted and de-duplicated
        print(f"{leaver_name} not found in json_dfs, keeping {current_name} without leavers.")
        combined_df = current_df.select(sorted(current_df.columns))
    else:
        only_in_current = set(current_df.columns) - set(leaver_df.columns)
        only_in_leaver = set(leaver_df.columns) - set(current_df.columns)

        # Add the columns each side lacks as nulls cast to the other side's type
        for column_name in only_in_current:
            leaver_df = leaver_df.withColumn(
                column_name, lit(None).cast(current_df.schema[column_name].dataType)
            )
        for column_name in only_in_leaver:
            current_df = current_df.withColumn(
                column_name, lit(None).cast(leaver_df.schema[column_name].dataType)
            )

        # Ensure the columns are in the same order for both DataFrames
        current_df = current_df.select(sorted(current_df.columns))
        leaver_df = leaver_df.select(sorted(leaver_df.columns))

        combined_df = current_df.unionByName(leaver_df)

    # Update the dictionary with the combined, de-duplicated DataFrame
    json_dfs[current_name] = combined_df.dropDuplicates(['unique_key'])

In [None]:
def add_missing_columns(df_to_adjust, df_reference):
    """
    Add to df_to_adjust every column that df_reference has and it lacks.

    Each added column is filled with nulls cast to the data type the column has in
    df_reference. Columns are appended in the order they appear in df_reference, so the
    resulting column order is the same on every run (iterating a set of names would not
    guarantee that).

    Args:
        df_to_adjust (DataFrame): The DataFrame to extend.
        df_reference (DataFrame): The DataFrame whose columns and types are the target.

    Returns:
        DataFrame: df_to_adjust with the missing columns appended; the same object is
        returned when nothing is missing.
    """
    present = set(df_to_adjust.columns)
    for column in df_reference.columns:
        if column in present:
            continue
        df_to_adjust = df_to_adjust.withColumn(
            column, lit(None).cast(df_reference.schema[column].dataType)
        )
        present.add(column)
    return df_to_adjust

In [None]:
from pyspark.sql.functions import col
# Process each DataFrame and upsert it to the silver_path.
#
# No loop or comprehension variable in this cell is named `col`: a cell-level
# `for col in ...` rebinds pyspark's col() to a string, after which every
# col(...) call in later iterations fails.
for json_name, df in json_dfs.items():

    if json_name in delta_table_name_mapping and delta_table_name_mapping[json_name] != "":
        if df.count() > 0:
            # Get the Delta table name from the mapping
            delta_table_name = delta_table_name_mapping[json_name]
            silver_table_path = f"{silver_path}/{delta_table_name}"
            uuid_column_name = oeai.get_uuid_column_name(delta_table_name)
            print(delta_table_name)
            # Define the unique key column name
            unique_key_column = "unique_key"

            if delta_table_name == "dim_Organisation":
                if DeltaTable.isDeltaTable(spark, silver_table_path):
                    delta_table = DeltaTable.forPath(spark, silver_table_path)

                    # Keep only the merge key and the existing key from the target,
                    # renaming 'organisationkey' to 'target_organisationkey'
                    target_df = delta_table.toDF().select(unique_key_column, col("organisationkey").alias("target_organisationkey"))

                    # Alias the source DataFrame as 'source'
                    source_df = df.alias("source")

                    # Left join so that every source row is kept, matched or not
                    df_with_keys = source_df.join(
                        target_df,
                        source_df[unique_key_column] == target_df[unique_key_column],
                        how="left"
                    ).select(
                        # Select all columns from 'source' EXCEPT 'organisationkey' if it exists
                        *[source_df[name].alias(name) for name in source_df.columns if name != "organisationkey"],
                        # Coalesce to get 'organisationkey' from 'target' if it exists, or generate a new one
                        coalesce(col("target_organisationkey"), expr("uuid()")).alias("organisationkey")
                    )

                    # Now perform the merge operation
                    delta_table.alias("target").merge(
                        df_with_keys.alias("source"),
                        f"target.{unique_key_column} = source.{unique_key_column}"
                    ).whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()

                else:
                    # If the table does not exist, create it by writing the current DataFrame
                    # First, add a column for the organisationkey for all records since this is a new table
                    df = df.withColumn("organisationkey", expr("uuid()"))
                    df.write.format("delta").mode("overwrite").save(silver_table_path)
            else:
                # Process student table before all others:
                if delta_table_name == "dim_Student":
                    if DeltaTable.isDeltaTable(spark, silver_table_path):
                        delta_table = DeltaTable.forPath(spark, silver_table_path)
                        try:
                            # ------------------------------
                            # First, get the organisationkey
                            # ------------------------------
                            # Read the dim_Organisation table
                            dim_org_df = spark.read.format("delta").load(f"{silver_path}/dim_Organisation").select("external_id", "organisationkey")

                            df = df.drop("organisationkey")

                            # Perform a left join to find existing organisation keys
                            df_joined = df.alias("source").join(
                                dim_org_df.alias("dim"),
                                col("source.school_id") == col("dim.external_id"),
                                "left"
                            )

                            # Select all columns from df (source) and only the 'organisationkey' from dim_Organisation (dim)
                            df_with_orgkey = df_joined.select(
                                *[col(f"source.{col_name}") for col_name in df.columns],
                                col("dim.organisationkey")
                            )

                            # ------------------------------
                            # Second, update or insert the record
                            # ------------------------------
                            # generate a studentkey, this will be ignored if the record matches on the uniquekey column but inserted if it is a new record
                            df_with_orgkey = df_with_orgkey.withColumn(uuid_column_name, expr("uuid()"))

                            target_columns = DeltaTable.forPath(spark, silver_table_path).toDF().columns
                            # Filter source DataFrame columns based on target DataFrame columns
                            source_columns = df_with_orgkey.columns
                            columns_to_keep = [name for name in source_columns if name in target_columns]
                            # Select only the matching columns from the source DataFrame
                            df_adjusted = df_with_orgkey.select(*columns_to_keep)

                            # Update every shared column except the keys, so matched rows keep their existing keys
                            update_columns = {name: f"source.{name}" for name in df_adjusted.columns if name not in ['organisationkey', uuid_column_name]}

                            delta_table.alias("target").merge(
                                df_adjusted.alias("source"),
                                f"target.{unique_key_column} = source.{unique_key_column}"
                            ).whenMatchedUpdate(set=update_columns
                            ).whenNotMatchedInsertAll().execute()

                        except Exception as e:
                            # Best effort: report the failure and carry on with the remaining tables
                            print(delta_table_name)
                            df.printSchema()
                            print(e)
                    else:
                        # If the table does not exist, create it by writing the current DataFrame
                        df = df.withColumn(uuid_column_name, expr("uuid()"))
                        # Load the dim_Organisation table to get the existing mappings
                        dim_org_df = spark.read.format("delta").load(f"{silver_path}/dim_Organisation").select("external_id", "organisationkey")
                        # Perform a left join to find existing organisation keys
                        df_joined = df.alias("source").join(
                            dim_org_df.alias("dim"),
                            col("source.school_id") == col("dim.external_id"),
                            "left"
                        )
                        # Select all columns from df and only the 'organisationkey' from the dim_Organisation
                        df_with_keys = df_joined.select("source.*", col("dim.organisationkey").alias("dim_organisationkey"))
                        # Fill in the missing keys with UUIDs
                        df_complete = df_with_keys.withColumn(
                            "organisationkey",
                            when(col("dim_organisationkey").isNull(), expr("uuid()")).otherwise(col("dim_organisationkey"))
                        )
                        # Drop the 'dim_organisationkey' as it is no longer needed
                        df_final = df_complete.drop("dim_organisationkey")
                        df_final.write.format("delta").mode("overwrite").save(silver_table_path)
                else: # if not the organisation or student table

                    if ('studentkey' in df.columns) and (delta_table_name != "dim_Student"):
                        # ------------------------------
                        # First, get the organisationkey
                        # ------------------------------
                        # Read the dim_Organisation table
                        dim_org_df = spark.read.format("delta").load(f"{silver_path}/dim_Organisation").select("external_id", "organisationkey")

                        df = df.drop("organisationkey")

                        # Perform a left join to find existing organisation keys
                        df_joined = df.alias("source").join(
                            dim_org_df.alias("dim"),
                            col("source.school_id") == col("dim.external_id"),
                            "left"
                        )

                        # Select all columns from df (source) and only the 'organisationkey' from dim_Organisation (dim)
                        df_with_orgkey = df_joined.select(
                            *[col(f"source.{col_name}") for col_name in df.columns],
                            col("dim.organisationkey")
                        )

                        # --------------------------
                        # second, get the studentkey
                        # --------------------------

                        dim_student_df = spark.read.format("delta").load(f"{silver_path}/dim_Student").select("student_id", "organisationkey", "studentkey")
                        # Rename the 'studentkey' column from dim_student_df to avoid ambiguity
                        dim_student_df = dim_student_df.withColumnRenamed("studentkey", "dim_studentkey")

                        # Left join on student_id and organisationkey, compared trimmed and lower-cased
                        df_studjoined = df_with_orgkey.alias("source").join(
                            dim_student_df.alias("dim"),
                            (trim(lower(col("source.student_id"))) == trim(lower(col("dim.student_id")))) &
                            (trim(lower(col("source.organisationkey"))) == trim(lower(col("dim.organisationkey")))),
                            "left"
                        )

                        # Use when() to decide which studentkey to keep
                        df_both_keys = df_studjoined.withColumn("studentkey", 
                                                when(col("dim.dim_studentkey").isNull(), col("source.studentkey"))
                                                .otherwise(col("dim.dim_studentkey"))
                                                ) \
                                    .drop("dim.dim_studentkey") \
                                    .select("source.*", "studentkey")

                        df = df_both_keys
                        # if the studentkey lookup has failed to find a student record then remove the related record for referential integrity
                        df = df.filter((col("studentkey").isNotNull()) & (col("studentkey") != ""))

                        # Wonde occasionally passes 2 records for a summary attendance for a single student, which would break
                        if delta_table_name == "fact_AttendanceSummary":
                            df = df.dropDuplicates(['studentkey'])

                    # -------------------------------------------------------------------
                    # Now that any table with student_id in it has studentkey continue...
                    # -------------------------------------------------------------------

                    # Set the update columns to update everything other than organisationkey and this table's UUID key column
                    update_columns = {name: f"source.{name}" for name in df.columns if name not in ['organisationkey', uuid_column_name]}

                    if DeltaTable.isDeltaTable(spark, silver_table_path):
                        delta_table = DeltaTable.forPath(spark, silver_table_path)
                        target_df = delta_table.toDF()

                        # Identify columns in source not in target
                        new_columns = set(df.columns) - set(target_df.columns)

                        if new_columns:
                            # Add new columns with nulls to the target DataFrame
                            for new_col in new_columns:
                                target_df = target_df.withColumn(new_col, lit(None).cast(df.schema[new_col].dataType))

                            # Create a new Delta table with the updated schema from the target DataFrame
                            new_table_path = silver_table_path + "_new"

                            if DeltaTable.isDeltaTable(spark, new_table_path):
                                print(f"Table at {new_table_path} exists. Deleting...")
                                deltaTable = DeltaTable.forPath(spark, new_table_path)
                                deltaTable.delete()

                            target_df.write.format("delta").mode("overwrite").option("mergeSchema", "true").save(new_table_path)

                            # Prepare the new Delta table for merging
                            new_delta_table = DeltaTable.forPath(spark, new_table_path)

                            # Adjust the source DataFrame to match the target schema, including any new columns
                            df_adjusted = add_missing_columns(df, target_df)

                            # Perform the merge operation
                            new_delta_table.alias("target").merge(
                                df_adjusted.alias("source"),
                                f"target.unique_key = source.unique_key"
                            ).whenMatchedUpdate(set=update_columns
                            ).whenNotMatchedInsertAll().execute()

                            # Overwrite the old table with the new table's data
                            spark.read.format("delta").load(new_table_path).write.format("delta").option("overwriteSchema", "true").mode("overwrite").save(silver_table_path)

                            # Consider cleaning up the new_table_path if necessary

                        else:
                            # Retrieve the schema of both DataFrames
                            target_columns = delta_table.toDF().columns
                            source_columns = df.columns

                            # Identify columns that are in the target but not in the source
                            missing_columns = [name for name in target_columns if name not in source_columns]

                            # Add missing columns to the source DataFrame as nulls of the target's type.
                            # The loop variable must not be `col` (see the note at the top of the cell).
                            for missing_col in missing_columns:
                                df = df.withColumn(missing_col, lit(None).cast(target_df.schema[missing_col].dataType))

                            # Perform your normal merge operation here since there are no new columns
                            delta_table.alias("target").merge(
                                df.alias("source"),
                                f"target.unique_key = source.unique_key"
                            ).whenMatchedUpdate(set=update_columns
                            ).whenNotMatchedInsertAll().execute()

                    else:
                        # If the table does not exist, create it by writing the current DataFrame
                        # First, generate a UUID for all records in the new UUID column
                        df = df.withColumn(uuid_column_name, expr("uuid()"))

                        if ('studentkey' not in df.columns): # because we have already added organisationkey to that

                            # Load the dim_Organisation table to get the existing mappings
                            dim_org_df = spark.read.format("delta").load(f"{silver_path}/dim_Organisation").select("external_id", "organisationkey")

                            # Perform a left join to find existing organisation keys
                            df_joined = df.alias("source").join(
                                dim_org_df.alias("dim"),
                                col("source.school_id") == col("dim.external_id"),
                                "left"
                            )

                            # Select all columns from df and only the 'organisationkey' from the dim_Organisation
                            # Alias the dim_Organisation's organisationkey to avoid ambiguity
                            df_with_keys = df_joined.select("source.*", col("dim.organisationkey").alias("dim_organisationkey"))

                            # Fill in the missing keys with UUIDs
                            # Ensure to use the aliased column name 'dim_organisationkey' to avoid ambiguity
                            df_complete = df_with_keys.withColumn(
                                "organisationkey",
                                when(col("dim_organisationkey").isNull(), expr("uuid()")).otherwise(col("dim_organisationkey"))
                            )

                            # Drop the 'dim_organisationkey' as it is no longer needed
                            df = df_complete.drop("dim_organisationkey")


                        # debug, show 20 records
                        # NOTE(review): this prints untruncated rows into the cell output; confirm that is acceptable for these tables
                        df.show(n=20, truncate=False)

                        df.write.format("delta").mode("overwrite").save(silver_table_path)
        else:
            print(f"DataFrame for {json_name} is empty. Skipping processing.")

In [None]:
from pyspark.sql.functions import coalesce, col, when, isnan, lit

# Update dim_StudentExtended for leavers: copy Leaving_Date from students_leaver.json
# onto the matching dim_StudentExtended rows (matched on unique_key).
try:
    # Join with exactly one separator so the path is right whether or not silver_path
    # ends in "/" (the upsert cell builds table paths as f"{silver_path}/{table}").
    deltaTablePath = f"{silver_path.rstrip('/')}/dim_StudentExtended"
    df = spark.read.format("delta").load(deltaTablePath)

    df_students_leaver = json_dfs['students_leaver.json']
    leaver_count = df_students_leaver.count()

    if leaver_count > 0:
        # Make sure both sides have a 'Leaving_Date' column, adding it as nulls if absent
        if 'Leaving_Date' not in df.columns:
            df = df.withColumn('Leaving_Date', lit(None))

        if 'Leaving_Date' not in df_students_leaver.columns:
            df_students_leaver = df_students_leaver.withColumn('Leaving_Date', lit(None))

        # Left join so every existing row is kept; leavers bring 'new_leaving_date'
        df_joined = df.join(
                df_students_leaver.select('unique_key', 'Leaving_Date').withColumnRenamed('Leaving_Date', 'new_leaving_date'),
                on='unique_key',
                how='left'
            )

        # Treat the string 'None' and NaN as "no value"
        df_joined = df_joined.withColumn(
            'new_leaving_date', 
            when((col('new_leaving_date') != 'None') & (~isnan(col('new_leaving_date'))), col('new_leaving_date'))
            .otherwise(None)
        )

        # Apply coalesce to update 'Leaving_Date' only if 'new_leaving_date' has a valid (non-null) value
        df_joined = df_joined.withColumn(
            'Leaving_Date', 
            coalesce('new_leaving_date', 'Leaving_Date')
        ).drop('new_leaving_date')

        df_joined.write.format("delta").option("mergeSchema", "true").mode("overwrite").save(deltaTablePath)
    else:
        # If df_students_leaver is empty, log a message or perform alternative actions
        print("No records in 'students_leaver.json', skipping update.")

except Exception as e:
    print(f"An error occurred during the update operation: {e}")


In [None]:
from pyspark.sql import functions as F

# Update dim_GroupMembership for groupkey: replace each membership row's groupkey with
# the groupkey of the dim_Group row whose unique_key equals school_id + Group_ID.
try:
    # Join with exactly one separator so the paths are right whether or not silver_path
    # ends in "/" (the upsert cell builds table paths as f"{silver_path}/{table}").
    silver_root = silver_path.rstrip("/")

    deltaTablePath = f"{silver_root}/dim_Group"
    df = (
        spark.read.format("delta").load(deltaTablePath)
        .withColumnRenamed('unique_key', 'unique_group_id')
        .withColumnRenamed('groupkey', 'new_groupkey')
    )

    membershipTablePath = f"{silver_root}/dim_GroupMembership"
    df_groupmembership = spark.read.format("delta").load(membershipTablePath)
    # Build the same composite id on the membership side to join on
    df_groupmembership = df_groupmembership.withColumn("unique_group_id", F.concat(F.col("school_id"), F.col("Group_ID")))

    # Left join keeps every membership row, even when no group matches
    df_joined = df_groupmembership.join(df.select('unique_group_id', 'new_groupkey'), on='unique_group_id', how='left')

    # Keep the membership columns, replacing 'groupkey' with the value from the join
    columns_to_select = [F.col(c) for c in df_groupmembership.columns if c != 'groupkey'] + [F.col('new_groupkey').alias('groupkey')]
    df_updated = df_joined.select(*columns_to_select)

    # The helper join column is not part of the stored table
    df_updated = df_updated.drop('unique_group_id')

    df_updated.write.format("delta").mode("overwrite").save(membershipTablePath)

except Exception as e:
    print(f"An error occurred during the update operation: {e}")

In [None]:
from pyspark.sql.functions import col

# Clean dim_Group: drop rows whose groupkey is null, empty or "0", then keep one row per groupkey.
try:
    # Join with exactly one separator so the path is right whether or not silver_path
    # ends in "/" (the upsert cell builds table paths as f"{silver_path}/{table}").
    deltaTablePath = f"{silver_path.rstrip('/')}/dim_Group"
    df = spark.read.format("delta").load(deltaTablePath)
    df = df.filter(
        ~(
            col('groupkey').isNull() |  # Checks for null
            (col('groupkey') == "") |  # Checks for empty strings (valid for string types)
            (col('groupkey').cast("string") == "0")  # Casts to string to safely check for "0"
        )
    )
    df = df.dropDuplicates(['groupkey'])
    df.write.format("delta").mode("overwrite").save(deltaTablePath)
except Exception as e:
    print(f"An error occurred during the update operation: {e}")

In [None]:
# get the Year_Group
# Rebuild dim_Student.Year_Group from each student's dim_GroupMembership row of Group_Type "YEAR".

# Join with exactly one separator so the paths are right whether or not silver_path
# ends in "/" (the upsert cell builds table paths as f"{silver_path}/{table}").
silver_root = silver_path.rstrip("/")
deltaTablePath = f"{silver_root}/dim_Student"
membershipTablePath = f"{silver_root}/dim_GroupMembership"

df_student = spark.read.format("delta").load(deltaTablePath)
df_groupmembership = spark.read.format("delta").load(membershipTablePath)

# Drop existing Year_Group column
df_student = df_student.drop("Year_Group")

# Left join on school_id and student_id, restricted to YEAR groups, so students
# without a YEAR membership are kept with a null Year_Group
df_joined = df_student.alias("student").join(
    df_groupmembership.alias("groupmembership"),
    (col("student.school_id") == col("groupmembership.school_id")) &
    (col("student.student_id") == col("groupmembership.student_id")) &
    (col("groupmembership.Group_Type") == "YEAR"),
    "left"
).select(
    "student.*",  # Select all columns from df_student
    col("groupmembership.Group_Name").alias("Year_Group")  # Rename and select the Group_Name as Year_Group
)

# Count the number of rows in the DataFrame
row_count = df_joined.count()

print(f"Number of rows in the DataFrame: {row_count}")

# Count the number of non-empty entries in the 'Year_Group' column
non_empty_year_group_count = df_joined.filter((col("Year_Group").isNotNull()) & (col("Year_Group") != "")).count()

print(f"Number of non-empty entries in the 'Year_Group' column: {non_empty_year_group_count}")

# Write the result back over dim_Student
df_joined.write.format("delta").mode("overwrite").save(deltaTablePath)

In [None]:
# Drop duplicates based on studentkey in StudentExtended

# Join with exactly one separator so the path is right whether or not silver_path
# ends in "/" (the upsert cell builds table paths as f"{silver_path}/{table}").
deltaTablePath = f"{silver_path.rstrip('/')}/dim_StudentExtended"
df = spark.read.format("delta").load(deltaTablePath)

# Keep a single row per studentkey and write the table back in place
df = df.dropDuplicates(["studentkey"])

df.write.format("delta").mode("overwrite").save(deltaTablePath)


In [None]:
# Drop duplicates based on studentkey in Student

# Join with exactly one separator so the path is right whether or not silver_path
# ends in "/" (the upsert cell builds table paths as f"{silver_path}/{table}").
deltaTablePath = f"{silver_path.rstrip('/')}/dim_Student"
df = spark.read.format("delta").load(deltaTablePath)

# Keep a single row per studentkey and write the table back in place
df = df.dropDuplicates(["studentkey"])

df.write.format("delta").mode("overwrite").save(deltaTablePath)


In [None]:
# Drop duplicates in attendancesummary

try:
    # Join with exactly one separator so the path is right whether or not silver_path
    # ends in "/" (the upsert cell builds table paths as f"{silver_path}/{table}").
    deltaTablePath = f"{silver_path.rstrip('/')}/fact_AttendanceSummary"
    df_attendancesummary = spark.read.format("delta").load(deltaTablePath)
    # One row per attendancesummarykey, then one row per studentkey
    df_attendancesummary = df_attendancesummary.dropDuplicates(['attendancesummarykey'])
    df_attendancesummary = df_attendancesummary.dropDuplicates(['studentkey'])
    df_attendancesummary.write.format("delta").mode("overwrite").save(deltaTablePath)

except Exception as e:
    print(f"An error occurred during the update operation: {e}")