In [None]:
from pyspark.sql.functions import col
"""
Provides data processing methods for Community Brands Xporter data.
Data is expected to be received via Xporter into oea/Transactional
The structure of the folders in oea (Primary) will then be something like:
    -> oea/Transactional/xporter/[EstabId]
        -> oea/Transactional/xporter/[EstabId]/SchoolInfo.json
        -> oea/Transactional/xporter/[EstabId]/Students.json
        etc.
The prepared data is landed in stage1/Transactional/xporter and then ingested and refined into stage2/[Ingested/refined]/xporter
"""
class Xporter():
    def __init__(self, source_folder='xporter', pseudonymize = True):
        """
        Sets the OEA workspace to 'prod' and builds the per-entity schema definitions.

        self.schemas maps an entity name to a list of [column_name, data_type, action] entries.
        These lists are what the ingest_* methods of this class pass to oea.refine.
        The action values used below are 'no-op', 'mask' and 'hash'
        (presumably interpreted by oea.refine as pseudonymization operations - TODO confirm).

        NOTE(review): source_folder and pseudonymize are accepted but not used in this method.
        """
        self.set_oea_workspace('prod')
        self.schemas = {}
        # School level information; SchoolID is the column marked 'hash'.
        self.schemas['schoolinfo'] = [
                                        ['CurrentAcademicYear', 'string', 'no-op'],
                                        ['DeniNo', 'string', 'no-op'],
                                        ['Email', 'string', 'no-op'],
                                        ['EstabId', 'string', 'no-op'],
                                        ['ExamCentre', 'string', 'no-op'],
                                        ['Governance', 'string', 'no-op'],
                                        ['Head', 'string', 'no-op'],
                                        ['Id', 'string', 'no-op'],
                                        ['LastUpdated', 'string', 'no-op'],
                                        ['MainContact', 'string', 'no-op'],
                                        ['Name', 'string', 'no-op'],
                                        ['Phase', 'string', 'no-op'],
                                        ['RowHash', 'string', 'mask'],
                                        ['SchoolLogoAlternateUrl', 'string', 'no-op'],
                                        ['SchoolLogoUrl', 'string', 'no-op'],
                                        ['Telephone', 'string', 'no-op'],
                                        ['Web', 'string', 'no-op'],
                                        ['SchoolKey', 'string', 'no-op'],
                                        ['SchoolID', 'string', 'hash'],
                                        ['Address', 'string', 'no-op']]

        # Student records; UniqueStudentId is the column marked 'hash', DateofBirth is marked 'mask'.
        self.schemas['students'] = [    ['AdmissionNo', 'string', 'no-op'],
                                        ['Apartment', 'string', 'no-op'],
                                        ['AsylumStatus', 'string', 'no-op'],
                                        ['AttMarksEndDate', 'string', 'no-op'],
                                        ['AttMarksStartDate', 'string', 'no-op'],
                                        ['Boarder', 'string', 'no-op'],
                                        ['BoardingHouse', 'string', 'no-op'],
                                        ['CandidateNo', 'string', 'no-op'],
                                        ['Country', 'string', 'no-op'],
                                        ['CountryOfBirth', 'string', 'no-op'],
                                        ['CountryOfBirthCode', 'string', 'no-op'],
                                        ['County', 'string', 'no-op'],
                                        ['DateofBirth', 'string', 'mask'],
                                        ['Destination', 'string', 'no-op'],
                                        ['DestinationStartDate', 'string', 'no-op'],
                                        ['Disabled', 'string', 'no-op'],
                                        ['DisplayName', 'string', 'no-op'],
                                        ['District', 'string', 'no-op'],
                                        ['EAL', 'string', 'no-op'],
                                        ['EnglishProficiencyLevel', 'string', 'no-op'],
                                        ['EnglishProficiencyLevelCode', 'string', 'no-op'],
                                        ['EnrolmentStatus', 'string', 'no-op'],
                                        ['EntryDate', 'string', 'no-op'],
                                        ['Ethnicity', 'string', 'no-op'],
                                        ['EthnicityCode', 'string', 'no-op'],
                                        ['EthnicitySource', 'string', 'no-op'],
                                        ['EverInCare', 'string', 'no-op'],
                                        ['ExternalId', 'string', 'no-op'],
                                        ['FSMEver6', 'string', 'no-op'],
                                        ['FirstLanguage', 'string', 'no-op'],
                                        ['FirstLanguageCode', 'string', 'no-op'],
                                        ['FirstLanguageSource', 'string', 'no-op'],
                                        ['Forename', 'string', 'no-op'],
                                        ['FsmEligible', 'string', 'no-op'],
                                        ['FsmStartDate', 'string', 'no-op'],
                                        ['FsmEndDate', 'string', 'no-op'],
                                        ['Gender', 'string', 'no-op'],
                                        ['Gifted', 'string', 'no-op'],
                                        ['HomeLanguage', 'string', 'no-op'],
                                        ['HomeLanguageCode', 'string', 'no-op'],
                                        ['HouseGroup', 'string', 'no-op'],
                                        ['HouseGroupId', 'string', 'no-op'],
                                        ['HouseName', 'string', 'no-op'],
                                        ['HouseNo', 'string', 'no-op'],
                                        ['Id', 'string', 'no-op'],
                                        ['IdaasEmail', 'string', 'no-op'],
                                        ['IdaasId', 'string', 'no-op'],
                                        ['InLeaCare', 'string', 'no-op'],
                                        ['IsTraveller', 'string', 'no-op'],
                                        ['IsYoungCarer', 'string', 'no-op'],
                                        ['KeyStage', 'string', 'no-op'],
                                        ['LastUpdated', 'string', 'no-op'],
                                        ['LeaCareAuthority', 'string', 'no-op'],
                                        ['LeavingDate', 'string', 'no-op'],
                                        ['LeavingRegGroup', 'string', 'no-op'],
                                        ['LeavingYearGroup', 'string', 'no-op'],
                                        ['LegalForename', 'string', 'no-op'],
                                        ['LegalSurname', 'string', 'no-op'],
                                        ['Marks', 'string', 'no-op'],
                                        ['MiddleName', 'string', 'no-op'],
                                        ['ModeOfTravel', 'string', 'no-op'],
                                        ['NCYear', 'string', 'no-op'],
                                        ['NationalIdentity', 'string', 'no-op'],
                                        ['Nationality', 'string', 'no-op'],
                                        ['OnReport', 'string', 'no-op'],
                                        ['ParentalSalutation', 'string', 'no-op'],
                                        ['PartTime', 'string', 'no-op'],
                                        ['PostCode', 'string', 'no-op'],
                                        ['PreviousLegalSurname', 'string', 'no-op'],
                                        ['PupilPremium', 'string', 'no-op'],
                                        ['QuickNote', 'string', 'no-op'],
                                        ['ReasonForLeaving', 'string', 'no-op'],
                                        ['RegGroup', 'string', 'no-op'],
                                        ['RegGroupId', 'string', 'no-op'],
                                        ['Religion', 'string', 'no-op'],
                                        ['ReligionCode', 'string', 'no-op'],
                                        ['RowHash', 'string', 'no-op'],
                                        ['SENProvision', 'string', 'no-op'],
                                        ['ServiceChild', 'string', 'no-op'],
                                        ['ServiceChildSource', 'string', 'no-op'],
                                        ['StandardYearGroupCode', 'string', 'no-op'],
                                        ['StandardYearGroupName', 'string', 'no-op'],
                                        ['Street', 'string', 'no-op'],
                                        ['StudentStatus', 'string', 'no-op'],
                                        ['Surname', 'string', 'no-op'],
                                        ['TownOrCity', 'string', 'no-op'],
                                        ['TravellerSource', 'string', 'no-op'],
                                        ['UPN', 'string', 'no-op'],
                                        ['UniformAllowance', 'string', 'no-op'],
                                        ['UniqueLearnerNumber', 'string', 'no-op'],
                                        ['WorkEmail', 'string', 'no-op'],
                                        ['XID', 'string', 'no-op'],
                                        ['YSSA', 'string', 'no-op'],
                                        ['YearGroup', 'string', 'no-op'],
                                        ['YearGroupId', 'string', 'no-op'],
                                        ['YearTaughtIn', 'string', 'no-op'],
                                        ['formerUPN', 'string', 'no-op'],
                                        ['SchoolID', 'string', 'no-op'],
                                        ['AddressBlock', 'string', 'no-op'],
                                        ['UniqueStudentId', 'string', 'hash']
                                        ] 


        # Attendance summary; UniqueAttendanceId is the column marked 'hash', Marks is marked 'mask'.
        self.schemas['attendancesummary'] = [['Id', 'string', 'no-op'],
                                            ['XID', 'string', 'no-op'],
                                            ['MIS_ID', 'string', 'no-op'],
                                            ['IdaasId', 'string', 'no-op'],
                                            ['AttStatsStartDate', 'string', 'no-op'],
                                            ['AttStatsEndDate', 'string', 'no-op'],
                                            ['NumPossMarks', 'string', 'no-op'],
                                            ['NumPresMarks', 'string', 'no-op'],
                                            ['NumAEAMarks', 'string', 'no-op'],
                                            ['NumAuthAbsMarks', 'string', 'no-op'],
                                            ['NumUnauthAbsMarks', 'string', 'no-op'],
                                            ['NumMissMarks', 'string', 'no-op'],
                                            ['NumLateMarks', 'string', 'no-op'],
                                            ['NumLateBeforeRegMarks', 'string', 'no-op'],
                                            ['Marks', 'string', 'mask'],
                                            ['UniqueAttendanceId', 'string', 'hash'],
                                            ['SchoolID', 'string', 'no-op']] 

        # Groups; UniqueGroupId is the column marked 'hash'.
        self.schemas['groups'] = [['Code', 'string', 'no-op'],
                                ['ExternalId', 'string', 'no-op'],
                                ['Id', 'string', 'no-op'],
                                ['IdaasId', 'string', 'no-op'],
                                ['LastUpdated', 'string', 'no-op'],
                                ['Name', 'string', 'no-op'],
                                ['NumStudents', 'string', 'no-op'],
                                ['PlatformId', 'string', 'no-op'],
                                ['PrimaryStaffId', 'string', 'no-op'],
                                ['RowHash', 'string', 'mask'],
                                ['Staff', 'string', 'no-op'],
                                ['Type', 'string', 'no-op'],
                                ['XID', 'string', 'no-op'],
                                ['SchoolID', 'string', 'no-op'],
                                ['UniqueGroupId', 'string', 'hash']] 

        # Historical attendance summary; UniqueHistoricalAttendanceId is the column marked 'hash'.
        self.schemas['HistoricalAttendanceSummary'] = [['EndDate', 'string', 'no-op'],
                                                ['Id','string', 'no-op'],
                                                ['SchoolYear', 'string', 'no-op'],
                                                ['StartDate', 'string', 'no-op'],
                                                ['StudentId', 'string', 'no-op'],
                                                ['SchoolID', 'string', 'no-op'],
                                                ['UniqueHistoricalAttendanceId', 'string', 'hash'],
                                                ['Marks', 'string', 'no-op']]

        # Staff records; UniqueStaffId is the column marked 'hash'.
        self.schemas['staff'] = [["Apartment","string", 'no-op'],
                                ["Country","string", 'no-op'],
                                ["County","string", 'no-op'],
                                ["DateOfBirth", "string", 'no-op'],
                                ["DisplayName", 'string', 'no-op'],
                                ["District","string", 'no-op'],
                                ["EmploymentEnd","string", 'no-op'],
                                ["EmploymentStart", 'string', 'no-op'],
                                ["ExternalId","string", 'no-op'],
                                ["Forename","string", 'no-op'],
                                ["Gender","string", 'no-op'],
                                ["HomeEmail", 'string', 'no-op'],
                                ["HomePhone", 'string', 'no-op'],
                                ["HouseName", 'string', 'no-op'],
                                ["HouseNo", 'string', 'no-op'],
                                ["Id", 'string', 'no-op'],
                                ["IdaasEmail", "string", 'no-op'],
                                ["IdaasId", "string", 'no-op'],
                                ["IsSupply", "string", 'no-op'],
                                ["IsSupport", 'string', 'no-op'],
                                ["IsTeacher", "string", 'no-op'],
                                ["LastUpdated", "string", 'no-op'],
                                ["LegalForename", "string", 'no-op'],
                                ["LegalSurname", 'string', 'no-op'],
                                ["MiddleName", "string", 'no-op'],
                                ["MobilePhone", "string", 'no-op'],
                                ["NINumber", "string", 'no-op'],
                                ["PayrollNumber", "string", 'no-op'],
                                ["PostCode", 'string', 'no-op'],
                                ["RegGroup", "string", 'no-op'],
                                ["RoleCodes", "string", 'no-op'],
                                ["Roles", 'string', 'no-op'],
                                ["RowHash", 'string', 'mask'],
                                ["StaffCode", "string", 'no-op'],
                                ["StaffStatus", 'string', 'no-op'],
                                ["Street", "string", 'no-op'],
                                ["Suffix", 'string', 'no-op'],
                                ["Surname", "string", 'no-op'],
                                ["TeacherCategory", "string", 'no-op'],
                                ["TeacherNumber", 'string', 'no-op'],
                                ["Title", 'string', 'no-op'],
                                ["TownOrCity", 'string', 'no-op'],
                                ["WorkEmail", 'string', 'no-op'],
                                ["WorkPhone", "string", 'no-op'],
                                ["XID", "string", 'no-op'],
                                ['SchoolID', 'string', 'no-op'],
                                ['UniqueStaffId','string','hash'],
                                ["AddressBlock",'string', 'no-op']]


        # Student group memberships; UniqueStudentMemberId is the column marked 'hash'.
        self.schemas['StudentMembers'] = [['EndDate', 'string', 'no-op'],
                                        ['GroupExternalId', 'string', 'no-op'],
                                        ['GroupId', 'string', 'no-op'],
                                        ['GroupIdaasId', 'string', 'no-op'],
                                        ['Id', 'string', 'no-op'],
                                        ['LastUpdated', 'string', 'no-op'],
                                        ['RowHash', 'string', 'mask'],
                                        ['StartDate', 'string', 'no-op'],
                                        ['StudentExternalId', 'string', 'no-op'],
                                        ['StudentId', 'string', 'no-op'],
                                        ['StudentIdaasId', 'string', 'no-op'],
                                        ['SchoolID', 'string', 'no-op'],
                                        ['UniqueStudentId', 'string', 'no-op'],
                                        ['UniqueGroupId', 'string', 'no-op'],
                                        ['UniqueStudentMemberId', 'string', 'hash']]
 
    def set_oea_workspace(self, workspace_name):
        """ Selects the OEA workspace by passing workspace_name to oea.set_workspace """
        oea.set_workspace(workspace_name)
    
    def json_from_xporter(self, source_path, multiline):
        """
        Prints source_path and loads the JSON found there into a Spark DataFrame.

        multiline is forwarded unchanged as the reader's 'multiline' option.
        """
        print(source_path)
        return spark.read.load(source_path, format='json', multiline=multiline)
   
    def get_oea_path(self):
        """ Returns the abfss:// root URL of the 'oea' container in the configured storage account """
        return f'abfss://oea@{oea.storage_account}.dfs.core.windows.net'

    def land_to_stage3(self, data, entity_path, filename):
        """
        Writes data through oea.write to stage3/Transactional/<entity_path>/<filename>.

        Returns the sink path that was written to.
        """
        target = f'stage3/Transactional/{entity_path}/{filename}'
        oea.write(data, target)
        return target

    def overwrite_to_path(self, df, destination_path, save_format = "parquet", primary_key='id'):
        """
        De-duplicates df on primary_key and saves it to destination_path in overwrite mode.

        destination_path is resolved to a full URL with oea.to_url; save_format selects the
        writer format (parquet by default).
        """
        target_url = oea.to_url(destination_path)
        unique_rows = df.dropDuplicates([primary_key])
        writer = unique_rows.write.format(save_format).mode('overwrite')
        writer.save(target_url)

    def _prepare_schoolinfo(self):
        """
        Unions the SchoolInfo JSON of every school folder and lands it as one CSV in stage1.

        Every numeric folder under <oea path>/Transactional/xporter is read, its 'SchoolInfo'
        array is exploded into rows, and SchoolID / SchoolKey are set to the folder name.
        A folder that cannot be processed is reported and skipped so that one bad school does
        not stop the batch.

        Raises:
            ValueError: if no folder produced any SchoolInfo rows.
        """
        # Imported locally so the method does not depend on a notebook-level `F` alias.
        from pyspark.sql.functions import col, explode, lit
        xporter_root = self.get_oea_path() + '/Transactional/xporter'
        df_schoolinfo = None
        # loop through school EstabId folders landed by Xporter and union schoolinfo
        for folder in oea.get_folders(xporter_root):
            if not folder.isnumeric():
                continue
            print(folder)
            xporterPath = xporter_root + '/' + folder + '/SchoolInfo*'
            print(xporterPath)
            try:
                df = self.json_from_xporter(xporterPath, multiline = True)
                df.show()
                dfc = df.select(explode('SchoolInfo').alias('exploded_values')).select("exploded_values.*")
                dfc.show()
                # NOTE(review): Address is replaced by the RowHash value, as in the original
                # code - presumably a placeholder for the nested address; confirm this is intended.
                newdf = dfc.drop('Address').withColumn("Address", col("RowHash"))
                newdf = newdf.withColumn('SchoolID', lit(folder))
                newdf = newdf.withColumn('SchoolKey', lit(folder))
                if df_schoolinfo is None:
                    df_schoolinfo = newdf
                else:
                    # Match columns by name so differing column order cannot misalign values.
                    df_schoolinfo = df_schoolinfo.unionByName(newdf)
            except Exception as e:
                # Best effort per school, but never silently: report what was skipped and why.
                print(f'Skipping SchoolInfo for folder {folder}: {e}')

        if df_schoolinfo is None:
            raise ValueError(f'No SchoolInfo data could be read from {xporter_root}')

        csvString = df_schoolinfo.toPandas().to_csv(index=False)
        oea.land(csvString, 'xporter/SchoolInfo', 'SchoolInfo.csv', oea.DELTA_BATCH_DATA)

        df = oea.load_csv('stage1/Transactional/xporter/SchoolInfo')
        display(df)
        
    def ingest_schoolinfo(self):
        """ Ingests xporter/SchoolInfo keyed on SchoolID, then refines it with the 'schoolinfo' schema """
        entity = 'xporter/SchoolInfo'
        key_column = 'SchoolID'
        oea.ingest(entity, key_column)
        oea.refine(entity, self.schemas['schoolinfo'], key_column)


    def ingest_schoolinfo_stage3(self, ingestDatabaseName, refineDatabaseName):
        """
        Copies the school info tables into stage3/xporter as parquet.

        Reads <ingestDatabaseName>.schoolinfo and <refineDatabaseName>.schoolinfo_lookup,
        displays both, and overwrites stage3/xporter/schoolinfo_ingested and
        stage3/xporter/schoolinfo_refined, de-duplicated on SchoolID.
        """
        df_ingested = spark.sql(f"select * from {ingestDatabaseName}.schoolinfo")
        display(df_ingested)
        df_ingested.printSchema()
        df_refined = spark.sql(f"select * from {refineDatabaseName}.schoolinfo_lookup")
        display(df_refined)
        # writing files in parquet format; use this instance rather than a global named `xporter`
        self.overwrite_to_path(df_ingested, 'stage3/xporter/schoolinfo_ingested', save_format = "parquet", primary_key='SchoolID')
        self.overwrite_to_path(df_refined, 'stage3/xporter/schoolinfo_refined', save_format = "parquet", primary_key='SchoolID')
        
    
    def _prepare_students(self):
        """
        Unions the Students JSON of every school folder and lands it as one CSV in stage1.

        Every numeric folder under <oea path>/Transactional/xporter is read, its 'Students'
        array is exploded into rows, AddressBlock is replaced by the literal "undefined",
        SchoolID is set to the folder name and UniqueStudentId to "<SchoolID>_<IdaasId>".
        Rows whose UPN is null, "undefined" or not longer than 9 characters after trimming
        are dropped before landing. A folder that cannot be processed is reported and skipped.

        Raises:
            ValueError: if no folder produced any Students rows.
        """
        # Imported locally so the method does not depend on a notebook-level `F` alias.
        from pyspark.sql.functions import lit, concat, col, explode, isnull, trim, length
        xporter_root = self.get_oea_path() + '/Transactional/xporter'
        df_studinfo = None
        # loop through school EstabId folders landed by Xporter and union students
        for folder in oea.get_folders(xporter_root):
            if not folder.isnumeric():
                continue
            print(folder)
            xporterPath = xporter_root + '/' + folder + '/Students*'
            try:
                df = self.json_from_xporter(xporterPath, multiline = True)
                dfc = df.select(explode('Students').alias('exploded_values')).select("exploded_values.*")
                dfc = dfc.drop('AddressBlock').withColumn("AddressBlock", lit("undefined"))
                dfc = dfc.withColumn('SchoolID', lit(folder))
                newdf = dfc.withColumn("UniqueStudentId", concat(col("SchoolID"), lit("_"), col("IdaasId")))
                newdf.show()
                if df_studinfo is None:
                    df_studinfo = newdf
                else:
                    # Match columns by name so differing column order cannot misalign values.
                    df_studinfo = df_studinfo.unionByName(newdf)
            except Exception as e:
                # Best effort per school, but never silently: report what was skipped and why.
                print(f'Skipping Students for folder {folder}: {e}')

        if df_studinfo is None:
            raise ValueError(f'No Students data could be read from {xporter_root}')

        print('df_studinfo')
        df_studinfo.show()
        # Keep only rows with a usable UPN.
        df_studinfo = df_studinfo.filter(col("UPN") != "undefined")
        df_studinfo = df_studinfo.filter(~isnull(col("UPN")))
        df_studinfo = df_studinfo.filter((length(trim(col("UPN"))) > 9))
        csvString = df_studinfo.toPandas().to_csv(index=False)
        oea.land(csvString, 'xporter/Students', 'Students.csv', oea.DELTA_BATCH_DATA)
        df = oea.load_csv('stage1/Transactional/xporter/Students')
        display(df)
    

    def ingest_students(self):
        """ Ingests xporter/Students keyed on UniqueStudentId, then refines it with the 'students' schema """
        entity = 'xporter/Students'
        key_column = 'UniqueStudentId'
        oea.ingest(entity, key_column)
        oea.refine(entity, self.schemas['students'], key_column)
    
    
    def ingest_students_stage3(self, ingestDatabaseName, refineDatabaseName):
        """
        Copies the student tables into stage3/xporter as parquet.

        Reads <ingestDatabaseName>.students and <refineDatabaseName>.students_lookup,
        displays both, and overwrites stage3/xporter/students_ingested and
        stage3/xporter/students_refined, de-duplicated on UniqueStudentId.
        """
        df_ingested = spark.sql(f"select * from {ingestDatabaseName}.students")
        display(df_ingested)
        df_ingested.printSchema()
        df_refined = spark.sql(f"select * from {refineDatabaseName}.students_lookup")
        display(df_refined)
        # writing files in parquet format; use this instance rather than a global named `xporter`
        self.overwrite_to_path(df_ingested, 'stage3/xporter/students_ingested', save_format = "parquet", primary_key='UniqueStudentId')
        self.overwrite_to_path(df_refined, 'stage3/xporter/students_refined', save_format = "parquet", primary_key='UniqueStudentId')


    def _prepare_attendancesummary(self):
        """
        Unions the AttendanceSummary JSON of every school folder and lands it as one CSV in stage1.

        Every numeric folder under <oea path>/Transactional/xporter is read, its
        'AttendanceSummary' array is exploded into rows, AddressBlock is replaced by the
        literal "undefined", SchoolID is set to the folder name and UniqueAttendanceId to
        "<SchoolID>_<IdaasId>". A folder that cannot be processed is reported and skipped.

        Raises:
            ValueError: if no folder produced any AttendanceSummary rows.
        """
        # Imported locally so the method does not depend on a notebook-level `F` alias.
        from pyspark.sql.functions import lit, concat, col, explode
        # Use this instance (not a global named `xporter`) to resolve paths and read files.
        xporter_root = self.get_oea_path() + '/Transactional/xporter'
        df_attendancesummary = None
        for folder in oea.get_folders(xporter_root):
            if not folder.isnumeric():
                continue
            print(folder)
            xporterPath = xporter_root + '/' + folder + '/AttendanceSummary*'
            try:
                new_df = self.json_from_xporter(xporterPath, multiline = True)
                new_df = new_df.select(explode('AttendanceSummary').alias('exploded_values')).select("exploded_values.*")
                new_df = new_df.drop('AddressBlock').withColumn("AddressBlock", lit("undefined"))
                new_df = new_df.withColumn('SchoolID', lit(folder))
                new_df = new_df.withColumn("UniqueAttendanceId", concat(col("SchoolID"), lit("_"), col("IdaasId")))
                if df_attendancesummary is None:
                    df_attendancesummary = new_df
                else:
                    # Match columns by name so differing column order cannot misalign values.
                    df_attendancesummary = df_attendancesummary.unionByName(new_df)
            except Exception as e:
                # Best effort per school, but never silently: report what was skipped and why.
                print(f'Skipping AttendanceSummary for folder {folder}: {e}')

        if df_attendancesummary is None:
            raise ValueError(f'No AttendanceSummary data could be read from {xporter_root}')

        print(df_attendancesummary)
        df_attendancesummary.show()
        csvString = df_attendancesummary.toPandas().to_csv(index=False)
        oea.land(csvString, 'xporter/AttendanceSummary', 'attendancesummary.csv', oea.DELTA_BATCH_DATA)
        df = oea.load_csv('stage1/Transactional/xporter/AttendanceSummary')
        display(df)

    def ingest_attendancesummary(self):
        """ Ingests xporter/AttendanceSummary keyed on UniqueAttendanceId, then refines it with the 'attendancesummary' schema """
        entity = 'xporter/AttendanceSummary'
        key_column = 'UniqueAttendanceId'
        oea.ingest(entity, key_column)
        oea.refine(entity, self.schemas['attendancesummary'], key_column)
    
    def ingest_attendancesummary_stage3(self, ingestDatabaseName, refineDatabaseName):
        """
        Copies the attendance summary tables into stage3/xporter as parquet.

        Reads <ingestDatabaseName>.attendancesummary and
        <refineDatabaseName>.attendancesummary_lookup, displays both, and overwrites
        stage3/xporter/attendancesummary_ingested and stage3/xporter/attendancesummary_refined,
        de-duplicated on UniqueAttendanceId.
        """
        df_ingested = spark.sql(f"select * from {ingestDatabaseName}.attendancesummary")
        display(df_ingested)
        df_ingested.printSchema()
        df_refined = spark.sql(f"select * from {refineDatabaseName}.attendancesummary_lookup")
        display(df_refined)
        # writing files in parquet format; use this instance rather than a global named `xporter`
        self.overwrite_to_path(df_ingested, 'stage3/xporter/attendancesummary_ingested', save_format = "parquet", primary_key='UniqueAttendanceId')
        self.overwrite_to_path(df_refined, 'stage3/xporter/attendancesummary_refined', save_format = "parquet", primary_key='UniqueAttendanceId')

    
    def _prepare_groups(self):
        """
        Unions the group JSON of every school folder and lands it as one CSV in stage1.

        Every numeric folder under <oea path>/Transactional/xporter is read (files matching
        'groups*'), the 'Group' array is exploded into rows, SchoolID is set to the folder
        name and UniqueGroupId to "<SchoolID>_<IdaasId>". A folder that cannot be processed
        is reported and skipped.

        Raises:
            ValueError: if no folder produced any group rows.
        """
        # Imported locally so the method does not depend on a notebook-level `F` alias.
        from pyspark.sql.functions import lit, concat, col, explode
        xporter_root = self.get_oea_path() + '/Transactional/xporter'
        df_groups = None
        for folder in oea.get_folders(xporter_root):
            print(folder)
            if not folder.isnumeric():
                continue
            xporterPath = xporter_root + '/' + folder + '/groups*'
            try:
                new_df = self.json_from_xporter(xporterPath, multiline = True)
                new_df = new_df.select(explode('Group').alias('exploded_values')).select("exploded_values.*")
                new_df = new_df.withColumn('SchoolID', lit(folder))
                new_df = new_df.withColumn("UniqueGroupId", concat(col("SchoolID"), lit("_"), col("IdaasId")))
                if df_groups is None:
                    df_groups = new_df
                else:
                    # Match columns by name so differing column order cannot misalign values.
                    df_groups = df_groups.unionByName(new_df)
            except Exception as e:
                # Best effort per school, but never silently: report what was skipped and why.
                print(f'Skipping Groups for folder {folder}: {e}')

        if df_groups is None:
            raise ValueError(f'No Groups data could be read from {xporter_root}')

        csvString = df_groups.toPandas().to_csv(index=False)
        oea.land(csvString, 'xporter/Groups', 'Groups.csv', oea.DELTA_BATCH_DATA)
        df = oea.load_csv('stage1/Transactional/xporter/Groups')
        display(df)
        


    def ingest_groups(self):
        """ Ingests xporter/Groups keyed on UniqueGroupId, then refines it with the 'groups' schema """
        entity = 'xporter/Groups'
        key_column = 'UniqueGroupId'
        oea.ingest(entity, key_column)
        oea.refine(entity, self.schemas['groups'], key_column)
    
    def ingest_groups_stage3(self, ingestDatabaseName, refineDatabaseName):
        """
        Copies the group tables into stage3/xporter as parquet.

        Reads <ingestDatabaseName>.groups and <refineDatabaseName>.groups_lookup, displays
        both, and overwrites stage3/xporter/groups_ingested and stage3/xporter/groups_refined,
        de-duplicated on UniqueGroupId.
        """
        df_ingested = spark.sql(f"select * from {ingestDatabaseName}.groups")
        display(df_ingested)
        df_ingested.printSchema()
        df_refined = spark.sql(f"select * from {refineDatabaseName}.groups_lookup")
        display(df_refined)
        # writing files in parquet format; use this instance rather than a global named `xporter`
        self.overwrite_to_path(df_ingested, 'stage3/xporter/groups_ingested', save_format = "parquet", primary_key='UniqueGroupId')
        self.overwrite_to_path(df_refined, 'stage3/xporter/groups_refined', save_format = "parquet", primary_key='UniqueGroupId')

    
    def _prepare_HistoricalAttendanceSummary(self):
        """
        Unions the HistoricalAttendanceSummary JSON of every school folder and lands it as one CSV in stage1.

        Every numeric folder under <oea path>/Transactional/xporter is read, its
        'HistoricalAttendanceSummary' array is exploded into rows, AddressBlock is replaced by
        the literal "undefined", SchoolID is set to the folder name and
        UniqueHistoricalAttendanceId to "<SchoolID>_<Id>". A folder that cannot be processed
        is reported and skipped.

        Raises:
            ValueError: if no folder produced any HistoricalAttendanceSummary rows.
        """
        # Imported locally so the method does not depend on a notebook-level `F` alias.
        from pyspark.sql.functions import lit, concat, col, explode
        xporter_root = self.get_oea_path() + '/Transactional/xporter'
        df_hist_attendance = None
        for folder in oea.get_folders(xporter_root):
            # Only numeric (EstabId) folders hold school data; previously the read ran for
            # every folder because the try block sat outside this check.
            if not folder.isnumeric():
                continue
            print(folder)
            xporterPath = xporter_root + '/' + folder + '/HistoricalAttendanceSummary*'
            try:
                new_df = self.json_from_xporter(xporterPath, multiline = True)
                new_df = new_df.select(explode('HistoricalAttendanceSummary').alias('exploded_values')).select("exploded_values.*")
                new_df = new_df.drop('AddressBlock').withColumn("AddressBlock", lit("undefined"))
                new_df = new_df.withColumn('SchoolID', lit(folder))
                new_df = new_df.withColumn("UniqueHistoricalAttendanceId", concat(col("SchoolID"), lit("_"), col("Id")))
                if df_hist_attendance is None:
                    df_hist_attendance = new_df
                else:
                    # Match columns by name so differing column order cannot misalign values.
                    df_hist_attendance = df_hist_attendance.unionByName(new_df)
            except Exception as e:
                # Best effort per school, but never silently: report what was skipped and why.
                print(f'Skipping HistoricalAttendanceSummary for folder {folder}: {e}')

        if df_hist_attendance is None:
            raise ValueError(f'No HistoricalAttendanceSummary data could be read from {xporter_root}')

        csvString = df_hist_attendance.toPandas().to_csv(index=False)
        oea.land(csvString, 'xporter/HistoricalAttendanceSummary', 'HistoricalAttendanceSummary.csv', oea.DELTA_BATCH_DATA)
        # Show the entity that was just landed (the original displayed xporter/Students here).
        df = oea.load_csv('stage1/Transactional/xporter/HistoricalAttendanceSummary')
        display(df)



    def ingest_HistoricalAttendanceSummary(self):
        """ Ingests xporter/HistoricalAttendanceSummary keyed on UniqueHistoricalAttendanceId, then refines it with the 'HistoricalAttendanceSummary' schema """
        entity = 'xporter/HistoricalAttendanceSummary'
        key_column = 'UniqueHistoricalAttendanceId'
        oea.ingest(entity, key_column)
        oea.refine(entity, self.schemas['HistoricalAttendanceSummary'], key_column)
    
    
    def ingest_HistoricalAttendanceSummary_stage3(self, ingestDatabaseName, refineDatabaseName):
        """
        Copies the historical attendance summary tables into stage3/xporter as parquet.

        Reads <ingestDatabaseName>.historicalattendancesummary and
        <refineDatabaseName>.historicalattendancesummary_lookup, displays both, and overwrites
        stage3/xporter/historicalattendancesummary_ingested and
        stage3/xporter/historicalattendancesummary_refined, de-duplicated on
        UniqueHistoricalAttendanceId.
        """
        df_ingested = spark.sql(f"select * from {ingestDatabaseName}.historicalattendancesummary")
        display(df_ingested)
        df_ingested.printSchema()
        df_refined = spark.sql(f"select * from {refineDatabaseName}.historicalattendancesummary_lookup")
        display(df_refined)
        # writing files in parquet format; use this instance rather than a global named `xporter`
        self.overwrite_to_path(df_ingested, 'stage3/xporter/historicalattendancesummary_ingested', save_format = "parquet", primary_key='UniqueHistoricalAttendanceId')
        self.overwrite_to_path(df_refined, 'stage3/xporter/historicalattendancesummary_refined', save_format = "parquet", primary_key='UniqueHistoricalAttendanceId')
   

    def _prepare_staff(self):
        """
        Unions the staff JSON of every school folder and lands it as one CSV in stage1.

        Every numeric folder under <oea path>/Transactional/xporter is read (files matching
        'staff*'), the 'staff' array is exploded into rows, SchoolID is set to the folder
        name, AddressBlock is replaced by the literal "undefined" and UniqueStaffId is set to
        "<SchoolID>_<IdaasId>". A folder that cannot be processed is reported and skipped.

        Raises:
            ValueError: if no folder produced any staff rows.
        """
        # Imported locally so the method does not depend on a notebook-level `F` alias.
        from pyspark.sql.functions import lit, concat, col, explode
        xporter_root = self.get_oea_path() + '/Transactional/xporter'
        df_staff = None
        for folder in oea.get_folders(xporter_root):
            print(folder)
            if not folder.isnumeric():
                continue
            xporterPath = xporter_root + '/' + folder + '/staff*'
            try:
                new_df = self.json_from_xporter(xporterPath, multiline = True)
                new_df = new_df.select(explode('staff').alias('exploded_values')).select("exploded_values.*")
                new_df = new_df.withColumn('SchoolID', lit(folder))
                new_df = new_df.drop('AddressBlock').withColumn("AddressBlock", lit("undefined"))
                new_df = new_df.withColumn("UniqueStaffId", concat(col("SchoolID"), lit("_"), col("IdaasId")))
                if df_staff is None:
                    df_staff = new_df
                else:
                    # Match columns by name so differing column order cannot misalign values.
                    df_staff = df_staff.unionByName(new_df)
            except Exception as e:
                # Best effort per school, but never silently: report what was skipped and why.
                print(f'Skipping Staff for folder {folder}: {e}')

        if df_staff is None:
            raise ValueError(f'No Staff data could be read from {xporter_root}')

        csvString = df_staff.toPandas().to_csv(index=False)
        oea.land(csvString, 'xporter/Staff', 'Staff.csv', oea.DELTA_BATCH_DATA)
        df = oea.load_csv('stage1/Transactional/xporter/Staff')
        display(df)

    
    def ingest_staff(self):
        """Run ``oea.ingest`` and then ``oea.refine`` for the landed Staff data.

        Both calls use ``UniqueStaffId`` as the key; the refine call is given the
        ``staff`` entry of ``self.schemas``.
        """
        entity_path = 'xporter/Staff'
        key_column = 'UniqueStaffId'
        oea.ingest(entity_path, key_column)
        oea.refine(entity_path, self.schemas['staff'], key_column)
    
    
    def ingest_staff_stage3(self, ingestDatabaseName, refineDatabaseName):
        """Processes the staff tables from stage2 into stage3.

        Reads the ``staff`` table from the ingested database and the ``staff_lookup``
        table from the refined database, displays both, and writes each one under
        ``stage3/xporter`` in parquet format.

        Args:
            ingestDatabaseName: Database that holds the ``staff`` table.
            refineDatabaseName: Database that holds the ``staff_lookup`` table.

        Note:
            Both database names are interpolated directly into the SQL text, so only
            trusted identifiers should be passed in.
        """
        primary_key = 'UniqueStaffId'

        df_ingested = spark.sql(f"select * from {ingestDatabaseName}.staff")
        display(df_ingested)
        df_ingested.printSchema()

        df_refined = spark.sql(f"select * from {refineDatabaseName}.staff_lookup")
        display(df_refined)

        # Write through this instance rather than a module-level global so the
        # method works no matter what name the instance is bound to.
        self.overwrite_to_path(df_ingested, 'stage3/xporter/staff_ingested', save_format="parquet", primary_key=primary_key)
        self.overwrite_to_path(df_refined, 'stage3/xporter/staff_refined', save_format="parquet", primary_key=primary_key)
    
    def _prepare_StudentMembers(self):
        """Combine every school's group student-membership data and land it in stage1 as one CSV.

        For each numerically named folder under ``<oea path>/Transactional/xporter``
        the ``groups*`` JSON files are loaded, the ``StudentMembers`` array is
        exploded into one row per entry, and these columns are added:

        * ``SchoolID``              - the folder name.
        * ``UniqueStudentId``       - ``SchoolID`` + ``"_"`` + ``StudentIdaasId``.
        * ``UniqueGroupId``         - ``SchoolID`` + ``"_"`` + ``GroupIdaasId``.
        * ``UniqueStudentMemberId`` - ``SchoolID`` + ``"_"`` + ``Id``.

        The per-school frames are unioned, converted to CSV through pandas, landed
        as ``xporter/StudentMembers/StudentMembers.csv`` and the landed data is
        displayed.

        Raises:
            RuntimeError: If no folder produced any student-member data.
        """
        # ``explode`` is imported here explicitly so the method does not depend on a
        # module-level ``F`` alias being defined elsewhere.
        from pyspark.sql.functions import col, concat, explode, lit

        base_path = self.get_oea_path() + '/Transactional/xporter'
        df_studentmembers = None
        for folder in oea.get_folders(base_path):
            print(folder)
            # Only numerically named folders are processed; the name becomes SchoolID.
            if not folder.isnumeric():
                continue
            try:
                xporterPath = base_path + '/' + folder + '/groups*'
                new_df = self.json_from_xporter(xporterPath, multiline = True)
                new_df = new_df.select(explode('StudentMembers').alias('exploded_values')).select("exploded_values.*")
                new_df = new_df.withColumn('SchoolID', lit(folder))
                new_df = new_df.withColumn("UniqueStudentId", concat(col("SchoolID"), lit("_"), col("StudentIdaasId")))
                new_df = new_df.withColumn("UniqueGroupId", concat(col("SchoolID"), lit("_"), col("GroupIdaasId")))
                new_df = new_df.withColumn("UniqueStudentMemberId", concat(col("SchoolID"), lit("_"), col("Id")))
                # NOTE(review): union() matches columns by position, so it assumes
                # every school yields the same column layout - confirm.
                df_studentmembers = new_df if df_studentmembers is None else df_studentmembers.union(new_df)
            except Exception as exc:
                # Still best-effort per school, but report the reason instead of
                # hiding it, and let KeyboardInterrupt/SystemExit propagate.
                print(f"Skipping student member data for folder {folder}: {exc}")

        if df_studentmembers is None:
            raise RuntimeError(f"No student member data could be loaded from {base_path}")

        csvString = df_studentmembers.toPandas().to_csv(index=False)
        oea.land(csvString, 'xporter/StudentMembers', 'StudentMembers.csv', oea.DELTA_BATCH_DATA)
        df = oea.load_csv('stage1/Transactional/xporter/StudentMembers')
        display(df)

    
    def ingest_StudentMembers(self):
        """Run ``oea.ingest`` and then ``oea.refine`` for the landed StudentMembers data.

        Both calls use ``UniqueStudentMemberId`` as the key; the refine call is given
        the ``StudentMembers`` entry of ``self.schemas``.
        """
        entity_path = 'xporter/StudentMembers'
        key_column = 'UniqueStudentMemberId'
        oea.ingest(entity_path, key_column)
        oea.refine(entity_path, self.schemas['StudentMembers'], key_column)

    
    def ingest_StudentMembers_stage3(self, ingestDatabaseName, refineDatabaseName):
        """Processes delta batch data from stage2 into stage3.

        Reads the ``studentmembers`` table from the ingested database and the
        ``studentmembers_lookup`` table from the refined database, displays both,
        and writes each one under ``stage3/xporter`` in parquet format.

        Args:
            ingestDatabaseName: Database that holds the ``studentmembers`` table.
            refineDatabaseName: Database that holds the ``studentmembers_lookup`` table.

        Note:
            Both database names are interpolated directly into the SQL text, so only
            trusted identifiers should be passed in.
        """
        primary_key = 'UniqueStudentMemberId'

        df_ingested = spark.sql(f"select * from {ingestDatabaseName}.studentmembers")
        display(df_ingested)
        df_ingested.printSchema()

        df_refined = spark.sql(f"select * from {refineDatabaseName}.studentmembers_lookup")
        display(df_refined)

        # Write through this instance rather than a module-level global so the
        # method works no matter what name the instance is bound to.
        self.overwrite_to_path(df_ingested, 'stage3/xporter/studentmembers_ingested', save_format="parquet", primary_key=primary_key)
        self.overwrite_to_path(df_refined, 'stage3/xporter/studentmembers_refined', save_format="parquet", primary_key=primary_key)
        
# Module-level instance built with the constructor defaults
# (source_folder='xporter', pseudonymize=True). Constructing it runs
# Xporter.__init__, which selects the 'prod' OEA workspace.
xporter = Xporter()