# In [None]:
class ContosoSIS(BaseOEAModule):
    """OEA module for the Contoso SIS data set.

    Registers the column schemas for the 'studentattendance' and 'studentsectionmark'
    entities and provides helpers to stage the bundled test data and to process it
    from stage1.

    Each schema row is [column_name, data_type, operation]; the third element is
    presumably the pseudonymization operation applied by the base module
    (e.g. 'no-op', 'hash-no-lookup') - confirm against BaseOEAModule.
    """

    def __init__(self, oea, source_folder='contoso_sis', pseudonymize=True):
        BaseOEAModule.__init__(self, oea, source_folder, pseudonymize)

        attendance_columns = [
            ['id', 'string', 'no-op'],
            ['student_id', 'string', 'hash-no-lookup'],
            ['school_year', 'integer', 'no-op'],
            ['school_id', 'string', 'no-op'],
            ['attendance_date', 'timestamp', 'no-op'],
            ['all_day', 'string', 'no-op'],
            ['Period', 'short', 'no-op'],
            ['section_id', 'string', 'no-op'],
            ['AttendanceCode', 'string', 'no-op'],
            ['PresenceFlag', 'boolean', 'no-op'],
            ['attendance_status', 'string', 'no-op'],
            ['attendance_type', 'string', 'no-op'],
            ['attendance_sequence', 'short', 'no-op'],
        ]
        self.schemas['studentattendance'] = attendance_columns

        section_mark_columns = [
            ['id', 'string', 'no-op'],
            ['student_id', 'string', 'hash-no-lookup'],
            ['section_id', 'string', 'no-op'],
            ['school_year', 'string', 'no-op'],
            ['term_id', 'string', 'no-op'],
            ['numeric_grade_earned', 'short', 'no-op'],
            ['alpha_grade_earned', 'string', 'no-op'],
            ['is_final_grade', 'string', 'no-op'],
            ['credits_attempted', 'short', 'no-op'],
            ['credits_earned', 'short', 'no-op'],
            ['grad_credit_type', 'string', 'no-op'],
        ]
        self.schemas['studentsectionmark'] = section_mark_columns

    def process_data_from_stage1(self):
        """Process both entities from stage1 as csv in 'overwrite' mode."""
        for entity_name in ('studentattendance', 'studentsectionmark'):
            self._process_entity_from_stage1(entity_name, 'csv', 'overwrite', 'true')

    def copy_test_data_to_stage1(self):
        """Copy the module's test csv files into the per-entity stage1 folders."""
        for entity_name in ('studentattendance', 'studentsectionmark'):
            csv_name = entity_name + '.csv'
            source_file = self.module_path + '/test_data/' + csv_name
            target_file = self.stage1np + '/' + entity_name + '/' + csv_name
            mssparkutils.fs.cp(source_file, target_file, True)

class M365(BaseOEAModule):
    """
    Provides data processing methods for MS Insights data v0.2 format.

    Roster csv files are read from <stage1np>/DIPData/Roster and activity csv files from
    <stage1np>/DIPData/Activity/ApplicationUsage; results are written as parquet under
    self.stage2np.

    Each schema row is [column_name, data_type, operation]. The rows are converted to a
    spark schema via oea.to_spark_schema; the third element is presumably the
    pseudonymization operation ('no-op', 'hash', 'hash-no-lookup', 'mask') - confirm
    against BaseOEAModule, since the methods in this class do not apply it themselves.
    """

    def __init__(self, oea, source_folder='m365'):
        BaseOEAModule.__init__(self, oea, source_folder)

        self.stage1np_activity = self.stage1np + '/DIPData/Activity/ApplicationUsage'
        self.stage1np_roster = self.stage1np + '/DIPData/Roster'

        # NOTE(review): 'AssginmentDueDate' looks like a typo of 'AssignmentDueDate', but it is the
        # column name written to stage2, so renaming it would change the output schema.
        self.schemas['Activity0p2'] = [
            ['SignalType', 'string', 'no-op'],
            ['StartTime', 'timestamp', 'no-op'],
            ['UserAgent', 'string', 'no-op'],
            ['SignalId', 'string', 'no-op'],
            ['SISClassId', 'string', 'no-op'],
            ['OfficeClassId', 'string', 'no-op'],
            ['ChannelId', 'string', 'no-op'],
            ['AppName', 'string', 'no-op'],
            ['ActorId', 'string', 'hash-no-lookup'],
            ['ActorRole', 'string', 'no-op'],
            ['SchemaVersion', 'string', 'no-op'],
            ['AssignmentId', 'string', 'no-op'],
            ['SubmissionId', 'string', 'no-op'],
            ['Action', 'string', 'no-op'],
            ['AssginmentDueDate', 'string', 'no-op'],
            ['ClassCreationDate', 'string', 'no-op'],
            ['Grade', 'string', 'no-op'],
            ['SourceFileExtension', 'string', 'no-op'],
            ['MeetingDuration', 'string', 'no-op'],
        ]
        self.schemas['Calendar'] = [
            ['Id', 'string', 'no-op'],
            ['Name', 'string', 'no-op'],
            ['Description', 'string', 'no-op'],
            ['SchoolYear', 'integer', 'no-op'],
            ['IsCurrent', 'boolean', 'no-op'],
            ['ExternalId', 'string', 'no-op'],
            ['CreateDate', 'timestamp', 'no-op'],
            ['LastModifiedDate', 'timestamp', 'no-op'],
            ['IsActive', 'boolean', 'no-op'],
            ['OrgId', 'string', 'no-op'],
        ]
        self.schemas['Course'] = [
            ['Id', 'string', 'no-op'],
            ['Name', 'string', 'no-op'],
            ['Code', 'string', 'no-op'],
            ['Description', 'string', 'no-op'],
            ['ExternalId', 'string', 'no-op'],
            ['CreateDate', 'timestamp', 'no-op'],
            ['LastModifiedDate', 'timestamp', 'no-op'],
            ['IsActive', 'boolean', 'no-op'],
            ['CalendarId', 'string', 'no-op'],
        ]
        self.schemas['Org'] = [
            ['Id', 'string', 'no-op'],
            ['Name', 'string', 'no-op'],
            ['Identifier', 'string', 'no-op'],
            ['ExternalId', 'string', 'no-op'],
            ['CreateDate', 'timestamp', 'no-op'],
            ['LastModifiedDate', 'timestamp', 'no-op'],
            ['IsActive', 'boolean', 'no-op'],
            ['ParentOrgId', 'string', 'no-op'],
            ['RefOrgTypeId', 'string', 'no-op'],
            ['SourceSystemId', 'string', 'no-op'],
        ]
        self.schemas['Person'] = [
            ['Id', 'string', 'hash'],
            ['FirstName', 'string', 'mask'],
            ['MiddleName', 'string', 'mask'],
            ['LastName', 'string', 'mask'],
            ['GenerationCode', 'string', 'no-op'],
            ['Prefix', 'string', 'no-op'],
            ['EnabledUser', 'string', 'no-op'],
            ['ExternalId', 'string', 'hash'],
            ['CreateDate', 'timestamp', 'no-op'],
            ['LastModifiedDate', 'timestamp', 'no-op'],
            ['IsActive', 'boolean', 'no-op'],
            ['SourceSystemId', 'string', 'no-op'],
        ]
        self.schemas['PersonIdentifier'] = [
            ['Id', 'string', 'hash'],
            ['Identifier', 'string', 'hash'],
            ['Description', 'string', 'no-op'],
            ['RefIdentifierTypeId', 'string', 'no-op'],
            ['ExternalId', 'string', 'hash'],
            ['CreateDate', 'timestamp', 'no-op'],
            ['LastModifiedDate', 'timestamp', 'no-op'],
            ['IsActive', 'boolean', 'no-op'],
            ['PersonId', 'string', 'hash'],
            ['SourceSystemId', 'string', 'no-op'],
        ]
        self.schemas['RefDefinition'] = [
            ['Id', 'string', 'no-op'],
            ['RefType', 'string', 'no-op'],
            ['Namespace', 'string', 'no-op'],
            ['Code', 'string', 'no-op'],
            ['SortOrder', 'integer', 'no-op'],
            ['Description', 'string', 'no-op'],
            ['IsActive', 'boolean', 'no-op'],
        ]
        self.schemas['Section'] = [
            ['Id', 'string', 'no-op'],
            ['Name', 'string', 'no-op'],
            ['Code', 'string', 'no-op'],
            ['Location', 'string', 'no-op'],
            ['ExternalId', 'string', 'no-op'],
            ['CreateDate', 'timestamp', 'no-op'],
            ['LastModifiedDate', 'timestamp', 'no-op'],
            ['IsActive', 'boolean', 'no-op'],
            ['CourseId', 'string', 'no-op'],
            ['RefSectionTypeId', 'string', 'no-op'],
            ['SessionId', 'string', 'no-op'],
            ['OrgId', 'string', 'no-op'],
        ]
        self.schemas['Session'] = [
            ['Id', 'string', 'no-op'],
            ['Name', 'string', 'no-op'],
            ['BeginDate', 'timestamp', 'no-op'],
            ['EndDate', 'timestamp', 'no-op'],
            ['ExternalId', 'string', 'no-op'],
            ['CreateDate', 'timestamp', 'no-op'],
            ['LastModifiedDate', 'timestamp', 'no-op'],
            ['IsActive', 'boolean', 'no-op'],
            ['CalendarId', 'string', 'no-op'],
            ['ParentSessionId', 'string', 'no-op'],
            ['RefSessionTypeId', 'string', 'no-op'],
        ]
        self.schemas['StaffOrgAffiliation'] = [
            ['Id', 'string', 'no-op'],
            ['IsPrimary', 'boolean', 'no-op'],
            ['EntryDate', 'timestamp', 'no-op'],
            ['ExitDate', 'timestamp', 'no-op'],
            ['ExternalId', 'string', 'no-op'],
            ['CreateDate', 'timestamp', 'no-op'],
            ['LastModifiedDate', 'timestamp', 'no-op'],
            ['IsActive', 'boolean', 'no-op'],
            ['OrgId', 'string', 'no-op'],
            ['PersonId', 'string', 'hash'],
            ['RefStaffOrgRoleId', 'string', 'no-op'],
        ]
        self.schemas['StaffSectionMembership'] = [
            ['Id', 'string', 'no-op'],
            ['IsPrimaryStaffForSection', 'boolean', 'no-op'],
            ['EntryDate', 'timestamp', 'no-op'],
            ['ExitDate', 'timestamp', 'no-op'],
            ['ExternalId', 'string', 'no-op'],
            ['CreateDate', 'timestamp', 'no-op'],
            ['LastModifiedDate', 'timestamp', 'no-op'],
            ['IsActive', 'boolean', 'no-op'],
            ['PersonId', 'string', 'hash'],
            ['RefStaffSectionRoleId', 'string', 'no-op'],
            ['SectionId', 'string', 'no-op'],
        ]
        self.schemas['StudentOrgAffiliation'] = [
            ['Id', 'string', 'no-op'],
            ['IsPrimary', 'boolean', 'no-op'],
            ['EntryDate', 'timestamp', 'no-op'],
            ['ExitDate', 'timestamp', 'no-op'],
            ['ExternalId', 'string', 'no-op'],
            ['CreateDate', 'timestamp', 'no-op'],
            ['LastModifiedDate', 'timestamp', 'no-op'],
            ['IsActive', 'boolean', 'no-op'],
            ['OrgId', 'string', 'no-op'],
            ['PersonId', 'string', 'hash'],
            ['RefGradeLevelId', 'string', 'no-op'],
            ['RefStudentOrgRoleId', 'string', 'no-op'],
            ['RefEnrollmentStatusId', 'string', 'no-op'],
        ]
        self.schemas['StudentSectionMembership'] = [
            ['Id', 'string', 'no-op'],
            ['EntryDate', 'timestamp', 'no-op'],
            ['ExitDate', 'timestamp', 'no-op'],
            ['ExternalId', 'string', 'no-op'],
            ['CreateDate', 'timestamp', 'no-op'],
            ['LastModifiedDate', 'timestamp', 'no-op'],
            ['IsActive', 'boolean', 'no-op'],
            ['PersonId', 'string', 'hash'],
            ['RefGradeLevelWhenCourseTakenId', 'string', 'no-op'],
            ['RefStudentSectionRoleId', 'string', 'no-op'],
            ['SectionId', 'string', 'no-op'],
        ]

    def process_activity_data_from_stage1(self):
        """ Processes activity data from stage1 into stage2 as a single batch job.

        Reads every headerless csv file under self.stage1np_activity using the 'Activity0p2' schema,
        joins it to the PersonIdentifier and RefDefinition parquet tables in stage2 to attach the
        PersonId whose 'ActiveDirectoryId' identifier equals the ActorId, drops duplicate SignalId rows,
        adds 'year' and 'month' columns derived from StartTime, and overwrites <stage2np>/TechActivity.

        The PersonIdentifier and RefDefinition tables must already exist in stage2 (they are produced by
        process_roster_data_from_stage1).
        """
        logger.info("Processing ms_insights activity data from: " + self.stage1np_activity)

        spark_schema = self.oea.to_spark_schema(self.schemas['Activity0p2'])
        activity_df = spark.read.csv(self.stage1np_activity + '/*.csv', header='false', schema=spark_schema)

        # createOrReplaceTempView is the non-deprecated equivalent of SQLContext.registerDataFrameAsTable.
        activity_df.createOrReplaceTempView('Activity')
        # NOTE(review): the roster tables are looked up under a hard-coded '/m365' folder, while the roster
        # is written to self.stage2np; these presumably only match when source_folder is 'm365' - confirm.
        spark.read.format('parquet').load(self.oea.stage2np + '/m365/PersonIdentifier').createOrReplaceTempView('PersonIdentifier')
        spark.read.format('parquet').load(self.oea.stage2np + '/m365/RefDefinition').createOrReplaceTempView('RefDefinition')

        # Inner join: activity rows whose ActorId has no matching ActiveDirectoryId identifier are dropped.
        df = spark.sql( 
            "select act.SignalType, act.StartTime, act.UserAgent, act.SignalId, act.SISClassId, act.OfficeClassId, act.ChannelId, \
            act.AppName, act.ActorId, act.ActorRole, act.SchemaVersion, act.AssignmentId, act.SubmissionId, act.Action, act.AssginmentDueDate, \
            act.ClassCreationDate, act.Grade, act.SourceFileExtension, act.MeetingDuration, pi.PersonId \
            from PersonIdentifier pi, RefDefinition rd, Activity act \
            where \
                pi.RefIdentifierTypeId = rd.Id \
                and rd.RefType = 'RefIdentifierType' \
                and rd.Code = 'ActiveDirectoryId' \
                and pi.Identifier = act.ActorId")

        df = df.dropDuplicates(['SignalId'])
        df = df.withColumn('year', F.year(F.col('StartTime'))).withColumn('month', F.month(F.col('StartTime')))
        df = self.oea.fix_column_names(df)
        df.write.format('parquet').mode('overwrite').option("mergeSchema", "true").save(self.stage2np + '/TechActivity')

    def reset_activity_processing(self):
        """ Resets all TechActivity processing. This is intended for use during initial testing - use with caution. """
        self.oea.rm_if_exists(self.stage2p + '/TechActivity')
        self.oea.rm_if_exists(self.stage2np + '/TechActivity')
        logger.info("Deleted TechActivity from stage2")

    def _process_roster_entity(self, path):
        """ Converts a single roster csv file from stage1 into a parquet table in stage2np.

        The entity name is the file name minus its last 4 characters (the '.csv' extension), and it selects
        both the schema used to read the file and the output folder under self.stage2np.

        Files without a registered schema are skipped with a warning, and spark AnalysisExceptions are
        logged, so that one bad file does not abort processing of the remaining roster files.
        """
        try:
            _, filename = self.oea.pop_from_path(path)
            entity = filename[:-4]
            logger.debug(f"Processing roster entity: path={path}, entity={entity}")
            if entity not in self.schemas:
                # Previously this surfaced as an uncaught KeyError that stopped the whole roster run.
                logger.warning(f"Skipping roster file with no registered schema: path={path}, entity={entity}")
                return
            spark_schema = self.oea.to_spark_schema(self.schemas[entity])
            df = spark.read.csv(path, header='false', schema=spark_schema)
            df = self.oea.fix_column_names(df)
            df.write.format('parquet').mode('overwrite').option("mergeSchema", "true").save(self.stage2np + '/' + entity)

        except (AnalysisException) as error:
            logger.exception(str(error))

    def process_roster_data_from_stage1(self):
        """ Processes every roster file found directly under self.stage1np_roster and writes each one to stage2np.

        Sub-directories are ignored; only entries reported as files are processed.
        """
        logger.info("Processing ms_insights roster data from: " + self.stage1np_roster)

        for item in mssparkutils.fs.ls(self.stage1np_roster):
            if item.isFile:
                self._process_roster_entity(item.path)

    def reset_roster_processing(self):
        """ Resets all stage1 to stage2 processing of roster data.

        Deletes everything under self.stage2np and self.stage2p except 'TechActivity', which belongs to
        activity processing (see reset_activity_processing). Paths that do not exist are skipped.
        """
        for stage_path in (self.stage2np, self.stage2p):
            if not self.oea.path_exists(stage_path):
                continue
            # Delete roster tables (everything other than TechActivity)
            for item in mssparkutils.fs.ls(stage_path):
                if item.name != 'TechActivity':
                    mssparkutils.fs.rm(item.path, True)
  

