In [None]:
class MSInsights(BaseOEAModule):
    """
    Provides data processing methods for MS Insights data.
    Data is expected to be received via ADS into stage1np/ms_insights
    The structure of the folders in stage1np will then be something like:
        -> stage1np/ms_insights/activity/2021-06-02
            -> stage1np/ms_insights/activity/2021-06-02/ApplicationUsage.csv
        -> stage1np/ms_insights/roster/2021-06-02T06-05-11/
            -> stage1np/ms_insights/roster/2021-06-02T06-05-11/AadUser
            -> stage1np/ms_insights/roster/2021-06-02T06-05-11/Person
            etc

    In stage2, everything is written to stage2np/ms_insights and stage2p/ms_insights:
        - pseudonymized tables are written under self.stage2p
        - lookup tables (named '<entity>_lookup') are written under self.stage2np

    Attributes set here:
        stage1np_activity: folder that activity csv files are read from.
        stage1np_roster: folder holding one sub-folder per roster delivery.
        schemas: one entry per entity; each entry is a list of
            [column name, data type, pseudonymization operation] triples that is
            passed to oea.to_spark_schema and oea.pseudonymize.

    NOTE(review): self.oea, self.stage1np, self.stage2np, self.stage2p and self.schemas
    are presumably initialized by BaseOEAModule.__init__ - confirm against that class.
    """

    def __init__(self, oea, source_folder='ms_insights'):
        """Initialize the module paths and register the schema of every known entity.

        Args:
            oea: the OEA helper object, forwarded to BaseOEAModule.
            source_folder: name of the source folder for this module, forwarded to BaseOEAModule.
        """
        super().__init__(oea, source_folder)

        self.stage1np_activity = self.stage1np + '/activity'
        self.stage1np_roster = self.stage1np + '/roster'

        # Activity data (processed by process_activity).
        self.schemas['TechActivity'] = [
            ['SignalType', 'string', 'no-op'],
            ['StartTime', 'timestamp', 'no-op'],
            ['UserAgent', 'string', 'no-op'],
            ['SignalId', 'string', 'no-op'],
            ['SisClassId', 'string', 'no-op'],
            ['ClassId', 'string', 'no-op'],
            ['ChannelId', 'string', 'no-op'],
            ['AppName', 'string', 'no-op'],
            ['ActorId', 'string', 'hash-no-lookup'],
            ['ActorRole', 'string', 'no-op'],
            ['SchemaVersion', 'string', 'no-op'],
            ['AssignmentId', 'string', 'no-op'],
            ['SubmissionId', 'string', 'no-op'],
            ['Action', 'string', 'no-op'],
            ['DueDate', 'timestamp', 'no-op'],
            ['ClassCreationDate', 'timestamp', 'no-op'],
            ['Grade', 'string', 'no-op'],
            ['SourceFileExtension', 'string', 'no-op'],
            ['MeetingDuration', 'integer', 'no-op'],
        ]

        # Roster data (processed by process_roster); the key is the entity folder name.
        self.schemas['AadGroup'] = [
            ['ObjectId', 'string', 'hash'],
            ['DisplayName', 'string', 'mask'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
            ['LastSeenDateTime', 'timestamp', 'no-op'],
            ['Mail', 'string', 'mask'],
            ['MailNickname', 'string', 'mask'],
            ['AnchorId', 'string', 'hash'],
            ['SectionId', 'string', 'no-op'],
        ]
        self.schemas['AadGroupMembership'] = [
            ['GroupObjectId', 'string', 'hash-no-lookup'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
            ['LastSeenDateTime', 'timestamp', 'no-op'],
            ['Role', 'string', 'no-op'],
            ['UserObjectId', 'string', 'hash-no-lookup'],
        ]
        self.schemas['AadUser'] = [
            ['ObjectId', 'string', 'hash'],
            ['AnchorId', 'string', 'hash'],
            ['DisplayName', 'string', 'mask'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
            ['GivenName', 'string', 'mask'],
            ['LastSeenDateTime', 'timestamp', 'no-op'],
            ['Mail', 'string', 'mask'],
            ['MailNickname', 'string', 'mask'],
            ['Role', 'string', 'no-op'],
            ['Surname', 'string', 'mask'],
            ['UserPrincipalName', 'string', 'hash'],
            ['StudentId', 'string', 'hash-no-lookup'],
            ['TeacherId', 'string', 'hash-no-lookup'],
        ]
        self.schemas['AadUserPersonMapping'] = [
            ['ObjectId', 'string', 'hash-no-lookup'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
            ['LastSeenDateTime', 'timestamp', 'no-op'],
            ['PersonId', 'string', 'hash-no-lookup'],
        ]
        self.schemas['Course'] = [
            ['Id', 'string', 'no-op'],
            ['AcademicYearSessionId', 'string', 'no-op'],
            ['ExternalId', 'string', 'no-op'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
            ['IsActiveInSession', 'boolean', 'no-op'],
            ['LastSeenDateTime', 'timestamp', 'no-op'],
            ['Name', 'string', 'no-op'],
            ['OrganizationId', 'string', 'no-op'],
            ['SourceSystemId', 'string', 'no-op'],
            ['Code', 'string', 'no-op'],
        ]
        self.schemas['CourseGradeLevel'] = [
            ['Id', 'string', 'no-op'],
            ['CourseId', 'string', 'no-op'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
            ['RefGradeLevelId', 'string', 'no-op'],
        ]
        self.schemas['CourseSubject'] = [
            ['Id', 'string', 'no-op'],
            ['CourseId', 'string', 'no-op'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
            ['RefAcademicSubjectId', 'string', 'no-op'],
        ]
        self.schemas['Enrollment'] = [
            ['Id', 'string', 'no-op'],
            ['ExternalId', 'string', 'no-op'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
            ['IsActiveInSession', 'boolean', 'no-op'],
            ['LastSeenDateTime', 'timestamp', 'no-op'],
            ['PersonId', 'string', 'hash-no-lookup'],
            ['RefSectionRoleId', 'string', 'no-op'],
            ['SectionId', 'string', 'no-op'],
            ['SourceSystemId', 'string', 'no-op'],
            ['EntryDate', 'string', 'no-op'],
            ['ExitDate', 'string', 'no-op'],
            ['IsPrimaryStaffForSection', 'boolean', 'no-op'],
        ]
        self.schemas['Organization'] = [
            ['Id', 'string', 'no-op'],
            ['ExternalId', 'string', 'no-op'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
            ['LastSeenDateTime', 'timestamp', 'no-op'],
            ['Name', 'string', 'no-op'],
            ['RefOrganizationTypeId', 'string', 'no-op'],
            ['SourceSystemId', 'string', 'no-op'],
            ['Identifier', 'string', 'no-op'],
            ['ParentOrganizationId', 'string', 'no-op'],
        ]
        self.schemas['Person'] = [
            ['Id', 'string', 'hash'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
            ['LastSeenDateTime', 'timestamp', 'no-op'],
            ['GivenName', 'string', 'mask'],
            ['MiddleName', 'string', 'mask'],
            ['PreferredGivenName', 'string', 'mask'],
            ['PreferredMiddleName', 'string', 'mask'],
            ['PreferredSurname', 'string', 'mask'],
            ['Surname', 'string', 'mask'],
        ]
        self.schemas['PersonDemographic'] = [
            ['PersonId', 'string', 'hash'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
            ['BirthCity', 'string', 'mask'],
            ['BirthCountryCode', 'string', 'mask'],
            ['BirthDate', 'string', 'mask'],
            ['BirthState', 'string', 'mask'],
            ['RefSexId', 'string', 'mask'],
        ]
        self.schemas['PersonDemographicEthnicity'] = [
            ['Id', 'string', 'no-op'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
            ['PersonId', 'string', 'hash-no-lookup'],
            ['RefEthnicityId', 'string', 'mask'],
        ]
        self.schemas['PersonDemographicPersonFlag'] = [
            ['Id', 'string', 'no-op'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
            ['PersonId', 'string', 'hash-no-lookup'],
            ['RefPersonFlagId', 'string', 'mask'],
        ]
        self.schemas['PersonDemographicRace'] = [
            ['Id', 'string', 'no-op'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
            ['PersonId', 'string', 'hash-no-lookup'],
            ['RefRaceId', 'string', 'mask'],
        ]
        self.schemas['PersonEmailAddress'] = [
            ['Id', 'string', 'no-op'],
            ['EmailAddress', 'string', 'mask'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
            ['PersonId', 'string', 'hash-no-lookup'],
            ['PriorityOrder', 'short', 'no-op'],
            ['RefEmailAddressTypeId', 'string', 'no-op'],
        ]
        self.schemas['PersonIdentifier'] = [
            ['Id', 'string', 'no-op'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
            ['Identifier', 'string', 'mask'],
            ['IsPresentInSource', 'boolean', 'no-op'],
            ['PersonId', 'string', 'hash-no-lookup'],
            ['RefIdentifierTypeId', 'string', 'no-op'],
            ['SourceSystemId', 'string', 'no-op'],
        ]
        self.schemas['PersonOrganizationRole'] = [
            ['Id', 'string', 'no-op'],
            ['ExternalId', 'string', 'no-op'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
            ['IsActiveInSession', 'boolean', 'no-op'],
            ['LastSeenDateTime', 'timestamp', 'no-op'],
            ['OrganizationId', 'string', 'no-op'],
            ['PersonId', 'string', 'hash-no-lookup'],
            ['RefRoleId', 'string', 'no-op'],
            ['SessionId', 'string', 'no-op'],
            ['SourceSystemId', 'string', 'no-op'],
            ['IsPrimary', 'boolean', 'no-op'],
            ['RefGradeLevelId', 'string', 'no-op'],
            ['RoleEndDate', 'string', 'mask'],
            ['RoleStartDate', 'string', 'mask'],
        ]
        self.schemas['PersonPhoneNumber'] = [
            ['Id', 'string', 'no-op'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
            ['PersonId', 'string', 'hash-no-lookup'],
            ['PhoneNumber', 'string', 'mask'],
            ['PriorityOrder', 'short', 'no-op'],
            ['RefPhoneNumberTypeId', 'string', 'no-op'],
        ]
        self.schemas['PersonRelationship'] = [
            ['Id', 'string', 'no-op'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
            ['PersonId', 'string', 'hash-no-lookup'],
            ['RefPersonRelationshipId', 'string', 'no-op'],
            ['RelatedPersonId', 'string', 'hash-no-lookup'],
        ]
        self.schemas['RefDefinition'] = [
            ['Id', 'string', 'no-op'],
            ['Code', 'string', 'no-op'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
            ['LastSeenDateTime', 'timestamp', 'no-op'],
            ['Namespace', 'string', 'no-op'],
            ['RefType', 'string', 'no-op'],
            ['SortOrder', 'short', 'no-op'],
        ]
        self.schemas['Section'] = [
            ['Id', 'string', 'no-op'],
            ['ExternalId', 'string', 'no-op'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
            ['LastSeenDateTime', 'timestamp', 'no-op'],
            ['Name', 'string', 'no-op'],
            ['OrganizationId', 'string', 'no-op'],
            ['SourceSystemId', 'string', 'no-op'],
            ['Code', 'string', 'no-op'],
            ['CourseId', 'string', 'no-op'],
            ['Location', 'string', 'no-op'],
        ]
        self.schemas['SectionGradeLevel'] = [
            ['Id', 'string', 'no-op'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
            ['RefGradeLevelId', 'string', 'no-op'],
            ['SectionId', 'string', 'no-op'],
        ]
        self.schemas['SectionSession'] = [
            ['Id', 'string', 'no-op'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
            ['IsActiveInSession', 'boolean', 'no-op'],
            ['LastSeenDateTime', 'timestamp', 'no-op'],
            ['SectionId', 'string', 'no-op'],
            ['SessionId', 'string', 'no-op'],
        ]
        self.schemas['SectionSubject'] = [
            ['Id', 'string', 'no-op'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
            ['RefAcademicSubjectId', 'string', 'no-op'],
            ['SectionId', 'string', 'no-op'],
        ]
        self.schemas['Session'] = [
            ['Id', 'string', 'no-op'],
            ['EndDate', 'string', 'no-op'],
            ['ExternalId', 'string', 'no-op'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
            ['LastSeenDateTime', 'timestamp', 'no-op'],
            ['Name', 'string', 'no-op'],
            ['RefAcademicYearId', 'string', 'no-op'],
            ['RefSessionTypeId', 'string', 'no-op'],
            ['SourceSystemId', 'string', 'no-op'],
            ['StartDate', 'string', 'no-op'],
            ['ParentSessionId', 'string', 'no-op'],
        ]
        self.schemas['SourceSystem'] = [
            ['Id', 'string', 'no-op'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
            ['LastSeenDateTime', 'timestamp', 'no-op'],
            ['Name', 'string', 'no-op'],
        ]

    def process_activity(self):
        """ Processes activity data from stage1 into stage2 using structured streaming.

        Reads every csv file matching <stage1np_activity>/*/*.csv as a stream, drops rows with a
        duplicate SignalId, adds 'year' and 'month' columns derived from StartTime, pseudonymizes
        the result and appends it to the delta table at <stage2p>/TechActivity (partitioned by
        year and month). Blocks until the streaming query has terminated.

        Raises:
            A StreamingQueryException is raised by awaitTermination if the query fails.
        """
        logger.info("Processing ms_insights activity data from: " + self.stage1np_activity)

        spark_schema = self.oea.to_spark_schema(self.schemas['TechActivity'])
        df = spark.readStream.csv(self.stage1np_activity + '/*/*.csv', header='false', schema=spark_schema)
        # NOTE(review): de-duplicating a stream without a watermark keeps state for every SignalId
        # seen so far (stored with the checkpoint) - confirm this growth is acceptable.
        df = df.dropDuplicates(['SignalId'])
        df = df.withColumn('year', F.year(F.col('StartTime'))).withColumn('month', F.month(F.col('StartTime')))

        # The lookup frame returned by pseudonymize is not written out for activity data.
        df_pseudo, _ = self.oea.pseudonymize(df, self.schemas['TechActivity'])

        # trigger(once=True) processes the data available now and then stops; the checkpoint
        # records which files have already been ingested so later runs only pick up new ones.
        writer = (
            df_pseudo.writeStream.format("delta")
            .outputMode("append")
            .trigger(once=True)
            .option("checkpointLocation", self.stage1np_activity + '/_checkpoints')
            .partitionBy('year', 'month')
        )
        query = writer.start(self.stage2p + '/TechActivity')
        query.awaitTermination()   # block until query is terminated, with stop() or with error; A StreamingQueryException will be thrown if an exception occurs.
        logger.info(query.lastProgress)

    def reset_activity_processing(self):
        """ Resets all TechActivity processing. This is intended for use during initial testing - use with caution.
            - deletes the delta table at <stage2p>/TechActivity
            - deletes the _checkpoints dir at <stage1np_activity>/_checkpoints
        """
        table_path = self.stage2p + '/TechActivity'
        checkpoint_path = self.stage1np_activity + '/_checkpoints'
        self.oea.rm_if_exists(table_path)
        self.oea.rm_if_exists(checkpoint_path)
        logger.info(f"Deleted {table_path} and {checkpoint_path}")

    def _process_roster_entity(self, path, entity):
        """ Pseudonymizes a single roster entity and overwrites its delta tables in stage2.

        Args:
            path: path of the roster date folder that contains the entity folder.
            entity: name of the entity folder; must be a key of self.schemas.

        An entity without a registered schema is logged and skipped. An AnalysisException raised
        while reading or writing is logged and swallowed so the remaining entities still get processed.
        """
        if entity not in self.schemas:
            # Without this guard an unexpected folder raised an uncaught KeyError and aborted the whole roster run.
            logger.error(f"No schema defined for roster entity '{entity}' found in {path} - skipping")
            return

        try:
            logger.debug(f"Processing roster entity: path={path}, entity={entity}")
            entity_schema = self.schemas[entity]
            spark_schema = self.oea.to_spark_schema(entity_schema)
            df = spark.read.csv(path + '/' + entity, header='false', schema=spark_schema)
            df_pseudo, df_lookup = self.oea.pseudonymize(df, entity_schema)

            # Either frame may come back without columns, in which case there is nothing to write.
            if df_pseudo.columns:
                df_pseudo.write.format('delta').mode('overwrite').option("mergeSchema", "true").save(self.stage2p + '/' + entity)
            if df_lookup.columns:
                df_lookup.write.format('delta').mode('overwrite').option("mergeSchema", "true").save(self.stage2np + '/' + entity + '_lookup')

        except AnalysisException as error:
            logger.exception(str(error))

    def _process_roster_date_folder(self, date_folder_path):
        """ Processes every entity folder found directly under the given roster date folder. """
        for table_name in self.oea.get_folders(date_folder_path):
            self._process_roster_entity(date_folder_path, table_name)

    def process_roster(self):
        """ Processes all roster data in stage1 and writes out to stage2np and stage2p.

        Each directory under <stage1np>/roster is processed and then moved to
        <stage1np>/roster_processed/ so that it is not picked up again by the next run.
        """
        logger.info("Processing ms_insights roster data from: " + self.stage1np_roster)

        for item in mssparkutils.fs.ls(self.stage1np_roster):
            if item.isDir:
                self._process_roster_date_folder(item.path)
                mssparkutils.fs.mv(item.path, self.stage1np + '/roster_processed/', True)

    def _remove_roster_tables(self, stage_path):
        """ Deletes everything under stage_path other than the TechActivity table; no-op if the path does not exist. """
        if not self.oea.path_exists(stage_path):
            return
        for item in mssparkutils.fs.ls(stage_path):
            if item.name != 'TechActivity':
                mssparkutils.fs.rm(item.path, True)

    def reset_roster_processing(self):
        """ Resets all stage1 to stage2 processing of roster data.

        - deletes the roster delta tables (everything other than TechActivity) from stage2np and stage2p
        - moves everything in <stage1np>/roster_processed back into <stage1np>/roster
        """
        self._remove_roster_tables(self.stage2np)
        self._remove_roster_tables(self.stage2p)

        processed_path = self.stage1np + '/roster_processed'
        if self.oea.path_exists(processed_path):
            # Move roster data back in to "inbound" folder
            for item in mssparkutils.fs.ls(processed_path):
                logger.info(f"Moving {item.path} back into {self.stage1np_roster}")
                mssparkutils.fs.mv(item.path, self.stage1np_roster, True)
        logger.info("Done. Removed roster data from stage2 and moved everything in stage1 from roster_processed folder back into stage1np/ms_insights/roster")
  
