# Microsoft Insights Class

In [None]:
class Insights(BaseOEAModule):
    """
    Provides data processing methods for MS Insights data.
    Data is expected to be received via ADS into stage1np/ms_insights
    The structure of the folders in stage1np will then be something like:
        -> stage1np/ms_insights/activity/2021-06-02
            -> stage1np/ms_insights/activity/2021-06-02/ApplicationUsage.csv
        -> stage1np/ms_insights/roster/2021-06-02T06-05-11/
            -> stage1np/ms_insights/roster/2021-06-02T06-05-11/AadUser
            -> stage1np/ms_insights/roster/2021-06-02T06-05-11/Person
            etc

    In stage2, everything is written to stage2np/ms_insights and stage2p/ms_insights
    """

    def __init__(self, source_folder='M365'):
        """Set up the stage1 sub-folder paths and the column schemas of all Insights tables.

        Args:
            source_folder: Folder name handed to ``BaseOEAModule.__init__``; defaults to 'M365'.

        Every schema is a list of ``[column name, data type, operation]`` rows. The
        processing methods of this class pass these rows to ``oea.to_spark_schema`` and
        ``oea.pseudonymize``; the meaning of each operation value is defined there.
        The source files are read without a header row, so the order of the rows is
        presumably what binds each name to a source column - keep it in sync with the feed.
        """
        # NOTE(review): self.stage1np and self.schemas are used below without being created
        # here - presumably BaseOEAModule.__init__ provides them; confirm against the base class.
        BaseOEAModule.__init__(self, source_folder)

        self.stage1np_activity = self.stage1np + '/activity'
        self.stage1np_roster = self.stage1np + '/roster'

        # ---- Activity table (used by process_insights_activity_stage1_data) ----
        self.schemas['TechActivity'] = [
            ['SignalType', 'string', 'no-op'],
            ['StartTime', 'timestamp', 'no-op'],
            ['UserAgent', 'string', 'no-op'],
            ['SignalId', 'string', 'no-op'],
            ['SisClassId', 'string', 'no-op'],
            ['ClassId', 'string', 'no-op'],
            ['ChannelId', 'string', 'no-op'],
            ['AppName', 'string', 'no-op'],
            ['ActorId', 'string', 'hash-no-lookup'],
            ['ActorRole', 'string', 'no-op'],
            ['SchemaVersion', 'string', 'no-op'],
            ['AssignmentId', 'string', 'no-op'],
            ['SubmissionId', 'string', 'no-op'],
            ['SubmissionCreatedTime', 'timestamp', 'no-op'],
            ['Action', 'string', 'no-op'],
            ['DueDate', 'timestamp', 'no-op'],
            ['ClassCreationDate', 'timestamp', 'no-op'],
            ['Grade', 'string', 'no-op'],
            ['SourceFileExtension', 'string', 'no-op'],
            ['MeetingDuration', 'string', 'no-op'],
            ['MeetingSessionId', 'string', 'no-op'],
            ['MeetingType', 'string', 'no-op'],
            ['ReadingSubmissionWordsPerMinute', 'integer', 'no-op'],
            ['ReadingSubmissionAccuracyScore', 'string', 'no-op'],
            ['ReadingSubmissionMispronunciationCount', 'integer', 'no-op'],
            ['ReadingSubmissionRepetitionsCount', 'integer', 'no-op'],
            ['ReadingSubmissionInsertionsCount', 'integer', 'no-op'],
            ['ReadingSubmissionObmissionCount', 'integer', 'no-op'],
            ['ReadingSubmissionAttemptNumber', 'integer', 'no-op'],
            ['ReadingAssignmentWordCount', 'integer', 'no-op'],
            ['ReadingAssignmentFleschKincaidGradeLevel', 'string', 'no-op'],
            ['ReadingAssignmentLanguage', 'string', 'no-op'],
            ['year', 'integer', 'partition-by'],
        ]

        # ---- Roster tables (looked up by entity folder name in _process_roster_entity) ----
        self.schemas['AadGroup'] = [
            ['ObjectId', 'string', 'hash'],
            ['DisplayName', 'string', 'mask'],
            ['Mail', 'string', 'mask'],
            ['MailNickname', 'string', 'mask'],
            ['AnchorId', 'string', 'hash'],
            ['SectionId', 'string', 'no-op'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
            ['LastSeenDateTime', 'timestamp', 'no-op'],
        ]
        self.schemas['AadGroupMembership'] = [
            ['UserObjectId', 'string', 'hash-no-lookup'],
            ['GroupObjectId', 'string', 'hash-no-lookup'],
            ['Role', 'string', 'no-op'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
            ['LastSeenDateTime', 'timestamp', 'no-op'],
        ]
        self.schemas['AadUser'] = [
            ['ObjectId', 'string', 'hash'],
            ['UserPrincipalName', 'string', 'hash'],
            ['Mail', 'string', 'mask'],
            ['MailNickName', 'string', 'mask'],
            ['GivenName', 'string', 'mask'],
            ['Surname', 'string', 'mask'],
            ['DisplayName', 'string', 'mask'],
            ['AnchorId', 'string', 'hash'],
            ['StudentId', 'string', 'hash-no-lookup'],
            ['TeacherId', 'string', 'hash-no-lookup'],
            ['Role', 'string', 'no-op'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
            ['LastSeenDateTime', 'timestamp', 'no-op'],
        ]
        self.schemas['AadUserPersonMapping'] = [
            ['ObjectId', 'string', 'hash-no-lookup'],
            ['PersonId', 'string', 'hash-no-lookup'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
            ['LastSeenDateTime', 'timestamp', 'no-op'],
        ]
        self.schemas['Course'] = [
            ['Id', 'string', 'no-op'],
            ['SourceSystemId', 'string', 'no-op'],
            ['ExternalId', 'string', 'no-op'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
            ['LastSeenDateTime', 'timestamp', 'no-op'],
            ['Name', 'string', 'no-op'],
            ['OrganizationId', 'string', 'no-op'],
            ['IsActiveInSession', 'boolean', 'no-op'],
            ['Code', 'string', 'no-op'],
            ['AcademicYearSessionId', 'string', 'no-op'],
        ]
        self.schemas['CourseGradeLevel'] = [
            ['Id', 'string', 'no-op'],
            ['CourseId', 'string', 'no-op'],
            ['RefGradeLevelId', 'string', 'no-op'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
        ]
        self.schemas['CourseSubject'] = [
            ['Id', 'string', 'no-op'],
            ['CourseId', 'string', 'no-op'],
            ['RefAcademicSubjectId', 'string', 'no-op'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
        ]
        self.schemas['Enrollment'] = [
            ['Id', 'string', 'no-op'],
            ['SourceSystemId', 'string', 'no-op'],
            ['ExternalId', 'string', 'no-op'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
            ['LastSeenDateTime', 'timestamp', 'no-op'],
            ['PersonId', 'string', 'hash-no-lookup'],
            ['SectionId', 'string', 'no-op'],
            ['RefSectionRoleId', 'string', 'no-op'],
            ['IsActiveInSession', 'boolean', 'no-op'],
            ['IsPrimaryStaffForSection', 'boolean', 'no-op'],
            ['EntryDate', 'string', 'no-op'],
            ['ExitDate', 'string', 'no-op'],
        ]
        self.schemas['Organization'] = [
            ['Id', 'string', 'no-op'],
            ['SourceSystemId', 'string', 'no-op'],
            ['ExternalId', 'string', 'no-op'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
            ['LastSeenDateTime', 'timestamp', 'no-op'],
            ['Name', 'string', 'no-op'],
            ['Identifier', 'string', 'no-op'],
            ['RefOrganizationTypeId', 'string', 'no-op'],
            ['ParentOrganizationId', 'string', 'no-op'],
        ]
        self.schemas['Person'] = [
            ['Id', 'string', 'hash'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
            ['LastSeenDateTime', 'timestamp', 'no-op'],
            ['Surname', 'string', 'mask'],
            ['GivenName', 'string', 'mask'],
            ['MiddleName', 'string', 'mask'],
            ['PreferredSurname', 'string', 'mask'],
            ['PreferredGivenName', 'string', 'mask'],
            ['PreferredMiddleName', 'string', 'mask'],
        ]
        self.schemas['PersonDemographic'] = [
            ['PersonId', 'string', 'hash'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
            ['LastSeenDateTime', 'timestamp', 'no-op'],
            ['RefSexId', 'string', 'mask'],
            ['BirthDate', 'string', 'mask'],
            ['BirthCity', 'string', 'mask'],
            ['BirthState', 'string', 'mask'],
            ['BirthCountryCode', 'string', 'mask'],
        ]
        self.schemas['PersonDemographicEthnicity'] = [
            ['Id', 'string', 'no-op'],
            ['PersonId', 'string', 'hash-no-lookup'],
            ['RefEthnicityId', 'string', 'mask'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
            ['LastSeenDateTime', 'timestamp', 'no-op'],
        ]
        self.schemas['PersonDemographicPersonFlag'] = [
            ['Id', 'string', 'no-op'],
            ['PersonId', 'string', 'hash-no-lookup'],
            ['RefPersonFlagId', 'string', 'mask'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
            ['LastSeenDateTime', 'timestamp', 'no-op'],
        ]
        self.schemas['PersonDemographicRace'] = [
            ['Id', 'string', 'no-op'],
            ['PersonId', 'string', 'hash-no-lookup'],
            ['RefRaceId', 'string', 'mask'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
            ['LastSeenDateTime', 'timestamp', 'no-op'],
        ]
        self.schemas['PersonEmailAddress'] = [
            ['Id', 'string', 'no-op'],
            ['PersonId', 'string', 'hash-no-lookup'],
            ['EmailAddress', 'string', 'mask'],
            ['PriorityOrder', 'short', 'no-op'],
            ['RefEmailAddressTypeId', 'string', 'no-op'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
        ]
        self.schemas['PersonIdentifier'] = [
            ['Id', 'string', 'no-op'],
            ['PersonId', 'string', 'hash-no-lookup'],
            ['SourceSystemId', 'string', 'no-op'],
            ['RefIdentifierTypeId', 'string', 'no-op'],
            ['Identifier', 'string', 'mask'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
            ['IsPresentInSource', 'boolean', 'no-op'],
        ]
        self.schemas['PersonOrganizationRole'] = [
            ['Id', 'string', 'no-op'],
            ['SourceSystemId', 'string', 'no-op'],
            ['ExternalId', 'string', 'no-op'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
            ['LastSeenDateTime', 'timestamp', 'no-op'],
            ['OrganizationId', 'string', 'no-op'],
            ['PersonId', 'string', 'hash-no-lookup'],
            ['RefRoleId', 'string', 'no-op'],
            ['SessionId', 'string', 'no-op'],
            ['IsActiveInSession', 'boolean', 'no-op'],
            ['RoleStartDate', 'string', 'mask'],
            ['RoleEndDate', 'string', 'mask'],
            ['IsPrimary', 'boolean', 'no-op'],
            ['RefGradeLevelId', 'string', 'no-op'],
        ]
        self.schemas['PersonPhoneNumber'] = [
            ['Id', 'string', 'no-op'],
            ['PersonId', 'string', 'hash-no-lookup'],
            ['PhoneNumber', 'string', 'mask'],
            ['PriorityOrder', 'short', 'no-op'],
            ['RefPhoneNumberTypeId', 'string', 'no-op'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
        ]
        self.schemas['PersonRelationship'] = [
            ['Id', 'string', 'no-op'],
            ['PersonId', 'string', 'hash-no-lookup'],
            ['RelatedPersonId', 'string', 'hash-no-lookup'],
            ['RefPersonRelationshipId', 'string', 'no-op'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
        ]
        self.schemas['RefDefinition'] = [
            ['Id', 'string', 'no-op'],
            ['Namespace', 'string', 'no-op'],
            ['RefType', 'string', 'no-op'],
            ['Code', 'string', 'no-op'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
            ['LastSeenDateTime', 'timestamp', 'no-op'],
            ['SortOrder', 'short', 'no-op'],
        ]
        self.schemas['RefTranslation'] = [
            ['Id', 'string', 'no-op'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
            ['LastSeenDateTime', 'timestamp', 'no-op'],
            ['Language', 'string', 'no-op'],
            ['LocalizedName', 'string', 'no-op'],
        ]
        self.schemas['Section'] = [
            ['Id', 'string', 'no-op'],
            ['SourceSystemId', 'string', 'no-op'],
            ['ExternalId', 'string', 'no-op'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
            ['LastSeenDateTime', 'timestamp', 'no-op'],
            ['Name', 'string', 'no-op'],
            ['OrganizationId', 'string', 'no-op'],
            ['CourseId', 'string', 'no-op'],
            ['Code', 'string', 'no-op'],
            ['Location', 'string', 'no-op'],
        ]
        self.schemas['SectionGradeLevel'] = [
            ['Id', 'string', 'no-op'],
            ['SectionId', 'string', 'no-op'],
            ['RefGradeLevelId', 'string', 'no-op'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
        ]
        self.schemas['SectionSession'] = [
            ['Id', 'string', 'no-op'],
            ['SectionId', 'string', 'no-op'],
            ['SessionId', 'string', 'no-op'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
            ['LastSeenDateTime', 'timestamp', 'no-op'],
            ['IsActiveInSession', 'boolean', 'no-op'],
        ]
        self.schemas['SectionSubject'] = [
            ['Id', 'string', 'no-op'],
            ['SectionId', 'string', 'no-op'],
            ['RefAcademicSubjectId', 'string', 'no-op'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
        ]
        self.schemas['Session'] = [
            ['Id', 'string', 'no-op'],
            ['SourceSystemId', 'string', 'no-op'],
            ['ExternalId', 'string', 'no-op'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
            ['LastSeenDateTime', 'timestamp', 'no-op'],
            ['Name', 'string', 'no-op'],
            ['RefSessionTypeId', 'string', 'no-op'],
            ['RefAcademicYearId', 'string', 'no-op'],
            ['StartDate', 'string', 'no-op'],
            ['EndDate', 'string', 'no-op'],
            ['ParentSessionId', 'string', 'no-op'],
        ]
        self.schemas['SourceSystem'] = [
            ['Id', 'string', 'no-op'],
            ['Name', 'string', 'no-op'],
            ['FirstSeenDateTime', 'timestamp', 'no-op'],
            ['LastSeenDateTime', 'timestamp', 'no-op'],
        ]
    
    def ingest(self):
        """Walk the top-level items of the stage1 folder and process the ones that are known.

        The "activity" and "roster" items are handed to their processing methods, the
        "schemas" folder and the manifest json are skipped on purpose, and anything else
        is only reported in the log.
        """
        logger.info("Processing microsoft_insights data from: " + self.stage1np)

        # Item name -> method that moves that item's data from stage1 into stage2.
        processors = {
            "activity": self.process_insights_activity_stage1_data,
            "roster": self.process_roster,
        }
        # Item name -> log message for items that are deliberately not ingested.
        skip_messages = {
            "schemas": "ignoring ingestion of the schemas folder",
            "current.manifest.cdm.json": "ignoring ingestion of the manifest json",
        }

        for entry in mssparkutils.fs.ls(self.stage1np):
            entry_name = entry.name
            if entry_name in processors:
                processors[entry_name]()
                continue
            logger.info(skip_messages.get(entry_name, "No defined function for processing this insights data"))

        logger.info("Finished ingesting insights data from stage 1 to stage 2")

    def process_insights_activity_stage1_data(self):
        """Stream the stage1 activity csv files into the stage2 TechActivity delta tables.

        The files matching ``<stage1np_activity>/*/*.csv`` are read as a header-less csv
        stream with the 'TechActivity' schema, de-duplicated on 'SignalId', extended with
        'year' and 'month' columns derived from 'StartTime', and pseudonymized. Each of the
        two resulting frames that has at least one column is appended to its delta table
        (partitioned by 'year') with a run-once trigger; this method blocks until each
        write has terminated. Checkpoints are kept under the stage1 activity folder.
        """
        logger.info("Processing ms_insights activity data from: " + self.stage1np_activity)

        # Currently not using the OEA ingest_incremental_data function due to pulling out the partition folders
        spark_schema = oea.to_spark_schema(self.schemas['TechActivity'])
        activity = spark.readStream.csv(self.stage1np_activity + '/*/*.csv', header='false', schema=spark_schema)
        activity = activity.dropDuplicates(['SignalId'])
        activity = activity.withColumn('year', F.year(F.col('StartTime')))
        activity = activity.withColumn('month', F.month(F.col('StartTime')))

        pseudo_frame, lookup_frame = oea.pseudonymize(activity, self.schemas['TechActivity'])

        # (frame, message when there is nothing to write, checkpoint sub-folder, destination)
        sinks = (
            (pseudo_frame, 'No data to be written to stage2p', '/_checkpoints_p',
             self.stage2p + '/TechActivity_pseudo'),
            (lookup_frame, 'No data to be written to stage2np', '/_checkpoints_np',
             self.stage2np + '/TechActivity_lookup'),
        )
        for frame, nothing_to_write, checkpoint_folder, destination in sinks:
            if not frame.columns:
                logger.info(nothing_to_write)
                continue

            writer = (
                frame.writeStream
                .format("delta")
                .outputMode("append")
                .trigger(once=True)
                .option("checkpointLocation", self.stage1np_activity + checkpoint_folder)
                .partitionBy('year')
            )
            running_query = writer.start(destination)
            # Blocks until the query ends (stop() or error); a StreamingQueryException is
            # thrown if an exception occurs.
            running_query.awaitTermination()
            logger.info(running_query.lastProgress)

    def _process_roster_entity(self, path, entity):
        """Load one roster table from a snapshot folder and overwrite its stage2 delta tables.

        Args:
            path: Snapshot folder that contains one sub-folder per roster entity.
            entity: Name of the entity sub-folder; it is also the key into ``self.schemas``
                and the prefix of the ``<entity>_pseudo`` / ``<entity>_lookup`` tables.

        An entity without a registered schema is logged and skipped, so that a single
        unexpected folder in the snapshot does not abort the remaining entities.
        An ``AnalysisException`` raised while reading, pseudonymizing or writing is logged
        and swallowed (best effort per entity).
        """
        try:
            entity_schema = self.schemas[entity]
        except KeyError:
            # Previously this KeyError escaped (only AnalysisException was handled) and
            # stopped the processing of every entity that came after it.
            logger.info("No schema defined for roster entity '" + str(entity) + "', skipping it")
            return

        source_path = path + '/' + entity
        p_destination_path = self.stage2p + '/' + entity + '_pseudo'
        np_destination_path = self.stage2np + '/' + entity + '_lookup'

        try:
            spark_schema = oea.to_spark_schema(entity_schema)
            # The roster files carry no header row; column names come from the schema.
            df = spark.read.load(source_path, format='csv', header='false', schema=spark_schema)
            df_pseudo, df_lookup = oea.pseudonymize(df, entity_schema)

            if len(df_pseudo.columns) == 0:
                logger.info('No data to be written to stage2p')
            else:
                # Each run replaces the table with the content of this snapshot.
                df_pseudo.write.save(p_destination_path, format='delta', mode='overwrite')

            if len(df_lookup.columns) == 0:
                logger.info('No data to be written to stage2np')
            else:
                df_lookup.write.save(np_destination_path, format='delta', mode='overwrite')

        except AnalysisException as error:
            logger.exception(str(error))


    def _process_roster_date_folder(self, date_folder_path):
        """Process every entity folder that ``oea.get_folders`` lists for the given snapshot folder."""
        for entity in oea.get_folders(date_folder_path):
            self._process_roster_entity(date_folder_path, entity)

    def process_roster(self):
        """Process the latest roster snapshot found in stage1 and write it to stage2np and stage2p.

        Only the folder returned by ``oea.get_latest_folder`` is processed; other
        snapshot folders under the stage1 roster folder are not touched by this method.
        When no snapshot folder is reported, the method logs that fact and returns.
        """
        logger.info("Processing ms_insights roster snapshot data from: " + self.stage1np_roster)

        latest_batch = oea.get_latest_folder(self.stage1np_roster)
        if not latest_batch:
            # NOTE(review): assumes get_latest_folder returns a falsy value (None or '')
            # when the roster folder holds no snapshots - confirm against the oea helper.
            # Without this guard the path concatenation below would raise a TypeError.
            logger.info("No roster snapshot folder found in: " + self.stage1np_roster)
            return

        source_path = self.stage1np_roster + '/' + latest_batch
        self._process_roster_date_folder(source_path)
  
