# Digital Activity Schema Standard Class Notebook

This schema standardization class notebook outlines the 4 necessary functions for processing module schemas into the OEA digital activity schema standard.

These 4 functions are:

 - get_digital_activity_schema - which defines what the digital activity standard consists of, and how to map other data sources to this OEA standard.
 - reset_digital_activity_processing - which deletes the "digital_activity" folder from stage 2p, allowing you to start over the schema standardization process. 
 - initialize_standardization - which takes in the user-defined module and tables wanted for processing (at the pipeline-level), does any pre-processing needed for the data, and pushes those tables to "_process_digital_activity".
 - _process_digital_activity - which takes in the source folder and the user-defined schema mapping, executes the standardization process, and writes the standardized data back to stage 2p.

 Any custom/additional data source schema mappings should be defined here.

In [None]:
class DigitalActivity(BaseOEAModule):
    """ Standardizes module tables into the OEA digital activity schema.
     - Reads pseudonymized activity tables from stage2p, optionally pre-processes them,
       and appends the standardized rows to the stage2p "digital_activity" delta table.
     - Only stage2p processing is implemented in this class.

    Relies on names provided by the surrounding notebook environment:
    `oea`, `spark`, `logger`, and `F` (presumably pyspark.sql.functions - confirm).
    """

    def __init__(self, source_folder='digital_activity'):
        """ Registers the standard schema, its column descriptions and the per-source mappings.
            - source_folder: folder name handed to BaseOEAModule
        """
        BaseOEAModule.__init__(self, oea, source_folder)

        # Standard schema rows: [column name, data type, pseudonymization operation].
        self.schemas['ActivityEvents'] = [
            ['event_id', 'string', 'no-op'],
            ['event_type', 'string', 'no-op'],
            ['event_actor', 'string', 'no-op'],
            ['event_object', 'string', 'no-op'],
            ['event_eventTime', 'string', 'no-op'],
            ['entity_type', 'string', 'no-op'],
            ['softwareApplication_version', 'string', 'no-op'],
            ['generated_aggregateMeasure_metric_timeOnTaskSec', 'string', 'no-op'],
            ['generated_aggregateMeasure_metric_numAccess', 'string', 'no-op'],
            ['generated_aggregateMeasure_metric_used', 'string', 'no-op'],
            ['generated_aggregateMeasure_metric_activityReportPeriod', 'string', 'no-op'],
            ['year', 'integer', 'no-op'],
            ['month', 'integer', 'no-op'],
        ]

        # Human-readable descriptions: [column name, description].
        # NOTE(review): 'entity_type' and 'softwareApplication_version' carry the same
        # description text - looks like a copy/paste; confirm the intended wording.
        self.schemasDetail = {}
        self.schemasDetail['ActivityEvents'] = [
            ['schema_source', 'https://www.imsglobal.org/spec/caliper/v1p2#tooluseevent'],
            ['event_id', 'unique ID used as a signal key'],
            ['event_type', 'type of activity event'],
            ['event_actor', 'student or teacher that created the signal'],
            ['event_object', 'entity that comprises the object of the interaction'],
            ['event_eventTime', 'date/timestamp of the activity signal'],
            ['entity_type', 'value that describes the properties of the user agent hosting this SoftwareApplication.'],
            ['softwareApplication_version', 'value that describes the properties of the user agent hosting this SoftwareApplication.'],
            ['generated_aggregateMeasure_metric_timeOnTaskSec', 'time on task in seconds'],
            ['generated_aggregateMeasure_metric_numAccess', 'number of accesses'],
            ['generated_aggregateMeasure_metric_used', 'used true or false'],
            ['generated_aggregateMeasure_metric_activityReportPeriod', 'activity data collected is reported over this number of days'],
        ]

        # Schema mappings, keyed by Module_Table. Each row is [standard column, source column],
        # except 'event_object', whose second item is a literal label written to every row.
        self.schemaMappings = {}
        self.schemaMappings['M365_TechActivity'] = [
            ['event_id', 'SignalId'],
            ['event_type', 'SignalType'],
            ['event_actor', 'ActorId_pseudonym'],
            ['event_object', 'MS_Insights'],
            ['event_eventTime', 'StartTime'],
            ['entity_type', 'AppName'],
            ['softwareApplication_version', 'SchemaVersion'],
            ['generated_aggregateMeasure_metric_timeOnTaskSec', 'MeetingDuration'],
        ]
        self.schemaMappings['GraphAPI_M365'] = [
            ['event_actor', 'userPrincipalName_pseudonym'],
            ['event_object', 'MS_GraphAPI_M365'],
            ['event_eventTime', 'reportRefreshDate'],
            ['entity_type', 'm365_app_name'],
            ['generated_aggregateMeasure_metric_used', 'used'],
            ['generated_aggregateMeasure_metric_activityReportPeriod', 'reportPeriod'],
        ]
        self.schemaMappings['GraphAPI_Teams'] = [
            ['event_type', 'meetings_and_messages'],
            ['event_actor', 'userPrincipalName_pseudonym'],
            ['event_object', 'MS_GraphAPI_Teams'],
            ['event_eventTime', 'reportRefreshDate'],
            ['generated_aggregateMeasure_metric_timeOnTaskSec', 'videoDuration'],
            ['generated_aggregateMeasure_metric_numAccess', 'counts'],
            ['generated_aggregateMeasure_metric_activityReportPeriod', 'reportPeriod'],
        ]
        self.schemaMappings['Clever_DailyParticipation'] = [
            ['event_actor', 'sis_id_pseudonym'],
            ['event_object', 'Clever_Daily_Participation'],
            ['event_eventTime', 'date'],
            ['generated_aggregateMeasure_metric_used', 'active'],
            ['generated_aggregateMeasure_metric_numAccess', 'num_logins'],
        ]
        self.schemaMappings['Clever_ResourceUsage'] = [
            ['event_type', 'resource_type'],
            ['event_actor', 'sis_id_pseudonym'],
            ['event_object', 'Clever_Resource_Usage'],
            ['event_eventTime', 'date'],
            ['entity_type', 'resource_name'],
            ['generated_aggregateMeasure_metric_numAccess', 'num_access'],
        ]
        self.schemaMappings['iReady_Comp_ELA'] = [
            ['event_type', 'Subject'],
            ['event_actor', 'StudentID_pseudonym'],
            ['event_object', 'iReady_Comprehensive_Student_Lesson_Activity_with_Standards_ELA'],
            ['event_eventTime', 'CompletionDate'],
            ['entity_type', 'Domain'],
            ['generated_aggregateMeasure_metric_timeOnTaskSec', 'TotalTimeonLesson_sec_'],
        ]
        self.schemaMappings['iReady_Comp_Math'] = [
            ['event_type', 'Subject'],
            ['event_actor', 'StudentID_pseudonym'],
            ['event_object', 'iReady_Comprehensive_Student_Lesson_Activity_with_Standards_Math'],
            ['event_eventTime', 'CompletionDate'],
            ['entity_type', 'Domain'],
            ['generated_aggregateMeasure_metric_timeOnTaskSec', 'TotalTimeonLesson_sec_'],
        ]

    def get_digital_activity_schema(self):
        """ Prints the digital activity schema.
            - lists each standard column with its data type, then each column description
            - useful when aligning a new data source to the standard
        """
        print("OEA Standard Digital Activity Schema:\n")

        print("Columns and data types:\n")
        for var in self.schemas['ActivityEvents']:
            print(var)

        print("\nColumn descriptions:\n")
        for var in self.schemasDetail['ActivityEvents']:
            print(var)

    def reset_digital_activity_processing(self):
        """ Resets all data. This is intended for use during initial testing - use with caution.
            - deletes whatever is stored at self.stage2p (set by BaseOEAModule)
        """
        # NOTE(review): _process_digital_activity always writes to stage2p/digital_activity,
        # while this deletes self.stage2p; these presumably only coincide for the default
        # source_folder - confirm against BaseOEAModule.
        oea.rm_if_exists(self.stage2p)
        logger.info(f"Deleted {self.stage2p}")

    def initialize_standardization(self, module, table):
        """ Initializes digital activity schema standardization.
            - module: name of the module that owns the table (e.g. 'graph_api')
            - table: name of the pseudonymized table to standardize
            - looks up the (module, table) pair; pairs without a defined process are only logged
            - some tables are pre-processed first and staged in a temporary stage2p table
        """
        logger.info("Initializing standardization for digital activity schemas")

        # (module, table) -> (schemaMappings key, pre-processing step or None).
        # None means the stage2p table is standardized directly, with no temp table.
        handlers = {
            ('M365', 'TechActivity_pseudo'):
                ('M365_TechActivity', self._prepare_m365_tech_activity),
            ('graph_api', 'm365_app_user_detail_pseudo'):
                ('GraphAPI_M365', self._prepare_graph_m365_app_usage),
            ('graph_api', 'teams_activity_user_detail_pseudo'):
                ('GraphAPI_Teams', self._prepare_graph_teams_activity),
            ('clever', 'daily_participation_pseudo'):
                ('Clever_DailyParticipation', None),
            ('clever', 'resource_usage_pseudo'):
                ('Clever_ResourceUsage', None),
            ('iready', 'comprehensive_student_lesson_activity_with_standards_ela_pseudo'):
                ('iReady_Comp_ELA', self._prepare_iready_lesson_activity),
            ('iready', 'comprehensive_student_lesson_activity_with_standards_math_pseudo'):
                ('iReady_Comp_Math', self._prepare_iready_lesson_activity),
        }

        handler = handlers.get((module, table))
        if handler is None:
            logger.info("No defined process for the module or table initialized")
            return

        mapping_key, prepare = handler
        schema_mapping = self.schemaMappings[mapping_key]
        if prepare is None:
            self._process_digital_activity(f'stage2p/{module}/{table}', schema_mapping)
        else:
            self._process_via_temp(prepare(module, table), schema_mapping)

    def _process_via_temp(self, df, schemaMapping):
        """ Stages a pre-processed DataFrame in stage2p/temp, standardizes it, then removes the temp table.
            - df: pre-processed Spark DataFrame
            - schemaMapping: mapping handed on to _process_digital_activity
        """
        temp_path = oea.path('stage2p', directory_path="temp")
        # The write below appends, so leftovers from an earlier failed run must be
        # cleared first or they would be standardized again alongside the new rows.
        oea.rm_if_exists(temp_path)
        try:
            df.write.save(temp_path, format='delta', mode='append', mergeSchema='true')
            self._process_digital_activity('stage2p/temp', schemaMapping)
        finally:
            # Always remove the scratch table, even when processing raised.
            oea.rm_if_exists(temp_path)

    @staticmethod
    def _duration_to_seconds(column_name):
        """ Builds a column expression turning an h:m:s text column into total seconds.
            - a component that cannot be extracted counts as 0
        """
        pattern = r'(\d+):(\d+):(\d+)'

        def component(group):
            return F.coalesce(F.regexp_extract(column_name, pattern, group).cast('int'), F.lit(0))

        return component(1) * 3600 + component(2) * 60 + component(3)

    @staticmethod
    def _melt_with_pandas(module, table, id_vars, value_vars, var_name, value_name):
        """ Loads a table and melts its value columns into a (name, value) column pair.
            - the whole table is converted to pandas, so it is pulled onto the driver
        """
        wide = oea.load(module, table).toPandas()
        melted = wide.melt(id_vars=id_vars, value_vars=value_vars, var_name=var_name, value_name=value_name)
        return spark.createDataFrame(melted)

    def _prepare_m365_tech_activity(self, module, table):
        """ Pre-processes the Insights TechActivity table.
            - keeps only activity rows whose ActorId_pseudonym matches a PersonId_pseudonym
              in M365/AadUserPersonMapping_pseudo (inner join)
            - converts MeetingDuration from a duration to the total time, in seconds
        """
        activity = oea.load(module, table)
        person_mapping = oea.load('M365', 'AadUserPersonMapping_pseudo')
        # NOTE(review): the join key is renamed back to ActorId_pseudonym, so the ID values
        # are unchanged by this step; the original comment spoke of mapping to AADUser
        # student IDs - confirm whether a different mapping column was intended.
        joined = activity.join(person_mapping, activity.ActorId_pseudonym == person_mapping.PersonId_pseudonym, how='inner')
        joined = joined.drop(F.col('ActorId_pseudonym'))
        joined = joined.withColumnRenamed('PersonId_pseudonym', 'ActorId_pseudonym')
        joined = joined.select(activity.columns[:])
        return joined.withColumn('MeetingDuration', self._duration_to_seconds('MeetingDuration'))

    def _prepare_graph_m365_app_usage(self, module, table):
        """ Pre-processes m365_app_user_detail: melts the per-app usage columns into m365_app_name / used. """
        return self._melt_with_pandas(
            module, table,
            id_vars=['userPrincipalName_pseudonym', 'reportRefreshDate', 'reportPeriod'],
            value_vars=['excel', 'oneNote', 'outlook', 'powerPoint', 'teams', 'word'],
            var_name='m365_app_name',
            value_name='used',
        )

    def _prepare_graph_teams_activity(self, module, table):
        """ Pre-processes teams_activity_user_detail: melts the count columns into meetings_and_messages / counts. """
        return self._melt_with_pandas(
            module, table,
            id_vars=['userPrincipalName_pseudonym', 'reportRefreshDate', 'reportPeriod', 'videoDuration'],
            value_vars=['callCount', 'meetingCount', 'meetingsAttendedCount', 'meetingsOrganizedCount', 'privateChatMessageCount', 'teamChatMessageCount'],
            var_name='meetings_and_messages',
            value_name='counts',
        )

    def _prepare_iready_lesson_activity(self, module, table):
        """ Pre-processes an iReady lesson activity table: converts lesson time from minutes to seconds. """
        lessons = oea.load(module, table)
        lessons = lessons.withColumn('TotalTimeonLesson_min_', F.col('TotalTimeonLesson_min_')*60)
        return lessons.withColumnRenamed('TotalTimeonLesson_min_', 'TotalTimeonLesson_sec_')

    def _process_digital_activity(self,source_path,schemaMapping):
        """ Processes digital activity data into standardized table with standard activity schema.
            - source_path: storage location and directory name of the delta table to standardize
              (e.g. stage2p/temp after pre-processing)
            - schemaMapping: list of [standard column, source column] pairs; must contain an
              'event_object' pair (its second item is a literal label) and should map
              'event_eventTime', from which year and month are derived
            - raises ValueError when the mapping has no 'event_object' entry
        """
        logger.info("Processing digital activity data from: " + source_path)

        # Ignore incomplete pairs (either side missing).
        pairs = [(schema_col, source_col) for schema_col, source_col in (schemaMapping or [])
                 if schema_col is not None and source_col is not None]

        event_objects = [source_col for schema_col, source_col in pairs if schema_col == "event_object"]
        if not event_objects:
            raise ValueError("schemaMapping must contain an 'event_object' entry")
        obj = event_objects[0]

        dfActivity = oea.load_delta(source_path)

        # Select every mapped source column, cast to string, under its standard column name.
        standardized_columns = [F.col(source_col).cast('string').alias(schema_col)
                                for schema_col, source_col in pairs
                                if schema_col != "event_object"]
        df = dfActivity.select(standardized_columns)

        df = df.withColumn('event_object', F.lit(obj))
        # year and month are the partition columns of the digital_activity table.
        df = df.withColumn('year', F.year(F.col('event_eventTime'))).withColumn('month', F.month(F.col('event_eventTime')))
        df.write.save(oea.path('stage2p', directory_path="digital_activity"), format='delta', mode='append', partitionBy=['year', 'month'], mergeSchema='true')

        logger.info("Complete processing from: " + source_path)
