# Chronic Absenteeism Package Class Notebook

Currently contained/provided within this notebook: 
 - Methods for reading from and writing back to stage 2p, and for then writing to stage 3p (goal for the package: separate methods for p vs. np processing).
 - Activity Data Schema Mapping (as defined for Insights data at this point).

 Creates 1 new table (based on the IMS Global Caliper standard, with a few slight changes): Event.
 The list below describes the schema/column mapping from the Activity tables to the new, OEA-standardized table:
1. Event:
    - __id:__ unique ID used as a signal key *(e.g. Insights Activity Table: SignalId)*
    - __type:__ type of Activity signal *(e.g. Insights Activity Table: AppName)*
    - __actor:__ student or teacher that created the signal *(e.g. Insights Activity Table: ActorId)*
    - __ipAddress:__ IP address origination of signal *(e.g. Graph SignInAuditLogs Activity Table: ipAddress)*
    - __action:__ description of signal (or signal type) *(e.g. Insights Activity Table: SignalType)*
    - __object:__ mapping to SoftwareApplication table for activity signal data source *(e.g. "Insights")*
    - __eventTime:__ date/timestamp of the activity signal *(e.g. Insights Activity Table: StartTime)*
    - __details:__ measures/details on some activities *(e.g. Insights Activity Table: MeetingDuration if the SignalType is Meeting)*
    - __version:__ current schema version being processed *(e.g. Insights Activity Table: SchemaVersion)*


In [None]:
# Unsure if this is the best route
class ChronicAbsenteeism(BaseOEAModule):
    """
    Package class for the Chronic Absenteeism module; currently only covers stage2p data.

     - Reads Insights activity data from stage2p and writes it back to stage2p using the
       OEA standard activity ("Event") column mapping.
     - Provides a structured-streaming method that writes the mapped table on to stage3p
       (not yet invoked by curate_activityStage2p).
    """

    def __init__(self, source_folder='ActivitySchema'):
        """
        Set up the module paths and register the table schemas used by this class.

        Args:
            source_folder: folder name handed to BaseOEAModule.__init__.
                (presumably this is what self.stage2p / self.stage3p are derived
                from - confirm against BaseOEAModule)
        """
        super().__init__(source_folder)
        print(self.stage2p)

        # Location of the already-ingested M365 (Insights) activity data in stage2p.
        self.stage2p_insightsActivity = oea.path('stage2p', directory_path='M365')
        print(self.stage2p_insightsActivity)

        # TODO: work on partitioning and model history versioning.
        # Each schema entry is [column name, data type, operation]; the third element
        # looks like the pseudonymization operation ('no-op' / 'hash') - confirm in OEA docs.
        self.schemas['ActivityEvents'] = [
            ['id', 'string', 'no-op'],
            ['type', 'string', 'no-op'],
            ['actor', 'string', 'no-op'],
            ['action', 'string', 'no-op'],
            ['object', 'string', 'no-op'],
            ['eventTime', 'timestamp', 'no-op'],
            ['details', 'string', 'no-op'],
            ['version', 'string', 'no-op'],
        ]

        self.schemas['TechActivity'] = [
            ['SignalType', 'string', 'no-op'],
            ['StartTime', 'string', 'no-op'],
            ['UserAgent', 'string', 'no-op'],
            ['SignalId', 'string', 'hash'],
            ['SisClassId', 'string', 'hash'],
            ['ClassId', 'string', 'hash'],
            ['ChannelId', 'string', 'hash'],
            ['AppName', 'string', 'no-op'],
            ['ActorId', 'string', 'hash'],
            ['ActorRole', 'string', 'no-op'],
            ['SchemaVersion', 'string', 'no-op'],
            ['AssignmentId', 'string', 'hash'],
            ['SubmissionId', 'string', 'hash'],
            ['SubmissionCreatedTime', 'timestamp', 'no-op'],
            ['Action', 'string', 'no-op'],
            ['DueDate', 'string', 'no-op'],
            ['ClassCreationDate', 'string', 'no-op'],
            ['Grade', 'string', 'no-op'],
            ['SourceFileExtension', 'string', 'no-op'],
            ['MeetingDuration', 'string', 'no-op'],
            ['MeetingSessionId', 'string', 'hash'],
            ['MeetingType', 'string', 'no-op'],
        ]

    # might be able to remove this parent method??
    def curate_activityStage2p(self, year=2021):
        """
        Scan the stage2p Insights activity folder and curate the data for one year.

        Every entry listed under self.stage2p_insightsActivity is inspected by name:
        the entry named "year=<year>" triggers creation of the OEA standard activity
        table; checkpoint and delta-log folders are skipped; anything else is only logged.

        Args:
            year: year whose "year=<year>" folder should be curated. Defaults to 2021,
                the value that was previously hard-coded.
        """
        logger.info("Processing microsoft_insights data from: " + self.stage2p_insightsActivity)

        year_folder = f"year={year}"
        for item in mssparkutils.fs.ls(self.stage2p_insightsActivity):
            if item.name == year_folder:
                logger.info(f"Curating {year_folder} data; pass a different year to process another year's data")
                self._create_oea_standard_activity_table(year)
                # TODO: enable the stage3p write once it has been validated:
                # self._process_activity_stage2p_data()
            elif item.name == "_checkpoints_p":
                logger.info("Ignore processing of the checkpoints from previous curation")
            elif item.name == "_delta_log":
                logger.info("Ignore processing the delta log from the previous module ingestion")
            else:
                logger.info("No defined function for curating this data")

        logger.info("Finished curating stage 2 activity data")

    def _create_oea_standard_activity_table(self, year=2021):
        """
        Build the OEA standard activity ("Event") table from Insights activity data.

        Loads the pseudonymized TechActivity delta data for the given year from stage2p,
        renames the selected columns to the Event schema names, adds the constant
        'object' column plus 'year'/'month' partition columns derived from eventTime,
        and overwrites the delta table at self.stage2p.

        Args:
            year: year partition of TechActivity_pseudo to load. Defaults to 2021,
                the value that was previously hard-coded.
        """
        logger.info("Processing activity data from: " + self.stage2p_insightsActivity)

        dfActivity = oea.load_delta(f'stage2p/M365/TechActivity_pseudo/year={year}')

        # Source (Insights) column -> Event table column.
        column_map = {
            'SignalId': 'id',
            'ActorId_pseudonym': 'actor',
            'SignalType': 'action',
            'StartTime': 'eventTime',
            'MeetingDuration': 'details',
            'SchemaVersion': 'version',
            'AppName': 'type',
        }
        df = dfActivity.select([F.col(src).alias(dst) for src, dst in column_map.items()])

        # Every row produced here originates from Insights.
        df = df.withColumn('object', F.lit('MSInsights'))

        # Partition columns derived from the event timestamp.
        df = df.withColumn('year', F.year(F.col('eventTime'))).withColumn('month', F.month(F.col('eventTime')))

        # NOTE(review): this overwrites the table at self.stage2p itself, while
        # _process_activity_stage2p_data reads from self.stage2p + '/_OEA_ActivityEvents/'
        # - confirm which location is intended.
        df.write.save(self.stage2p, format='delta', mode='overwrite', partitionBy=['year', 'month'], overwriteSchema='true')

    def _process_activity_stage2p_data(self):
        """
        Stream the OEA standard activity table from stage2p into stage3p.

        Reads self.stage2p + '/_OEA_ActivityEvents/' as a stream using the
        'ActivityEvents' schema and appends it to the delta table at
        self.stage3p + '/ActivityEvents_pseudo', blocking until the one-shot
        trigger has finished.
        """
        source_path = self.stage2p + '/_OEA_ActivityEvents/'
        logger.info("Processing activity data from: " + source_path)
        activity_spark_schema = oea.to_spark_schema(self.schemas['ActivityEvents'])
        # NOTE(review): _create_oea_standard_activity_table writes delta, but this reads
        # parquet - confirm the intended source format.
        dfActivity = spark.readStream.format('parquet').load(source_path, header='true', schema=activity_spark_schema)

        # Writing out ActivityEvents table to stage 3p
        if not dfActivity.columns:
            logger.info('No data to be written to stage3p')
            return

        # Keep the checkpoint inside the source folder (the original path lacked the '/').
        writer = dfActivity.writeStream.format("delta").outputMode("append").trigger(once=True).option("checkpointLocation", source_path + '_checkpoints_p')

        # 'curationYearMonth' is not part of the ActivityEvents schema; only partition
        # by it when the column is actually present, so the write does not fail.
        if 'curationYearMonth' in dfActivity.columns:
            writer = writer.partitionBy('curationYearMonth')
        else:
            logger.warning("Column 'curationYearMonth' not found; writing ActivityEvents_pseudo without partitioning")

        query = writer.start(self.stage3p + '/ActivityEvents_pseudo')
        query.awaitTermination()   # block until query is terminated, with stop() or with error; A StreamingQueryException will be thrown if an exception occurs.
        logger.info(query.lastProgress)
  
