# Digital Activity Schema Standard Class Notebook

This schema standardization class notebook outlines the 3 functions needed to process module schemas into the OEA digital activity schema standard.

These 3 functions are:

 - get_digital_activity_schema - which defines what the digital activity standard consists of, and how to map other data sources to this OEA standard.
 - reset_digital_activity_processing - which deletes the "digital_activity" folder from stage 2p, allowing you to restart the schema standardization process from scratch.
 - process_digital_activity - which takes in the user-defined schema mapping and source folder, runs the standardization process, and writes the standardized data back to stage 2p.

 Any custom/additional data source schema mappings should be defined here.

In [None]:
class DigitalActivity(BaseOEAModule):
    """Standardize module activity data into the OEA digital activity schema.

    Currently this class only contains processing for stage2p data:
     - reads activity data from a stage2p delta table,
     - maps the source columns onto the ``ActivityEvents`` schema,
     - appends the standardized rows to the stage2p "digital_activity" table.

    The class uses ``oea``, ``spark``, ``logger``, ``F`` and ``StringType``
    without defining them; they are expected to be provided by the
    surrounding notebook session.
    """

    def __init__(self, source_folder='digital_activity'):
        """Register the ActivityEvents schema and its column descriptions.

        source_folder: folder name passed through to ``BaseOEAModule``.
        """
        super().__init__(oea, source_folder)

        # NOTE(review): plain string concatenation - this assumes self.stage2p
        # (set by BaseOEAModule) already ends with a path separator; confirm.
        self.stage2p_digitalActivity = self.stage2p + 'digital_activity'

        # Each entry is [column name, data type, operation]; every column of the
        # standard is a string column with the 'no-op' operation.
        self.schemas['ActivityEvents'] = [
            ['event_id', 'string', 'no-op'],
            ['event_type', 'string', 'no-op'],
            ['event_actor', 'string', 'no-op'],
            ['event_object', 'string', 'no-op'],
            ['event_eventTime', 'string', 'no-op'],
            ['entity_type', 'string', 'no-op'],
            ['softwareApplication_version', 'string', 'no-op'],
            ['generated_aggregateMeasure_metric_timeOnTask', 'string', 'no-op'],
            ['generated_aggregateMeasure_metric_numAccess', 'string', 'no-op'],
            ['generated_aggregateMeasure_metric_used', 'string', 'no-op'],
            ['generated_aggregateMeasure_metric_activityReportPeriod', 'string', 'no-op'],
        ]

        # Human-readable description of each column, printed by
        # get_digital_activity_schema. The first entry names the source standard.
        # NOTE(review): 'entity_type' carries the same description as
        # 'softwareApplication_version' - looks like a copy/paste; confirm the
        # intended wording before changing the text.
        self.schemasDetail = {}
        self.schemasDetail['ActivityEvents'] = [
            ['schema_source', 'https://www.imsglobal.org/spec/caliper/v1p2#tooluseevent'],
            ['event_id', 'unique ID used as a signal key'],
            ['event_type', 'type of activity event'],
            ['event_actor', 'student or teacher that created the signal'],
            ['event_object', 'entity that comprises the object of the interaction'],
            ['event_eventTime', 'date/timestamp of the activity signal'],
            ['entity_type', 'value that describes the properties of the user agent hosting this SoftwareApplication.'],
            ['softwareApplication_version', 'value that describes the properties of the user agent hosting this SoftwareApplication.'],
            ['generated_aggregateMeasure_metric_timeOnTask', 'time on task in seconds'],
            ['generated_aggregateMeasure_metric_numAccess', 'number of accesses'],
            ['generated_aggregateMeasure_metric_used', 'used true or false'],
            ['generated_aggregateMeasure_metric_activityReportPeriod', 'activity data collected is reported over this number of days'],
        ]

    def get_digital_activity_schema(self):
        """Print the digital activity schema.

        Prints the ActivityEvents columns with their data types, followed by
        the description of each column. Use this to align the columns of an
        activity data source with the standard. Returns nothing.
        """
        print("OEA Standard Digital Activity Schema:\n")

        print("Columns and data types:\n")
        for var in self.schemas['ActivityEvents']:
            print(var)

        print("\nColumn descriptions:\n")
        for var in self.schemasDetail['ActivityEvents']:
            print(var)

    def reset_digital_activity_processing(self):
        """Reset all data. Intended for use during initial testing - use with caution.

        Deletes whatever is stored at ``self.stage2p_digitalActivity``.
        """
        # NOTE(review): process_digital_activity writes to
        # oea.path('stage2p', directory_path="digital_activity"), not to
        # self.stage2p_digitalActivity - confirm both resolve to the same
        # location, otherwise this reset does not remove the written table.
        oea.rm_if_exists(self.stage2p_digitalActivity)
        logger.info(f"Deleted {self.stage2p_digitalActivity}")

    def process_digital_activity(self, source_path, schemaMapping):
        """Process digital activity data into the standardized ActivityEvents table.

        - source_path: storage location and directory name of the source delta
          table (ie stage2p/M365/TechActivity_pseudo)
        - schemaMapping: rows of [schema column, source column] mapping the
          source table onto the ActivityEvents schema; use NULL (None) as the
          source when there is no mapping - such rows are ignored.

        The "event_object" row is special: its source value is written as a
        literal into every output row instead of naming a source column. When
        it is missing or NULL the event_object column is left out, like any
        other unmapped column.

        "event_eventTime" has to be mapped, because the 'year' and 'month'
        partition columns are derived from it.

        Every mapped column is cast to string and the result is appended to the
        stage2p "digital_activity" delta table, partitioned by year and month.
        """
        logger.info("Processing digital activity data from: " + source_path)

        dfActivity = oea.load_delta(source_path)

        # Build the mapping through Spark so the same inputs are accepted as
        # before, but collect it once instead of once per lookup.
        dfMapping = spark.createDataFrame(schemaMapping, schema=["schema", "source"])
        mappingRows = dfMapping.na.drop("any").collect()

        eventObject = None
        mappedColumns = []
        for schemaCol, sourceCol in mappingRows:
            if schemaCol == "event_object":
                # The first event_object row wins; further ones are ignored.
                if eventObject is None:
                    eventObject = sourceCol
            else:
                # Select, cast and rename in a single projection: renaming one
                # column at a time becomes ambiguous when a source column name
                # equals another mapping's schema name, or when one source
                # column feeds several schema columns.
                mappedColumns.append(dfActivity[sourceCol].cast(StringType()).alias(schemaCol))

        df = dfActivity.select(mappedColumns)

        if eventObject is not None:
            df = df.withColumn('event_object', F.lit(eventObject))

        eventTime = F.col('event_eventTime')
        df = df.withColumn('year', F.year(eventTime)).withColumn('month', F.month(eventTime))

        # mergeSchema lets sources that map different subsets of the standard
        # append to the same table.
        df.write.save(
            oea.path('stage2p', directory_path="digital_activity"),
            format='delta',
            mode='append',
            partitionBy=['year', 'month'],
            mergeSchema='true',
        )

        logger.info("Complete processing from: " + source_path)
