In [None]:
import datetime

class Intune(BaseOEAModule):
    """OEA module that moves Intune "devices" report data from stage 1 to stage 2.

    Two variants of the devices data are written on every ingest:

    * a plain variant (``devices_pseudo`` / ``devices_lookup``), and
    * an enriched variant (``devices_pseudo_refined`` / ``devices_lookup_refined``)
      that adds ``LastCheckInDate`` and ``AccessOutsideOfSchool``.

    Relies on names supplied by the surrounding notebook environment:
    ``BaseOEAModule``, ``oea``, ``spark``, ``F``, ``logger`` and ``mssparkutils``.
    """

    # Column headers as they appear in the Intune devices CSV export, mapped to
    # the space/hyphen-free names used by the schemas defined in __init__.
    _DEVICES_COLUMN_RENAMES = (
        ('Device name', 'DeviceName'),
        ('Managed by', 'ManagedBy'),
        ('OS version', 'OSVersion'),
        ('Last check-in', 'LastCheckIn'),
        ('Primary user UPN', 'PrimaryUserUPN'),
        ('Device ID', 'DeviceID'),
    )

    def __init__(self, source_folder='intune'):
        """Set up the stage 1 devices path and the schemas for both output variants.

        Args:
            source_folder: folder name passed through to ``BaseOEAModule.__init__``.
        """
        BaseOEAModule.__init__(self, source_folder)

        self.stage1np_devices = self.stage1np + '/devices'

        # Each entry is [column name, data type, operation]; the operation is
        # what gets handed to oea.pseudonymize (presumably 'hash' marks the
        # column to pseudonymize - confirm against the OEA framework).
        common_columns = [['DeviceName', 'string', 'no-op'],
                          ['ManagedBy', 'string', 'no-op'],
                          ['Ownership', 'string', 'no-op'],
                          ['Compliance', 'string', 'no-op'],
                          ['OS', 'string', 'no-op'],
                          ['OSVersion', 'string', 'no-op'],
                          ['LastCheckIn', 'timestamp', 'no-op'],
                          ['PrimaryUserUPN', 'string', 'hash'],
                          ['DeviceID', 'string', 'no-op']]
        enrichment_columns = [['LastCheckInDate', 'date', 'no-op'],
                              ['AccessOutsideOfSchool', 'boolean', 'no-op']]
        partition_column = ['ReportYearMonth', 'string', 'partition-by']

        # Copy the inner lists so the two schemas never share mutable entries.
        self.schemas['devices'] = (
            [list(column) for column in common_columns]
            + [list(partition_column)]
        )
        self.schemas['devices_enriched'] = (
            [list(column) for column in common_columns]
            + [list(column) for column in enrichment_columns]
            + [list(partition_column)]
        )

    def ingest(self):
        """ Processes intune data from stage1 into stage2 using structured streaming. """
        logger.info("Processing intune data from: " + self.stage1np)

        for item in mssparkutils.fs.ls(self.stage1np):
            # this module only provides defined processing for the intune devices report
            if item.name == "devices":
                self._process_intune_devices_stage1_data()
                self._process_intune_devices_stage1_data_for_refined()
            else:
                logger.info("No defined function for processing this queried data")

        logger.info("Finished processing intune data from stage 1 to stage 2")

    def _read_devices_stream(self):
        """Open a streaming read over the stage 1 devices CSVs and normalise column names.

        Returns:
            A streaming DataFrame whose export headers have been renamed per
            ``_DEVICES_COLUMN_RENAMES``.
        """
        # File-based streaming sources need this setting when no explicit schema is given.
        spark.sql("set spark.sql.streaming.schemaInference=true")
        df = spark.readStream.csv(self.stage1np_devices + '/**/*.csv', header='true')
        for export_name, column_name in self._DEVICES_COLUMN_RENAMES:
            df = df.withColumnRenamed(export_name, column_name)
        return df

    @staticmethod
    def _add_report_year_month(df):
        """Add the ``ReportYearMonth`` partition column holding the current 'YYYY-MM'."""
        current_year_month = datetime.datetime.now().strftime('%Y-%m')
        return df.withColumn('ReportYearMonth', F.lit(current_year_month))

    def _write_stream(self, df, destination, checkpoint_name, stage_label, explain=False):
        """Append a streaming DataFrame to a Delta table, partitioned by ``ReportYearMonth``.

        Args:
            df: streaming DataFrame to write.
            destination: path of the Delta table to append to.
            checkpoint_name: checkpoint folder name, created under the stage 1 devices folder.
            stage_label: stage name used in the "nothing to write" log message.
            explain: when True, call ``query.explain()`` after the query finishes.

        Raises:
            StreamingQueryException: propagated from ``awaitTermination`` if the query fails.
        """
        if not df.columns:
            logger.info('No data to be written to ' + stage_label)
            return

        writer = (
            df.writeStream.format("delta")
            .outputMode("append")
            .trigger(once=True)
            .option("checkpointLocation", self.stage1np_devices + '/' + checkpoint_name)
            .partitionBy('ReportYearMonth')
        )
        query = writer.start(destination)
        # Block until the one-shot trigger has finished (or failed).
        query.awaitTermination()
        if explain:
            query.explain()
        logger.info(query.lastProgress)

    def _process_intune_devices_stage1_data(self):
        """ Process intune devices data from stage 1 without data enrichment """
        logger.info("Processing intune devices data without enrichment")

        df = self._add_report_year_month(self._read_devices_stream())
        df_pseudo, df_lookup = oea.pseudonymize(df, self.schemas['devices'])

        self._write_stream(df_pseudo, self.stage2p + '/devices_pseudo', '_checkpoints_p', 'stage2p')
        self._write_stream(df_lookup, self.stage2np + '/devices_lookup', '_checkpoints_np', 'stage2np')

    def _process_intune_devices_stage1_data_for_refined(self):
        """ Processes intune devices data from stage 1 with added columns """
        logger.info("Processing intune devices enriched data")

        df = self._read_devices_stream()

        # LastCheckIn is split on the space into a date part and a time part.
        check_in_parts = F.split(F.col('LastCheckIn'), ' ')
        # NOTE(review): LastCheckInDate is written as the string produced by the
        # split, although the schema declares it as 'date'. Casting it with
        # F.to_date would change the type appended to any existing Delta table,
        # so confirm before introducing the cast.
        df = df.withColumn('LastCheckInDate', check_in_parts.getItem(0))

        # Hour-of-day and day-of-week are only needed to derive the flag below,
        # so they are kept as expressions rather than stored as columns. To keep
        # them in the output, add them with withColumn and extend the schema.
        hour_of_day = F.split(check_in_parts.getItem(1), ':').getItem(0)
        day_of_week = F.date_format(F.col('LastCheckIn'), "E")

        # A device counts as having access outside of school when its last
        # check-in was on a weekend, before 9 AM, or at/after 4 PM. Rows where
        # none of the conditions is true (including null inputs) get "false".
        outside_of_school = (
            day_of_week.isin('Sat', 'Sun')
            | (hour_of_day >= 16)
            | (hour_of_day < 9)
        )
        # NOTE(review): the flag is written as the strings "true"/"false" even
        # though the schema declares it as 'boolean' - kept as-is for
        # compatibility with data already written.
        df = df.withColumn('AccessOutsideOfSchool', F.when(outside_of_school, "true").otherwise("false"))

        df = self._add_report_year_month(df)
        df_pseudo, df_lookup = oea.pseudonymize(df, self.schemas['devices_enriched'])

        self._write_stream(df_pseudo, self.stage2p + '/devices_pseudo_refined', '_checkpoints_p_refined', 'stage2p', explain=True)
        self._write_stream(df_lookup, self.stage2np + '/devices_lookup_refined', '_checkpoints_np_refined', 'stage2np')
