# GraphAPI_py

This OEA Graph API module defines a Python class that provides:
- Data schema definitions
- Data pseudonymization settings
- Data processing for Stage 1np data to Stage 2p and 2np
  * Flattening of JSON files into an expected table format

In [None]:
import datetime

class GraphAPI(BaseOEAModule):
    def __init__(self, source_folder='graph_api'):
        """ Sets up the stage1 source paths and the schema definitions for each Graph API table.

        Every schema entry is a [column name, data type, operation] triple; the
        schemas are handed to oea.pseudonymize by the processing methods of this class.
        """
        BaseOEAModule.__init__(self, source_folder)

        # stage 1 landing folders, one per Graph API query
        stage1_root = self.stage1np
        self.stage1np_graphapi_users = stage1_root + '/users'
        self.stage1np_graphapi_m365 = stage1_root + '/m365_app_user_detail'
        self.stage1np_graphapi_teams = stage1_root + '/teams_activity_user_detail'
        self.stage1np_graphapi_meetings = stage1_root + '/meeting_attendance_report'

        def _no_op_columns(names, dtype):
            # schema entries for columns that all share one data type and the 'no-op' operation
            return [[name, dtype, 'no-op'] for name in names]

        self.schemas['users'] = [
            ['surname', 'string', 'mask'],
            ['givenName', 'string', 'mask'],
            ['userPrincipalName', 'string', 'hash'],
            ['id', 'string', 'mask'],
            ['reportYearMonth', 'string', 'partition-by'],
        ]

        # m365 usage flags: four platform-wide flags, then for every app the app
        # itself followed by its per-platform variants (Mac, Mobile, Web, Windows)
        m365_usage_flags = ['mobile', 'web', 'mac', 'windows']
        for app in ['excel', 'oneNote', 'outlook', 'powerPoint', 'teams', 'word']:
            m365_usage_flags.append(app)
            m365_usage_flags.extend(app + platform for platform in ['Mac', 'Mobile', 'Web', 'Windows'])

        m365_schema = [
            ['reportRefreshDate', 'date', 'no-op'],
            ['userPrincipalName', 'string', 'hash'],
            ['lastActivationDate', 'date', 'no-op'],
            ['lastActivityDate', 'date', 'no-op'],
            ['reportPeriod', 'string', 'no-op'],
        ]
        m365_schema.extend(_no_op_columns(m365_usage_flags, 'boolean'))
        m365_schema.append(['reportYearMonth', 'string', 'partition-by'])
        self.schemas['m365'] = m365_schema

        # teams activity: a few descriptive columns, a long run of integer metrics, then the tail
        teams_metrics = [
            'privateChatMessageCount',
            'teamChatMessageCount',
            'meetingsAttendedCount',
            'meetingCount',
            'meetingsOrganizedCount',
            'callCount',
            'audioDuration',
            'videoDuration',
            'screenShareDuration',
            'scheduledOneTimeMeetingsAttendedCount',
            'scheduledOneTimeMeetingsOrganizedCount',
            'scheduledRecurringMeetingsAttendedCount',
            'scheduledRecurringMeetingsOrganizedCount',
            'adHocMeetingsAttendedCount',
            'adHocMeetingsOrganizedCount',
        ]
        teams_schema = [
            ['reportRefreshDate', 'date', 'no-op'],
            ['lastActivityDate', 'date', 'no-op'],
            ['deletedDate', 'string', 'no-op'],
            ['isDeleted', 'boolean', 'no-op'],
            ['isLicensed', 'boolean', 'no-op'],
            ['reportPeriod', 'string', 'no-op'],
            ['userPrincipalName', 'string', 'hash'],
        ]
        teams_schema.extend(_no_op_columns(teams_metrics, 'integer'))
        teams_schema.extend([
            ['assignedProducts', 'string', 'no-op'],
            ['hasOtherAction', 'boolean', 'no-op'],
            ['reportYearMonth', 'string', 'partition-by'],
        ])
        self.schemas['teams'] = teams_schema

        self.schemas['meetings'] = [
            ['meetingId', 'string', 'no-op'],
            ['totalParticipantCount', 'integer', 'no-op'],
            ['meetingStartDateTime', 'timestamp', 'no-op'],
            ['meetingEndDateTime', 'timestamp', 'no-op'],
            ['userEmailAddress', 'string', 'hash'],
            ['totalAttendanceInSec', 'integer', 'no-op'],
            ['role', 'string', 'no-op'],
            ['userId', 'string', 'hash'],
            ['userDisplayName', 'string', 'mask'],
            ['userTenantId', 'string', 'no-op'],
            ['attendanceInterval_joinDateTime', 'timestamp', 'no-op'],
            ['attendanceInterval_leaveDateTime', 'timestamp', 'no-op'],
            ['attendanceInterval_durationInSec', 'integer', 'no-op'],
            ['year', 'integer', 'partition-by'],
        ]
    
    def ingest(self):
        """ Walks the stage1 folder and runs the matching stage1 -> stage2 processing method for each entry found. """
        logger.info("Processing microsoft_graph data from: " + self.stage1np)

        # stage1 folder name -> method that knows how to process that query's data
        handlers = {
            "users": self._process_graphapi_users_stage1_data,
            "m365_app_user_detail": self._process_graphapi_m365_stage1_data,
            "teams_activity_user_detail": self._process_graphapi_teams_stage1_data,
            "meeting_attendance_report": self._process_graphapi_meetings_stage1_data,
        }

        for entry in mssparkutils.fs.ls(self.stage1np):
            handler = handlers.get(entry.name)
            if handler is None:
                # anything else in the folder is only logged, not processed
                logger.info("No defined function for processing this queried data")
            else:
                handler()

        logger.info("Finished processing graphapi data from stage 1 to stage 2")

    def _process_graphapi_users_stage1_data(self):
        """ Streams the stage1 users JSON into the stage2p (pseudonymized) and stage2np (lookup) delta tables. """
        source_path = self.stage1np_graphapi_users
        logger.info("Processing microsoft_graph users data from: " + source_path)

        spark.sql("set spark.sql.streaming.schemaInference=true")
        # each raw file wraps the user records in a "value" array; turn every element into its own row
        raw = spark.readStream.format('json').load(source_path + '/*/*.json', header='true')
        users = raw.select(F.explode('value').alias('exploded_values')).select("exploded_values.*")

        # stamp every row with the processing year-month; the stage 2 folders are partitioned on it
        year_month = datetime.datetime.now().strftime('%Y-%m')
        users = users.withColumn('ReportYearMonth', F.lit(year_month))

        # NOTE: the spark schema is built here but not used by the rest of this method
        users_spark_schema = oea.to_spark_schema(self.schemas['users'])
        df_pseudo, df_lookup = oea.pseudonymize(users, self.schemas['users'])

        if df_pseudo.columns:
            pseudo_writer = (
                df_pseudo.writeStream
                .format("delta")
                .outputMode("append")
                .trigger(once=True)
                .option("checkpointLocation", source_path + '/_checkpoints_p')
                .partitionBy('ReportYearMonth')
            )
            pseudo_query = pseudo_writer.start(self.stage2p + '/users_pseudo')
            # blocks until the query stops; a StreamingQueryException is raised if the query failed
            pseudo_query.awaitTermination()
        else:
            logger.info('No data to be written to stage2p')

        if df_lookup.columns:
            lookup_writer = (
                df_lookup.writeStream
                .format("delta")
                .outputMode("append")
                .trigger(once=True)
                .option("checkpointLocation", source_path + '/_checkpoints_np')
                .partitionBy('ReportYearMonth')
            )
            lookup_query = lookup_writer.start(self.stage2np + '/users_lookup')
            # blocks until the query stops; a StreamingQueryException is raised if the query failed
            lookup_query.awaitTermination()
        else:
            logger.info('No data to be written to stage2np')

    def _process_graphapi_m365_stage1_data(self):
        """ Processes m365 data from stage1 into stage2 using structured streaming.

        Flattens the "value" and "details" arrays of the raw JSON, converts the
        date columns to date types, adds the ReportYearMonth partition column,
        pseudonymizes using self.schemas['m365'] and appends the results to the
        stage2p and stage2np delta tables.
        """
        logger.info("Processing microsoft_graph m365 data from: " + self.stage1np_graphapi_m365)
        
        spark.sql("set spark.sql.streaming.schemaInference=true")
        # read in the raw data, and explode the "value" and "details" arrays
        df = spark.readStream.format('json').load(self.stage1np_graphapi_m365 + '/*/*.json', header='true')
        df = df.select(F.explode('value').alias('exploded_values')).select("exploded_values.*")

        # fields to lift out of each "details" element, in the order they are appended as columns
        detail_fields = ['reportPeriod', 'mobile', 'web', 'mac', 'windows']
        for app in ('excel', 'oneNote', 'outlook', 'powerPoint', 'teams', 'word'):
            detail_fields += [app, app + 'Mobile', app + 'Web', app + 'Mac', app + 'Windows']

        # Explode "details" ONCE and read every field from the resulting struct.
        # Exploding each field separately (as was done before) yields a cartesian
        # product of rows whenever "details" holds more than one element.
        df = df.withColumn('_detail', F.explode(F.col('details'))).drop('details')
        for field in detail_fields:
            df = df.withColumn(field, F.col('_detail')[field])
        df = df.drop('_detail')

        # change columns with dates to be of date types
        # (the converted column has to be assigned back, otherwise the conversion is discarded)
        # NOTE(review): if the stage 2 delta tables already exist with these columns stored
        # as strings, the append may be rejected for a schema mismatch - confirm before deploying.
        for date_column in ('reportRefreshDate', 'lastActivityDate', 'lastActivationDate'):
            df = df.withColumn(date_column, F.to_date(F.col(date_column), 'yyyy-MM-dd'))

        # grab the current date for partitioning the data later (in stage 2 folders)
        currentDate = datetime.datetime.now()
        currentYearMonth = currentDate.strftime('%Y-%m')
        # create a new column for partitioning the folder structure
        df = df.withColumn('ReportYearMonth', F.lit(currentYearMonth))
        # pseudonymize according to the m365 schema
        df_pseudo, df_lookup = oea.pseudonymize(df, self.schemas['m365'])

        if len(df_pseudo.columns) == 0:
            logger.info('No data to be written to stage2p')
        else:
            query = df_pseudo.writeStream.format("delta").outputMode("append").trigger(once=True).option("checkpointLocation", self.stage1np_graphapi_m365 + '/_checkpoints_p').partitionBy('ReportYearMonth')
            query = query.start(self.stage2p + '/m365_app_user_detail_pseudo')
            query.awaitTermination()   # block until query is terminated, with stop() or with error; A StreamingQueryException will be thrown if an exception occurs.
        
        if len(df_lookup.columns) == 0:
            logger.info('No data to be written to stage2np')
        else:
            query2 = df_lookup.writeStream.format("delta").outputMode("append").trigger(once=True).option("checkpointLocation", self.stage1np_graphapi_m365 + '/_checkpoints_np').partitionBy('ReportYearMonth')
            query2 = query2.start(self.stage2np + '/m365_app_user_detail_lookup')
            query2.awaitTermination()   # block until query is terminated, with stop() or with error; A StreamingQueryException will be thrown if an exception occurs.

    def _process_graphapi_teams_stage1_data(self):
        """ Processes teams data from stage1 into stage2 using structured streaming.

        Flattens the "value" and "assignedProducts" arrays of the raw JSON,
        converts the duration columns to whole seconds and the date columns to
        date types, adds the ReportYearMonth partition column, pseudonymizes
        using self.schemas['teams'] and appends the results to the stage2p and
        stage2np delta tables.
        """
        logger.info("Processing microsoft_graph teams data from: " + self.stage1np_graphapi_teams)

        spark.sql("set spark.sql.streaming.schemaInference=true")
        # read in the raw data, and explode the "value" and "assignedProducts" arrays 
        df = spark.readStream.format('json').load(self.stage1np_graphapi_teams + '/*/*.json', header='true')
        df = df.select(F.explode('value').alias('exploded_values')).select("exploded_values.*")
        # NOTE(review): explode() emits no row for a null or empty array, so records without
        # any assignedProducts do not reach stage 2 - confirm that this is intended.
        df = df.withColumn('assignedProducts', F.explode(F.col('assignedProducts')))

        def duration_in_seconds(column_name):
            """ Builds a column expression converting a duration string such as 'PT1H2M3S' into seconds. """
            def component(designator):
                # a missing component extracts as '' which casts to null, hence the coalesce to 0
                return F.coalesce(F.regexp_extract(column_name, r'(\d+)' + designator, 1).cast('int'), F.lit(0))
            # the day component (e.g. 'P1DT2H') is included so that durations of 24 hours or more are not truncated
            return component('D') * 86400 + component('H') * 3600 + component('M') * 60 + component('S')

        # convert duration to seconds only 
        # NOTE: The duration expression may have changed and this will need to be modified to accommodate any new duration formatting
        for duration_column in ('screenShareDuration', 'videoDuration', 'audioDuration'):
            df = df.withColumn(duration_column, duration_in_seconds(duration_column))

        # change columns with dates to be of date types
        # (the converted column has to be assigned back, otherwise the conversion is discarded)
        # NOTE(review): if the stage 2 delta tables already exist with these columns stored
        # as strings, the append may be rejected for a schema mismatch - confirm before deploying.
        for date_column in ('reportRefreshDate', 'lastActivityDate'):
            df = df.withColumn(date_column, F.to_date(F.col(date_column), 'yyyy-MM-dd'))
        # deletedDate is deliberately left unconverted (it is null in the test data and is
        # declared as a string in self.schemas['teams']); convert it here as well when using actual data.

        # grab the current date for partitioning the data later (in stage 2 folders)
        currentDate = datetime.datetime.now()
        currentYearMonth = currentDate.strftime('%Y-%m')
        # create a new column for partitioning the folder structure
        df = df.withColumn('ReportYearMonth', F.lit(currentYearMonth))
        # pseudonymize according to the teams schema
        df_pseudo, df_lookup = oea.pseudonymize(df, self.schemas['teams'])

        if len(df_pseudo.columns) == 0:
            logger.info('No data to be written to stage2p')
        else:
            query = df_pseudo.writeStream.format("delta").outputMode("append").trigger(once=True).option("checkpointLocation", self.stage1np_graphapi_teams + '/_checkpoints_p').partitionBy('ReportYearMonth')
            query = query.start(self.stage2p + '/teams_activity_user_detail_pseudo')
            query.awaitTermination()   # block until query is terminated, with stop() or with error; A StreamingQueryException will be thrown if an exception occurs.
        
        if len(df_lookup.columns) == 0:
            logger.info('No data to be written to stage2np')
        else:
            query2 = df_lookup.writeStream.format("delta").outputMode("append").trigger(once=True).option("checkpointLocation", self.stage1np_graphapi_teams + '/_checkpoints_np').partitionBy('ReportYearMonth')
            query2 = query2.start(self.stage2np + '/teams_activity_user_detail_lookup')
            query2.awaitTermination()   # block until query is terminated, with stop() or with error; A StreamingQueryException will be thrown if an exception occurs.

    def _process_graphapi_meetings_stage1_data(self):
        """ Streams the stage1 meeting attendance report JSON into the stage2p and stage2np delta tables, partitioned by year. """
        source_path = self.stage1np_graphapi_meetings
        logger.info("Processing microsoft_graph meetings data from: " + source_path)

        spark.sql("set spark.sql.streaming.schemaInference=true")
        # NOTE: the multiLine option is currently set as true for the test data, if this does not match the test data, remove this parameter
        raw = spark.readStream.format('json').load(source_path + '/*/*.json', header='true', multiLine='true')

        # Flatten the nested JSON in three passes. Only the columns named below are
        # carried forward, so anything else in the raw file (e.g. '@odata.context') falls away.

        # pass 1: one row per attendance record
        meeting_cols = ["id", "meetingEndDateTime", "meetingStartDateTime", "totalParticipantCount"]
        per_record = raw.select(*meeting_cols, F.explode("attendanceRecords").alias("attendanceRecordsExplode"))
        per_record = per_record.select(*meeting_cols, "attendanceRecordsExplode.*")
        # the meeting's "id" is renamed right away so it cannot clash with the identity "id" unpacked next
        per_record = per_record.withColumnRenamed("id", "meetingId")

        # pass 2: unpack the "identity" struct into top-level columns
        record_cols = [
            "meetingId", "meetingEndDateTime", "meetingStartDateTime", "totalParticipantCount",
            "totalAttendanceInSeconds", "role", "emailAddress",
        ]
        with_identity = per_record.select(
            *record_cols, "attendanceIntervals",
            F.explode(F.array("identity")).alias("identityExplode"),
        )
        with_identity = with_identity.select(*record_cols, "attendanceIntervals", "identityExplode.*")

        # pass 3: one row per attendance interval
        identity_cols = ["displayName", "id", "tenantId"]
        per_interval = with_identity.select(
            *record_cols, *identity_cols,
            F.explode("attendanceIntervals").alias("attendanceIntervalsExplode"),
        )
        per_interval = per_interval.select(*record_cols, "attendanceIntervalsExplode.*", *identity_cols)

        # clean up the column names
        renames = [
            ("id", "userId"),
            ("displayName", "userDisplayName"),
            ("emailAddress", "userEmailAddress"),
            ("totalAttendanceInSeconds", "totalAttendanceInSec"),
            ("tenantId", "userTenantId"),
            ("joinDateTime", "attendanceInterval_joinDateTime"),
            ("leaveDateTime", "attendanceInterval_leaveDateTime"),
            ("durationInSeconds", "attendanceInterval_durationInSec"),
        ]
        for old_name, new_name in renames:
            per_interval = per_interval.withColumnRenamed(old_name, new_name)

        # ... and the data types of the date-time columns
        timestamp_cols = [
            'meetingStartDateTime',
            'meetingEndDateTime',
            'attendanceInterval_joinDateTime',
            'attendanceInterval_leaveDateTime',
        ]
        for ts_col in timestamp_cols:
            per_interval = per_interval.withColumn(ts_col, F.to_timestamp(F.col(ts_col)))

        # the year of meetingEndDateTime is the partition column of the stage 2 tables
        meetings = per_interval.withColumn('year', F.year(F.col('meetingEndDateTime')))
        df_pseudo, df_lookup = oea.pseudonymize(meetings, self.schemas['meetings'])

        if df_pseudo.columns:
            pseudo_writer = (
                df_pseudo.writeStream
                .format("delta")
                .outputMode("append")
                .trigger(once=True)
                .option("checkpointLocation", source_path + '/_checkpoints_p')
                .partitionBy('year')
            )
            pseudo_query = pseudo_writer.start(self.stage2p + '/meeting_attendance_report_pseudo')
            pseudo_query.awaitTermination()
        else:
            logger.info('No data to be written to stage2p')

        if df_lookup.columns:
            lookup_writer = (
                df_lookup.writeStream
                .format("delta")
                .outputMode("append")
                .trigger(once=True)
                .option("checkpointLocation", source_path + '/_checkpoints_np')
                .partitionBy('year')
            )
            lookup_query = lookup_writer.start(self.stage2np + '/meeting_attendance_report_lookup')
            lookup_query.awaitTermination()
        else:
            logger.info('No data to be written to stage2np')
        