# Graph Module Ingestion - Schema Correction

This notebook demonstrates the utility of the OEA_py class notebook while correcting Graph module tables that were initially ingested un-flattened, with incorrect column data types, and without unique primary keys.

Tables are read from ```stage2/Ingested/graph_api/(beta or v1.0)``` and written out, with the corrected schema, to ```stage2/Ingested_Corrected/graph_api/(beta or v1.0)```

The steps outlined below describe how this notebook is used to correct the Microsoft Graph Reports API module tables:
- Set the workspace for where the table schemas are to be corrected. 
- 5 functions are defined and used:
   1. **_correct_users_table**: flattens the users table.
   2. **_correct_m365_table**: flattens, corrects some column dtypes, and adds a primary key per row for the m365_app_user_detail table.
   3. **_correct_teams_table**: flattens, corrects some column dtypes, and adds a primary key per row for the teams_activity_user_detail table.
   4. **_correct_meetings_table**: flattens, corrects some column dtypes, and adds a primary key per row for the meeting_attendance_report table.
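   5. **correct_graph_dataset**: extracts the names of all the folders currently stored in the given source path (e.g. stage2/Ingested/graph_api/beta), corrects the schema per table using the functions above, and writes (appends) the corrected tables to the matching folder under the given destination path (e.g. stage2/Ingested_Corrected/graph_api/beta).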

In [1]:
# Name of the OEA workspace to operate in; passed to oea.set_workspace() in a later cell.
workspace = 'dev'
# Which test data set is being processed: 'k12' or 'hed'.
# The final processing cell corrects only the Graph API 'beta' folder for 'k12',
# and both the 'beta' and 'v1.0' folders for 'hed'.
testdataSet = 'hed'

In [2]:
%run OEA_py

In [3]:
# 1) Set the workspace (this determines where in the data lake you'll be writing to and reading from).
# You can work in 'dev', 'prod', or a sandbox with any name you choose.
# For example, Sam the developer can create a 'sam' workspace and expect to find his datasets
# in the data lake under oea/sandboxes/sam.
oea.set_workspace(workspace)

In [8]:
# 2) Schema correction: the Graph data as initially landed is unstructured (nested arrays), has incorrect column dtypes, and lacks primary keys for 3 of the 4 tables.
def _correct_users_table(df):
    """Flatten the users table.

    Produces one row per element of the ``value`` column and promotes the
    fields of each element to top-level columns.

    :param df: DataFrame holding the raw users data with a ``value`` column.
    :return: the flattened DataFrame.
    """
    exploded = df.select(F.explode('value').alias('exploded_values'))
    return exploded.select("exploded_values.*")

def _correct_m365_table(df):
    """Flatten the m365_app_user_detail table, add a primary key and fix date dtypes.

    Steps:
      1. Explode the ``value`` array into one row per element and promote the
         element's fields to top-level columns.
      2. Pull every usage field out of the nested ``details`` column into its
         own top-level column, then drop ``details``.
      3. Add ``m365Activity_pk`` = ``<userPrincipalName>_<reportRefreshDate>``.
      4. Convert ``reportRefreshDate``, ``lastActivityDate`` and
         ``lastActivationDate`` to date columns (format ``yyyy-MM-dd``).

    :param df: DataFrame holding the raw report with a ``value`` column.
    :return: the flattened DataFrame with the corrected schema.
    """
    df_flat = df.select(F.explode('value').alias('exploded_values')).select("exploded_values.*")

    # Fields to lift out of `details`, in the order the new columns are appended:
    # the report period, the per-platform flags, then each app with its per-platform variants.
    detail_fields = ['reportPeriod', 'mobile', 'web', 'mac', 'windows']
    for app in ('excel', 'oneNote', 'outlook', 'powerPoint', 'teams', 'word'):
        detail_fields.extend(app + suffix for suffix in ('', 'Mobile', 'Web', 'Mac', 'Windows'))

    for field in detail_fields:
        # details[field] is exploded so the new column holds a single value per row
        df_flat = df_flat.withColumn(field, F.explode(F.col('details')[field]))
    df_flat = df_flat.drop('details')

    # temp: add unique primary key per row, by combining UPNs with the date the report was generated
    # this assumes every person has only one row per reportRefreshDate
    # (built before the date conversion below, so the key keeps the raw date text)
    df_flat = df_flat.withColumn('m365Activity_pk', F.concat(F.col('userPrincipalName'), F.lit('_'), F.col('reportRefreshDate')))

    # Convert the date columns in place. Previously the to_date() expressions were built
    # inside select() calls whose results were discarded, so the columns were never converted.
    for date_column in ('reportRefreshDate', 'lastActivityDate', 'lastActivationDate'):
        df_flat = df_flat.withColumn(date_column, F.to_date(F.col(date_column), 'yyyy-MM-dd'))
    return df_flat

def _correct_teams_table(df):
    """Flatten the teams_activity_user_detail table, add a primary key and fix dtypes.

    Steps:
      1. Explode the ``value`` array into one row per element and promote the
         element's fields to top-level columns.
      2. Explode ``assignedProducts`` (one row per assigned product).
      3. Convert ``screenShareDuration``, ``videoDuration`` and ``audioDuration``
         from their ``<n>H<n>M<n>S``-style text to a number of seconds.
      4. Add ``teamsActivity_pk`` = ``<userPrincipalName>_<reportRefreshDate>``.
      5. Convert ``reportRefreshDate`` and ``lastActivityDate`` to date columns
         (format ``yyyy-MM-dd``).

    :param df: DataFrame holding the raw report with a ``value`` column.
    :return: the flattened DataFrame with the corrected schema.
    """
    def _duration_in_seconds(column_name):
        """Build a Column expression turning the duration text in *column_name* into seconds."""
        def _component(unit):
            # digits immediately preceding the unit letter; a missing component is coalesced to 0
            return F.coalesce(F.regexp_extract(column_name, r'(\d+)' + unit, 1).cast('int'), F.lit(0))
        return _component('H') * 3600 + _component('M') * 60 + _component('S')

    df_flat = df.select(F.explode('value').alias('exploded_values')).select("exploded_values.*")
    df_flat = df_flat.withColumn('assignedProducts', F.explode(F.col('assignedProducts')))

    # convert duration to seconds only
    # NOTE: The duration expression may have changed and this will need to be modified to accommodate any new duration formatting
    for duration_column in ('screenShareDuration', 'videoDuration', 'audioDuration'):
        df_flat = df_flat.withColumn(duration_column, _duration_in_seconds(duration_column))

    # temp: add unique primary key per row, by combining UPNs with the date the report was generated
    # this assumes every person has only one row per reportRefreshDate
    # (built before the date conversion below, so the key keeps the raw date text)
    df_flat = df_flat.withColumn('teamsActivity_pk', F.concat(F.col('userPrincipalName'), F.lit('_'), F.col('reportRefreshDate')))

    # Convert the date columns in place. Previously the to_date() expressions were built
    # inside select() calls whose results were discarded, so the columns were never converted.
    for date_column in ('reportRefreshDate', 'lastActivityDate'):
        df_flat = df_flat.withColumn(date_column, F.to_date(F.col(date_column), 'yyyy-MM-dd'))
    return df_flat

def _correct_meetings_table(df):
    """Flatten the meeting_attendance_report table, fix timestamps and add a primary key.

    The nested ``attendanceRecords`` array, the ``identity`` struct inside each
    record, and the ``attendanceIntervals`` array inside each record are expanded
    in turn, so the result has one row per meeting / attendee / attendance interval.
    Columns are then renamed, the four date-time columns are converted to
    timestamps, and ``meetingUserId_pk`` (meeting id followed by user id) is added.

    :param df: DataFrame holding the raw meeting attendance reports.
    :return: the flattened DataFrame with the corrected schema and column order.
    """
    meeting_cols = ["meetingEndDateTime", "meetingStartDateTime", "totalParticipantCount"]

    # 1) one row per attendance record; the meeting's own id becomes meetingId
    records = df.select("id", *meeting_cols, F.explode("attendanceRecords").alias("attendanceRecordsExplode"))
    records = records.select("id", *meeting_cols, "attendanceRecordsExplode.*")
    records = records.withColumnRenamed("id", "meetingId")

    record_cols = ["meetingId"] + meeting_cols + ["totalAttendanceInSeconds", "role", "emailAddress"]
    identity_cols = ["displayName", "id", "tenantId"]

    # 2) promote the fields of the identity struct to top-level columns
    with_identity = records.select(
        *record_cols, "attendanceIntervals",
        F.explode(F.array("identity")).alias("identityExplode"))
    with_identity = with_identity.select(*record_cols, "attendanceIntervals", "identityExplode.*")

    # 3) one row per attendance interval
    intervals = with_identity.select(
        *record_cols, *identity_cols,
        F.explode("attendanceIntervals").alias("attendanceIntervalsExplode"))
    result = intervals.select(*record_cols, "attendanceIntervalsExplode.*", *identity_cols)

    # clean up column names and timestamp types
    renames = [
        ("id", "userId"),
        ("displayName", "userDisplayName"),
        ("emailAddress", "userEmailAddress"),
        ("totalAttendanceInSeconds", "totalAttendanceInSec"),
        ("tenantId", "userTenantId"),
        ("joinDateTime", "attendanceInterval_joinDateTime"),
        ("leaveDateTime", "attendanceInterval_leaveDateTime"),
        ("durationInSeconds", "attendanceInterval_durationInSec"),
    ]
    for old_name, new_name in renames:
        result = result.withColumnRenamed(old_name, new_name)

    timestamp_cols = [
        'meetingStartDateTime',
        'meetingEndDateTime',
        'attendanceInterval_joinDateTime',
        'attendanceInterval_leaveDateTime',
    ]
    for ts_col in timestamp_cols:
        result = result.withColumn(ts_col, F.to_timestamp(F.col(ts_col)))

    # temp: add unique primary key per row, by combining meeting IDs with user IDs
    # this assumes every person attending a meeting has only one attendance interval
    result = result.withColumn('meetingUserId_pk', F.concat(F.col('meetingId'), F.col('userId')))

    final_order = [
        'meetingUserId_pk', 'meetingId', 'meetingStartDateTime', 'meetingEndDateTime',
        'totalParticipantCount', 'userId', 'userDisplayName', 'userEmailAddress',
        'userTenantId', 'role', 'totalAttendanceInSec', 'attendanceInterval_joinDateTime',
        'attendanceInterval_leaveDateTime', 'attendanceInterval_durationInSec',
    ]
    return result.select(*final_order)

def correct_graph_dataset(tables_source, write_destination):
    """Correct the schema of every known Graph table found under *tables_source*.

    Each folder returned by ``oea.get_folders(tables_source)`` is handled by name:
    ``metadata.csv`` is skipped, the four known tables are read as a Delta stream,
    passed through their ``_correct_*_table`` function and appended (run-once
    trigger) to ``write_destination/<table>``; anything else is only logged.

    The streaming checkpoint for each table is kept at
    ``<source table path>/_checkpoints``.

    :param tables_source: path of the folder holding the ingested tables,
        e.g. ``'stage2/Ingested/graph_api/beta'``.
    :param write_destination: path of the folder the corrected tables are written to,
        e.g. ``'stage2/Ingested_Corrected/graph_api/beta'``.
    """
    # table (folder) name -> function that corrects that table's schema
    correctors = {
        'users': _correct_users_table,
        'm365_app_user_detail': _correct_m365_table,
        'teams_activity_user_detail': _correct_teams_table,
        'meeting_attendance_report': _correct_meetings_table,
    }

    for item in oea.get_folders(tables_source):
        table_path = tables_source + '/' + item
        if item == 'metadata.csv':
            logger.info('ignore metadata processing, since this is not a table to be ingested')
            continue

        correct_table = correctors.get(item)
        if correct_table is None:
            logger.info('No defined function for table: ' + item)
            continue

        spark.sql("set spark.sql.streaming.schemaInference=true")
        source_stream = spark.readStream.format('delta').load(oea.to_url(table_path))
        df_corrected = correct_table(source_stream)
        writer = (
            df_corrected.writeStream
            .format('delta')
            .outputMode('append')
            .trigger(once=True)  # process what is currently available, then stop
            .option('checkpointLocation', oea.to_url(table_path) + '/_checkpoints')
        )
        query = writer.start(oea.to_url(write_destination + '/' + item))
        query.awaitTermination()
        logger.info('Successfully corrected the ' + item + ' table from: ' + table_path)

In [None]:
# 3) Run the schema correction for the selected test data set.
# Each entry lists the (source, destination) folder pairs to correct for that data set.
graph_paths_by_dataset = {
    'k12': [
        ('stage2/Ingested/graph_api/beta', 'stage2/Ingested_Corrected/graph_api/beta'),
    ],
    'hed': [
        ('stage2/Ingested/graph_api/beta', 'stage2/Ingested_Corrected/graph_api/beta'),
        ('stage2/Ingested/graph_api/v1.0', 'stage2/Ingested_Corrected/graph_api/v1.0'),
    ],
}

if testdataSet in graph_paths_by_dataset:
    for source_path, destination_path in graph_paths_by_dataset[testdataSet]:
        correct_graph_dataset(source_path, destination_path)
    logger.info('Finished schema correction for Graph dataset')
else:
    logger.info('Unrecognized testdataSet - please choose either k12 or hed.')

In [7]:
# Read back the corrected v1.0 meeting_attendance_report table and preview its first 10 rows.
# NOTE(review): `header='true'` looks like a CSV reader option; presumably it has no effect
# on a Delta read — confirm and drop it if so.
df = spark.read.format('delta').load(oea.to_url('stage2/Ingested_Corrected/graph_api/v1.0/meeting_attendance_report'), header='true')
display(df.limit(10))

In [5]:
# Print the schema of `df` to check the corrected column names and types.
df.printSchema()