# Graph API Module Example Notebook

This notebook creates 3 tables (users, m365_app_user_detail and teams_activity_user_details) in two new Spark databases: s2np_graphapi (non-pseudonymized) and s2p_graphapi (pseudonymized).


### Provision storage accounts

The storage account variable has to be changed to the name of the storage account associated with your Azure resource group.

In [24]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, ArrayType
from pyspark.sql.functions import *
from pyspark.sql.window import Window


# Data lake settings: point storage_account at the storage account of your Azure resource group.
storage_account = 'stoeahybriddev2'
# Toggle: True routes every stage into sub-folders of the shared 'test-env' container.
use_test_env = False

if not use_test_env:
    # Regular layout: one dedicated container per stage.
    stage1np = f'abfss://stage1np@{storage_account}.dfs.core.windows.net'
    stage2np = f'abfss://stage2np@{storage_account}.dfs.core.windows.net'
    stage2p = f'abfss://stage2p@{storage_account}.dfs.core.windows.net'
else:
    # Test layout: a single 'test-env' container with one folder per stage.
    stage1np = f'abfss://test-env@{storage_account}.dfs.core.windows.net/stage1np'
    stage2np = f'abfss://test-env@{storage_account}.dfs.core.windows.net/stage2np'
    stage2p = f'abfss://test-env@{storage_account}.dfs.core.windows.net/stage2p'

StatementMeta(spark3p1sm, 59, 24, Finished, Available)

### Load Raw Data from Lake
To ensure that the right tables are loaded, confirm that the file paths match your data lake storage containers.

The first code block below defines the schema that describes how each of the stage 1 JSON files is structured.

In [25]:
# Explicit schemas for the stage 1 Graph API JSON files.
# Each file keeps its records inside a top-level 'value' array; every record field is declared with nullable=False.

# Users: four string attributes per record.
user_schema = StructType([
    StructField('value', ArrayType(StructType([
        StructField(field_name, StringType(), False)
        for field_name in ('surname', 'givenName', 'userPrincipalName', 'id')
    ])))
])

# M365 app user detail: three string attributes plus a nested 'details' array of per-app string fields.
m365_app_user_details_schema = StructType([
    StructField('value', ArrayType(StructType(
        [
            StructField(field_name, StringType(), False)
            for field_name in ('reportRefreshDate', 'userPrincipalName', 'lastActivityDate')
        ] + [
            StructField('details', ArrayType(StructType([
                StructField(field_name, StringType(), False)
                for field_name in (
                    'reportPeriod',
                    'excel', 'excelWeb',
                    'outlook', 'outlookWeb',
                    'powerPoint', 'powerPointWeb',
                    'teams', 'teamsWeb',
                    'word', 'wordWeb',
                )
            ]))),
        ]
    )))
])

# Teams activity user detail: string attributes with integer counters for messages and meetings.
teams_activity_user_details_schema = StructType([
    StructField('value', ArrayType(StructType([
        StructField(field_name, field_type, False)
        for field_name, field_type in (
            ('reportRefreshDate', StringType()),
            ('reportPeriod', StringType()),
            ('userPrincipalName', StringType()),
            ('privateChatMessageCount', IntegerType()),
            ('teamChatMessageCount', IntegerType()),
            ('meetingsAttendedCount', IntegerType()),
            ('meetingCount', IntegerType()),
            ('audioDuration', StringType()),
            ('videoDuration', StringType()),
        )
    ])))
])

StatementMeta(spark3p1sm, 59, 25, Finished, Available)

In [26]:
# Read the raw Graph API JSON files from stage 1, applying the explicit schema declared for each dataset.
dfUsersRaw = (
    spark.read.format('json')
    .schema(user_schema)
    .load(f'{stage1np}/GraphAPI/Users/*.json')
)
dfM365UserActivityRaw = (
    spark.read.format('json')
    .schema(m365_app_user_details_schema)
    .load(f'{stage1np}/GraphAPI/M365_App_User_Detail/*.json')
)
dfTeamsUserActivityRaw = (
    spark.read.format('json')
    .schema(teams_activity_user_details_schema)
    .load(f'{stage1np}/GraphAPI/Teams_Activity_User_Detail/*.json')
)

StatementMeta(spark3p1sm, 59, 26, Finished, Available)

## 1. Users table
Contains all users (students and teachers) at a school-system level

**Databases and tables used:**

 - None 
 
**JSON files used:**

- users.json

**Database and table created:**

1. Spark DB: s2np_graphapi
- Table: users

In [27]:
# Unpack the top-level 'value' array: one row per user record, with the record's fields as columns.
dfUsers = (
    dfUsersRaw
    .select(explode('value').alias('exploded_values'))
    .select("exploded_values.*")
)
# Preview of the first rows. NOTE(review): rows may hold real names/UPNs - clear outputs before sharing.
display(dfUsers.limit(10))

StatementMeta(spark3p1sm, 59, 27, Finished, Available)

SynapseWidget(Synapse.DataFrame, b1453f39-1ea2-4e4d-a888-2ef185f78340)

## Write Data Back to Lake

### Writing to Stage 2np

In [28]:
# Persist the flattened users table as parquet under the stage2np location, replacing any earlier output.
(
    dfUsers.write
    .mode('overwrite')
    .format('parquet')
    .save(stage2np + '/GraphAPI/Users')
)

StatementMeta(spark3p1sm, 59, 28, Finished, Available)

### Writing to Stage 2p
Pseudonymizing users data

In [29]:
%run /OEA_py

StatementMeta(, 59, -1, Finished, Available)

In [30]:
# OEA is made available by the preceding `%run /OEA_py` cell.
oea = OEA()

# One entry per column: [column name, data type, operation].
# The operation strings are interpreted by oea.pseudonymize (defined in OEA_py, not shown here).
usersSchema = [
    ['surname', 'string', 'mask'],
    ['givenName', 'string', 'mask'],
    ['userPrincipalName', 'string', 'hash'],
    ['id', 'string', 'no-op'],
]

# pseudonymize returns two frames; only df_pseudo is written out in this cell.
df_pseudo, df_lookup = oea.pseudonymize(dfUsers, usersSchema)

(
    df_pseudo.write
    .mode('overwrite')
    .format('parquet')
    .save(stage2p + '/GraphAPI/Users')
)

StatementMeta(spark3p1sm, 59, 30, Finished, Available)

2021-10-11 14:45:26,358 - OEA - DEBUG - OEA initialized.
OEA initialized.

### Load to Spark DB

In [31]:
# Register lake data as a Spark table so it can be queried via SQL on-demand.
# Only metadata is created here; the table points at the parquet files already stored in the lake.
# This also makes it possible to connect in Power BI via the azure sql data source connector.
def create_spark_db(db_name, source_path, table_name='users'):
    """Create the database if needed and (re)register a parquet-backed table in it.

    Args:
        db_name: Spark database to create when it does not exist yet.
        source_path: Lake location of the parquet files backing the table.
        table_name: Table to register inside `db_name`. Defaults to 'users',
            so existing two-argument calls behave exactly as before.
    """
    qualified_table = f'{db_name}.{table_name}'
    spark.sql(f'CREATE DATABASE IF NOT EXISTS {db_name}')
    # Drop any existing definition so the table is always recreated from source_path.
    spark.sql(f"DROP TABLE IF EXISTS {qualified_table}")
    spark.sql(f"create table if not exists {qualified_table} using PARQUET location '{source_path}'")
    
# Register the users output in both databases: non-pseudonymized (s2np) and pseudonymized (s2p).
create_spark_db(db_name='s2np_graphapi', source_path=stage2np + '/GraphAPI/Users')
create_spark_db(db_name='s2p_graphapi', source_path=stage2p + '/GraphAPI/Users')

StatementMeta(spark3p1sm, 59, 31, Finished, Available)

## Reset Data Processing

In [32]:
def reset_all_processing():
    """Undo the users processing.

    Calls oea.rm_if_exists on the stage2np and stage2p '/GraphAPI/Users'
    locations, then oea.drop_db on 's2np_graphapi' and 's2p_graphapi'.
    """
    for stage_root in (stage2np, stage2p):
        oea.rm_if_exists(stage_root + '/GraphAPI/Users')
    for database in ('s2np_graphapi', 's2p_graphapi'):
        oea.drop_db(database)

# Deliberately left disabled; uncomment to run the reset.
#reset_all_processing()

StatementMeta(spark3p1sm, 59, 32, Finished, Available)

## 2. M365_app_user_detail table
Contains a sample m365 table to support data analysis in a Power BI dashboard.

**Databases and tables used:**
- None

**JSON files used:**
- m365_app_user_detail.json

**Databases and tables created:**

1. Spark DB: s2np_graphapi
- Table: m365_app_user_detail

In [33]:
# Unpack the top-level 'value' array: one row per M365 user-activity record (the nested 'details' array is kept as is).
dfM365UserActivity = (
    dfM365UserActivityRaw
    .select(explode('value').alias('exploded_values'))
    .select("exploded_values.*")
)

StatementMeta(spark3p1sm, 59, 33, Finished, Available)

### Processing m365 activity "details" data 
This code block moves the relevant data from "details" and allocates them into their respective columns.

In [34]:
import pyspark.sql.functions as f

# Flatten the nested "details" array into top-level columns.
# The array is exploded exactly once, so all app-usage fields of one details entry stay together on a
# single output row. Exploding each field on its own (one explode per column) multiplies rows instead:
# a record with N details entries would produce N**11 rows with mismatched field combinations.
dfM365UserActivity = (
    dfM365UserActivity
    .withColumn('detail', f.explode('details'))  # one row per details entry
    .select('*', 'detail.*')                     # promote the entry's fields (reportPeriod, excel, ...) to columns
    .drop('details', 'detail')                   # remove the nested originals
)

display(dfM365UserActivity.limit(10))

StatementMeta(spark3p1sm, 59, 34, Finished, Available)

SynapseWidget(Synapse.DataFrame, 6e6b7da5-705a-48f4-9594-236db094ea4e)

## Write Data Back to Lake

In [35]:
# Persist the flattened M365 activity table as parquet under the stage2np location, replacing any earlier output.
(
    dfM365UserActivity.write
    .mode('overwrite')
    .format('parquet')
    .save(stage2np + '/GraphAPI/M365_App_User_Detail')
)

StatementMeta(spark3p1sm, 59, 35, Finished, Available)

### Writing to Stage 2p
Pseudonymizing M365 data

In [36]:
# One entry per column: [column name, data type, operation].
# The operation strings are interpreted by oea.pseudonymize (defined in OEA_py, not shown here).
# Only userPrincipalName uses 'hash'; every other column is listed as 'no-op'.
m365Schema = [
    ['reportRefreshDate', 'string', 'no-op'],
    ['userPrincipalName', 'string', 'hash'],
    ['lastActivityDate', 'string', 'no-op'],
] + [
    [detail_column, 'string', 'no-op']
    for detail_column in (
        'reportPeriod',
        'excel', 'excelWeb',
        'outlook', 'outlookWeb',
        'powerPoint', 'powerPointWeb',
        'teams', 'teamsWeb',
        'word', 'wordWeb',
    )
]

# pseudonymize returns two frames; only df_pseudo is written out in this cell.
df_pseudo, df_lookup = oea.pseudonymize(dfM365UserActivity, m365Schema)

(
    df_pseudo.write
    .mode('overwrite')
    .format('parquet')
    .save(stage2p + '/GraphAPI/M365_App_User_Detail')
)

StatementMeta(spark3p1sm, 59, 36, Finished, Available)

### Load to Spark DB

In [37]:
# Register lake data as a Spark table so it can be queried via SQL on-demand.
# Only metadata is created here; the table points at the parquet files already stored in the lake.
# This also makes it possible to connect in Power BI via the azure sql data source connector.
def create_spark_db(db_name, source_path, table_name='m365_app_user_detail'):
    """Create the database if needed and (re)register a parquet-backed table in it.

    Args:
        db_name: Spark database to create when it does not exist yet.
        source_path: Lake location of the parquet files backing the table.
        table_name: Table to register inside `db_name`. Defaults to
            'm365_app_user_detail', so existing two-argument calls behave
            exactly as before.
    """
    qualified_table = f'{db_name}.{table_name}'
    spark.sql(f'CREATE DATABASE IF NOT EXISTS {db_name}')
    # Drop any existing definition so the table is always recreated from source_path.
    spark.sql(f"DROP TABLE IF EXISTS {qualified_table}")
    spark.sql(f"create table if not exists {qualified_table} using PARQUET location '{source_path}'")
    
# Register the M365 output in both databases: non-pseudonymized (s2np) and pseudonymized (s2p).
create_spark_db(db_name='s2np_graphapi', source_path=stage2np + '/GraphAPI/M365_App_User_Detail')
create_spark_db(db_name='s2p_graphapi', source_path=stage2p + '/GraphAPI/M365_App_User_Detail')

StatementMeta(spark3p1sm, 59, 37, Finished, Available)

## 3. Teams_activity_user_details table
Contains a sample Teams table to support data analysis in a Power BI dashboard.

**Databases and tables used:**
- None

**JSON files used:**
- teams_activity_user_details.json

**Databases and tables created:**

1. Spark DB: s2np_graphapi
- Table: teams_activity_user_details

In [38]:
# Unpack the top-level 'value' array: one row per Teams user-activity record, with its fields as columns.
dfTeamsUserActivity = (
    dfTeamsUserActivityRaw
    .select(explode('value').alias('exploded_values'))
    .select("exploded_values.*")
)
# Preview of the first rows. NOTE(review): rows may hold real UPNs - clear outputs before sharing.
display(dfTeamsUserActivity.limit(10))

StatementMeta(spark3p1sm, 59, 38, Finished, Available)

SynapseWidget(Synapse.DataFrame, b0bee655-d79b-4f16-8f2c-f92305eedd29)

## Write Data Back to Lake

In [39]:
# Persist the flattened Teams activity table as parquet under the stage2np location, replacing any earlier output.
(
    dfTeamsUserActivity.write
    .mode('overwrite')
    .format('parquet')
    .save(stage2np + '/GraphAPI/Teams_Activity_User_Detail')
)

StatementMeta(spark3p1sm, 59, 39, Finished, Available)

### Writing to Stage 2p
Pseudonymizing Teams data

In [40]:
# One entry per column: [column name, data type, operation].
# The operation strings are interpreted by oea.pseudonymize (defined in OEA_py, not shown here).
# Only userPrincipalName uses 'hash'; every other column is listed as 'no-op'.
teamsSchema = [
    ['reportRefreshDate', 'string', 'no-op'],
    ['reportPeriod', 'string', 'no-op'],
    ['userPrincipalName', 'string', 'hash'],
    ['privateChatMessageCount', 'integer', 'no-op'],
    ['teamChatMessageCount', 'integer', 'no-op'],
    ['meetingsAttendedCount', 'integer', 'no-op'],
    ['meetingCount', 'integer', 'no-op'],
    ['audioDuration', 'string', 'no-op'],
    ['videoDuration', 'string', 'no-op'],
]

# pseudonymize returns two frames; only df_pseudo is written out in this cell.
df_pseudo, df_lookup = oea.pseudonymize(dfTeamsUserActivity, teamsSchema)

(
    df_pseudo.write
    .mode('overwrite')
    .format('parquet')
    .save(stage2p + '/GraphAPI/Teams_Activity_User_Detail')
)

StatementMeta(spark3p1sm, 59, 40, Finished, Available)

### Load to Spark DB

In [41]:
# Register lake data as a Spark table so it can be queried via SQL on-demand.
# Only metadata is created here; the table points at the parquet files already stored in the lake.
# This also makes it possible to connect in Power BI via the azure sql data source connector.
def create_spark_db(db_name, source_path, table_name='teams_activity_user_details'):
    """Create the database if needed and (re)register a parquet-backed table in it.

    Args:
        db_name: Spark database to create when it does not exist yet.
        source_path: Lake location of the parquet files backing the table.
        table_name: Table to register inside `db_name`. Defaults to
            'teams_activity_user_details', so existing two-argument calls
            behave exactly as before.
    """
    qualified_table = f'{db_name}.{table_name}'
    spark.sql(f'CREATE DATABASE IF NOT EXISTS {db_name}')
    # Drop any existing definition so the table is always recreated from source_path.
    spark.sql(f"DROP TABLE IF EXISTS {qualified_table}")
    spark.sql(f"create table if not exists {qualified_table} using PARQUET location '{source_path}'")
    
# Register the Teams activity output in both databases: non-pseudonymized (s2np) and pseudonymized (s2p).
# The M365 tables are registered in their own "Load to Spark DB" cell earlier in the notebook,
# so this cell only handles the Teams tables.
create_spark_db('s2np_graphapi', stage2np + '/GraphAPI/Teams_Activity_User_Detail')
create_spark_db('s2p_graphapi', stage2p + '/GraphAPI/Teams_Activity_User_Detail')

StatementMeta(spark3p1sm, 59, 41, Finished, Available)

## Reset Data Processing

In [42]:
def reset_all_processing():
    """Undo all Graph API processing done by this notebook.

    Calls oea.rm_if_exists on the stage2np and stage2p '/GraphAPI/'
    locations, then oea.drop_db on 's2np_graphapi' and 's2p_graphapi'.
    """
    for stage_root in (stage2np, stage2p):
        oea.rm_if_exists(stage_root + '/GraphAPI/')
    for database in ('s2np_graphapi', 's2p_graphapi'):
        oea.drop_db(database)

# Deliberately left disabled; uncomment to run the reset.
#reset_all_processing()

StatementMeta(spark3p1sm, 59, 42, Finished, Available)