## Contoso SIS
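This notebook processes student section-mark and attendance data for the fictional Contoso ISD: it reads the raw CSV files from stage1, stores them as parquet in stage2, writes a copy with hashed `id` and `student_id` columns to stage3, and registers Spark SQL tables over the stage2 and stage3 output.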






In [1]:
# Name of the storage account that holds the stage1/stage2/stage3 data; it is
# spliced into the abfss://...dfs.core.windows.net URLs built in the next cell.
# NOTE(review): the value looks like a placeholder ("__update_this" suffix) and
# presumably has to be replaced with a real account name before running.
storage_account = 'steduanalytics__update_this'
# When True, the notebook reads and writes under the 'test-env' container and
# prefixes the Spark database names with 'test_'; when False it uses the
# per-stage containers and unprefixed database names.
use_test_env = True

In [None]:
# Build the root URL of each data lake stage. In the test environment all three
# stages are folders inside the single 'test-env' container; otherwise every
# stage lives in a container of its own, named after the stage.
stage1, stage2, stage3 = [
    (
        'abfss://test-env@' + storage_account + '.dfs.core.windows.net/' + stage_name
        if use_test_env
        else 'abfss://' + stage_name + '@' + storage_account + '.dfs.core.windows.net'
    )
    for stage_name in ('stage1', 'stage2', 'stage3')
]

In [2]:
from pyspark.sql.functions import sha2, lit

# Tables handled by this notebook. Each one is read from <name>.csv under
# stage1/contoso_sis and stored as a parquet folder <name> in stage2 and stage3.
tables = ('studentsectionmark', 'studentattendance')

# Stage1 -> stage2: ingest every raw CSV (header row, inferred schema), cast the
# two identifier columns to string and persist the result as parquet.
for table in tables:
    raw = spark.read.csv(stage1 + '/contoso_sis/' + table + '.csv', header='true', inferSchema='true')
    typed = raw.withColumn('id', raw['id'].cast('string'))
    typed = typed.withColumn('student_id', raw['student_id'].cast('string'))
    typed.write.mode('overwrite').parquet(stage2 + '/contoso_sis/' + table)

# Stage2 -> stage3: anonymize by replacing both identifier columns with their
# SHA-256 digests, then persist the result as parquet.
for table in tables:
    df = spark.read.parquet(stage2 + '/contoso_sis/' + table)
    hashed_id = sha2(df['id'], 256)
    hashed_student_id = sha2(df['student_id'], 256)
    df = df.withColumn('id', hashed_id).withColumn('student_id', hashed_student_id)
    df.write.mode('overwrite').parquet(stage3 + '/contoso_sis/' + table)

# Expose the parquet output of one stage to Spark SQL.
def create_spark_db(db_name, source_path):
    """Register a Spark database with tables over the contoso_sis parquet folders.

    Creates the database ``db_name`` if it does not exist and, inside it, the
    tables ``studentsectionmark`` and ``studentattendance`` (each only if
    missing). Every table is declared ``using PARQUET`` with its location set
    to the folder of the same name under ``source_path``.

    Args:
        db_name: Name of the database to create or reuse.
        source_path: Folder that contains one parquet sub-folder per table.
    """
    spark.sql('CREATE DATABASE IF NOT EXISTS ' + db_name)
    for table in ('studentsectionmark', 'studentattendance'):
        statement = (
            "create table if not exists " + db_name + "." + table
            + " using PARQUET location '" + source_path + "/" + table + "'"
        )
        spark.sql(statement)

# Databases created for the test environment get a 'test_' prefix so they do
# not share a name with the regular ones.
if use_test_env:
    db_prefix = 'test_'
else:
    db_prefix = ''

# One database per published stage: s2 over the stage2 output, s3 over the
# stage3 (hashed identifier) output.
create_spark_db(db_name=db_prefix + 's2_contoso_sis', source_path=stage2 + '/contoso_sis')
create_spark_db(db_name=db_prefix + 's3_contoso_sis', source_path=stage3 + '/contoso_sis')