## All of the module notebooks combined into this single notebook.






In [2]:
# Name of the storage account used to build the abfss:// paths of all three stages.
# The value below is a placeholder (note the '__update_this' suffix) - TODO set it before running.
storage_account = 'steduanalytics__update_this'
# When True, the stage paths point into the 'test-env' container and the Spark databases
# created further down get a 'test_' prefix.
use_test_env = True

In [3]:
# Build the root path of each data-lake stage for the selected environment.
stage_names = ('stage1', 'stage2', 'stage3')
if use_test_env:
    # Test runs: the three stages are folders inside a single 'test-env' container
    stage1, stage2, stage3 = [
        f'abfss://test-env@{storage_account}.dfs.core.windows.net/{stage}' for stage in stage_names
    ]
else:
    # Regular runs: every stage has a container of its own
    stage1, stage2, stage3 = [
        f'abfss://{stage}@{storage_account}.dfs.core.windows.net' for stage in stage_names
    ]



In [4]:
# Extracted from Clever_setup_and_update

# Process resource usage: read the Clever CSV data from stage1 and store it in stage2 as parquet
clever_usage = '/clever/resource_usage_students'
clever_csv = spark.read.csv(stage1 + '/clever', header='true', inferSchema='true')
# sis_id is cast to string before it is hashed for stage3 below
clever_typed = clever_csv.withColumn('sis_id', clever_csv['sis_id'].cast('string'))
clever_typed.write.format('parquet').mode('overwrite').save(stage2 + clever_usage)

# Anonymize data and load into stage3: hash sis_id, redact the Clever user and school ids
from pyspark.sql.functions import sha2, lit
clever_stage2 = spark.read.format('parquet').load(stage2 + clever_usage)
clever_anon = (
    clever_stage2
    .withColumn('sis_id', sha2(clever_stage2['sis_id'], 256))
    .withColumn('clever_user_id', lit('*'))
    .withColumn('clever_school_id', lit('*'))
)
clever_anon.write.format('parquet').mode('overwrite').save(stage3 + clever_usage)

# Create sql on-demand db for Clever data
def create_spark_db(db_name, source_path):
    """Expose the Clever parquet data through a Spark database.

    Creates the database `db_name` if it does not exist yet and, inside it, the table
    `resource_usage_students` located at `source_path`/resource_usage_students.
    """
    table = 'resource_usage_students'
    spark.sql(f'CREATE DATABASE IF NOT EXISTS {db_name}')
    spark.sql(f"create table if not exists {db_name}.{table} using PARQUET location '{source_path}/{table}'")

# Databases created for the test environment are prefixed with 'test_'
db_prefix = 'test_' if use_test_env else ''
for stage_tag, stage_root in (('s2', stage2), ('s3', stage3)):
    create_spark_db(f'{db_prefix}{stage_tag}_clever', stage_root + '/clever')

In [5]:
# Extracted from contoso_sis_setup_and_update

# Process studentsectionmark and studentattendance
# sha2 is imported here so that this cell does not depend on an import made in an earlier cell.
from pyspark.sql.functions import sha2

sis_tables = ('studentsectionmark', 'studentattendance')

# stage1 -> stage2: id and student_id are cast to string (they are hashed in the next step)
for sis_table in sis_tables:
    sis_csv = spark.read.csv(stage1 + '/contoso_sis/' + sis_table + '.csv', header='true', inferSchema='true')
    sis_typed = sis_csv.withColumn('id', sis_csv.id.cast('string')).withColumn('student_id', sis_csv.student_id.cast('string'))
    sis_typed.write.format('parquet').mode('overwrite').save(stage2 + '/contoso_sis/' + sis_table)

# Anonymize data and load into stage3: id and student_id are replaced by their SHA-256 hash
for sis_table in sis_tables:
    sis_stage2 = spark.read.format('parquet').load(stage2 + '/contoso_sis/' + sis_table)
    sis_anon = sis_stage2.withColumn('id', sha2(sis_stage2.id, 256)).withColumn('student_id', sha2(sis_stage2.student_id, 256))
    sis_anon.write.format('parquet').mode('overwrite').save(stage3 + '/contoso_sis/' + sis_table)

# Create spark db to allow for access to the data in the delta-lake via SQL on-demand.
def create_spark_db(db_name, source_path):
    """Expose the contoso_sis parquet data through a Spark database.

    Creates the database `db_name` if needed, then one table per data set
    (studentsectionmark, studentattendance), each located at `source_path`/<table name>.
    """
    spark.sql(f'CREATE DATABASE IF NOT EXISTS {db_name}')
    for table in ('studentsectionmark', 'studentattendance'):
        spark.sql(f"create table if not exists {db_name}.{table} using PARQUET location '{source_path}/{table}'")

# Databases created for the test environment are prefixed with 'test_'
db_prefix = 'test_' if use_test_env else ''
for stage_tag, stage_root in (('s2', stage2), ('s3', stage3)):
    create_spark_db(f'{db_prefix}{stage_tag}_contoso_sis', stage_root + '/contoso_sis')

In [6]:
# Extracted from iReady_setup_and_update

# Process the iReady CSV exports: clean up the column names and write each file to stage2 as parquet
def remove_spaces(str):
    """Return the given column name with spaces removed, '(' and ')' turned into '_' and '=' turned into '__'."""
    cleaned = str
    for old, new in ((' ', ''), ('(', '_'), (')', '_'), ('=', '__')):
        cleaned = cleaned.replace(old, new)
    return cleaned

def process(filename):
  """Convert one iReady CSV export from stage1 into parquet in stage2.

  Reads stage1/iready/<filename>.csv (header row, inferred schema), renames every column
  with remove_spaces, casts StudentID to string and overwrites stage2/iready/<filename>.
  """
  source_csv = stage1 + '/iready/' + filename + '.csv'
  target_dir = stage2 + '/iready/' + filename
  raw = spark.read.csv(source_csv, header='true', inferSchema='true')
  renamed = raw.toDF(*[remove_spaces(column) for column in raw.columns])
  # StudentID needs to be a string to allow for hashing when moving into stage3
  typed = renamed.withColumn('StudentID', renamed.StudentID.cast('string'))
  typed.write.format('parquet').mode('overwrite').save(target_dir)

# Convert every iReady export; each name is both the stage1 CSV file name and the stage2 folder name
for iready_file in (
    'comprehensive_student_lesson_activity_with_standards_ela',
    'comprehensive_student_lesson_activity_with_standards_math',
    'diagnostic_and_instruction_ela_ytd_window',
    'diagnostic_and_instruction_math_ytd_window',
    'diagnostic_results_ela',
    'diagnostic_results_math',
    'personalized_instruction_by_lesson_ela',
    'personalized_instruction_by_lesson_math',
):
    process(iready_file)

# Anonymize data and load into stage3
from pyspark.sql.functions import sha2, lit

# (table, columns to redact). StudentID is hashed for every table; the listed columns are replaced by '*'.
iready_redactions = [
    ('comprehensive_student_lesson_activity_with_standards_ela', ('LastName', 'FirstName')),
    ('comprehensive_student_lesson_activity_with_standards_math', ('LastName', 'FirstName')),
    ('diagnostic_and_instruction_ela_ytd_window', ('LastName', 'FirstName', 'UserName')),
    ('diagnostic_and_instruction_math_ytd_window', ('LastName', 'FirstName', 'UserName')),
    ('diagnostic_results_ela', ('LastName', 'FirstName')),
    ('diagnostic_results_math', ('LastName', 'FirstName')),
    ('personalized_instruction_by_lesson_ela', ('LastName', 'FirstName')),
    ('personalized_instruction_by_lesson_math', ('LastName', 'FirstName')),
]

for iready_table, redacted_columns in iready_redactions:
    iready_stage2 = spark.read.format('parquet').load(stage2 + '/iready/' + iready_table)
    iready_anon = iready_stage2.withColumn('StudentID', sha2(iready_stage2.StudentID, 256))
    for redacted_column in redacted_columns:
        iready_anon = iready_anon.withColumn(redacted_column, lit('*'))
    iready_anon.write.format('parquet').mode('overwrite').save(stage3 + '/iready/' + iready_table)

In [7]:
# Extracted from M365_setup_and_update

stage1_m365 = stage1 + '/m365/DIPData'
stage1_m365_activity = stage1 + '/m365/DIPData/Activity/ApplicationUsage'

# Process Roster data from stage 1 to stage 2
#
# Sets up the edu_dl (stage 2 data lake) with whatever data is found in the DIP inbound folder.
# This includes:
# - adding column names
# - casting values into a schema

# One entry per roster file: (table name, query that names and casts the CSV columns).
# The files are read without a header row, so the query addresses the columns as _c0, _c1, ...
roster_tables = [
    ('Calendar', "select _c0 Id, _c1 Name, _c2 Description, cast(_c3 as int) SchoolYear, cast(_c4 as boolean) IsCurrent, _c5 ExternalId, to_timestamp(_c6, 'MM/dd/yyyy hh:mm:ss a') CreateDate, to_timestamp(_c7, 'MM/dd/yyyy hh:mm:ss a') LastModifiedDate, cast(_c8 as boolean) IsActive, _c9 OrgId from Calendar"),
    ('Course', "select _c0 Id, _c1 Name, _c2 Code, _c3 Description, _c4 ExternalId, to_timestamp(_c5, 'MM/dd/yyyy hh:mm:ss a') CreateDate, to_timestamp(_c6, 'MM/dd/yyyy hh:mm:ss a') LastModifiedDate, cast(_c7 as boolean) IsActive, _c8 CalendarId from Course"),
    ('Org', "select _c0 Id, _c1 Name, _c2 Identifier, _c3 ExternalId, to_timestamp(_c4, 'MM/dd/yyyy hh:mm:ss a') CreateDate, to_timestamp(_c5, 'MM/dd/yyyy hh:mm:ss a') LastModifiedDate, cast(_c6 as boolean) IsActive, _c7 ParentOrgId, _c8 RefOrgTypeId, _c9 SourceSystemId from Org"),
    ('Person', "select _c0 Id, _c1 FirstName, _c2 MiddleName, _c3 LastName, _c4 GenerationCode, _c5 Prefix, _c6 EnabledUser, _c7 ExternalId, to_timestamp(_c8, 'MM/dd/yyyy hh:mm:ss a') CreateDate, to_timestamp(_c9, 'MM/dd/yyyy hh:mm:ss a') LastModifiedDate, cast(_c10 as boolean) IsActive, _c11 SourceSystemId from Person"),
    ('PersonIdentifier', "select _c0 Id, _c1 Identifier, _c2 Description, _c3 RefIdentifierTypeId, _c4 ExternalId, to_timestamp(_c5, 'MM/dd/yyyy hh:mm:ss a') CreateDate, to_timestamp(_c6, 'MM/dd/yyyy hh:mm:ss a') LastModifiedDate, cast(_c7 as boolean) IsActive, _c8 PersonId, _c9 SourceSystemId from PersonIdentifier"),
    ('RefDefinition', "select _c0 Id, _c1 RefType, _c2 Namespace, _c3 Code, cast(_c4 as int) SortOrder, _c5 Description, cast(_c6 as boolean) IsActive from RefDefinition"),
    ('Section', "select _c0 Id, _c1 Name, _c2 Code, _c3 Location, _c4 ExternalId, to_timestamp(_c5, 'MM/dd/yyyy hh:mm:ss a') CreateDate, to_timestamp(_c6, 'MM/dd/yyyy hh:mm:ss a') LastModifiedDate, cast(_c7 as boolean) IsActive, _c8 CourseId, _c9 RefSectionTypeId, _c10 SessionId, _c11 OrgId from Section"),
    ('Session', "select _c0 Id, _c1 Name, to_timestamp(_c2, 'MM/dd/yyyy hh:mm:ss a') BeginDate, to_timestamp(_c3, 'MM/dd/yyyy hh:mm:ss a') EndDate, _c4 ExternalId, to_timestamp(_c5, 'MM/dd/yyyy hh:mm:ss a') CreateDate, to_timestamp(_c6, 'MM/dd/yyyy hh:mm:ss a') LastModifiedDate, cast(_c7 as boolean) IsActive, _c8 CalendarId, _c9 ParentSessionId, _c10 RefSessionTypeId from Session"),
    ('StaffOrgAffiliation', "select _c0 Id, cast(_c1 as boolean) IsPrimary, to_timestamp(_c2, 'MM/dd/yyyy hh:mm:ss a') EntryDate, to_timestamp(_c3, 'MM/dd/yyyy hh:mm:ss a') ExitDate, _c4 ExternalId, to_timestamp(_c5, 'MM/dd/yyyy hh:mm:ss a') CreateDate, to_timestamp(_c6, 'MM/dd/yyyy hh:mm:ss a') LastModifiedDate, cast(_c7 as boolean) IsActive, _c8 OrgId, _c9 PersonId, _c10 RefStaffOrgRoleId from StaffOrgAffiliation"),
    ('StaffSectionMembership', "select _c0 Id, cast(_c1 as boolean) IsPrimaryStaffForSection, to_timestamp(_c2, 'MM/dd/yyyy hh:mm:ss a') EntryDate, to_timestamp(_c3, 'MM/dd/yyyy hh:mm:ss a') ExitDate, _c4 ExternalId, to_timestamp(_c5, 'MM/dd/yyyy hh:mm:ss a') CreateDate, to_timestamp(_c6, 'MM/dd/yyyy hh:mm:ss a') LastModifiedDate, cast(_c7 as boolean) IsActive, _c8 PersonId, _c9 RefStaffSectionRoleId, _c10 SectionId from StaffSectionMembership"),
    ('StudentOrgAffiliation', "select _c0 Id, cast(_c1 as boolean) IsPrimary, to_timestamp(_c2, 'MM/dd/yyyy hh:mm:ss a') EntryDate, to_timestamp(_c3, 'MM/dd/yyyy hh:mm:ss a') ExitDate, _c4 ExternalId, to_timestamp(_c5, 'MM/dd/yyyy hh:mm:ss a') CreateDate, to_timestamp(_c6, 'MM/dd/yyyy hh:mm:ss a') LastModifiedDate, cast(_c7 as boolean) IsActive, _c8 OrgId, _c9 PersonId, _c10 RefGradeLevelId, _c11 RefStudentOrgRoleId, _c12 RefEnrollmentStatusId from StudentOrgAffiliation"),
    ('StudentSectionMembership', "select _c0 Id, to_timestamp(_c1, 'MM/dd/yyyy hh:mm:ss a') EntryDate, to_timestamp(_c2, 'MM/dd/yyyy hh:mm:ss a') ExitDate, _c3 ExternalId, to_timestamp(_c4, 'MM/dd/yyyy hh:mm:ss a') CreateDate, to_timestamp(_c5, 'MM/dd/yyyy hh:mm:ss a') LastModifiedDate, cast(_c6 as boolean) IsActive, _c7 PersonId, _c8 RefGradeLevelWhenCourseTakenId, _c9 RefStudentSectionRoleId, _c10 SectionId from StudentSectionMembership"),
]

def load_roster_table(table_name, select_sql):
    """Load one roster CSV from stage1 and write it to stage2 as parquet.

    table_name -- base name of the CSV file under <stage1_m365>/Roster; also the name of the
                  temp view that select_sql reads from and of the stage2 folder written to.
    select_sql -- query over that temp view which assigns the column names and types.

    Nothing is written when the CSV file contains no rows.
    """
    raw = spark.read.csv(stage1_m365 + '/Roster/' + table_name + '.csv', header='false')
    # One row is enough to know whether there is data; a full count() is not needed for that
    if not raw.head(1):
        return
    # Registers the temp view directly on the DataFrame instead of going through the deprecated SQLContext
    raw.createOrReplaceTempView(table_name)
    spark.sql(select_sql).write.format('parquet').mode('overwrite').save(stage2 + '/m365/' + table_name)

for roster_table, roster_select in roster_tables:
    load_roster_table(roster_table, roster_select)



# Process Activity data from stage1 into stage2.
#
# If this is the first load, it loads all activity data.
# If this is a subsequent load, it determines the max date currently stored and only loads data from after that date.

def append_to_activity_table(max_date=False):
    """Append M365 activity data from stage1 to the stage2 Activity table.

    max_date -- latest BinDate already stored in stage2. When given, only rows with a later
                BinDate are appended; a falsy value (the default) appends everything.

    Every activity row is matched to a person through PersonIdentifier, once by user name
    (Upn) and once by ActiveDirectoryId (UserId). Rows that match neither are not written.
    """
    raw_activity = spark.read.csv(stage1_m365_activity, header='false')
    raw_activity.createOrReplaceTempView('Activity')
    df_Activity = spark.sql("select to_timestamp(_c0) BinDate, _c1 Upn, _c2 UserId, _c3 Application, cast(_c4 as int) SumNumberOfSignals, _c5 Client, cast(_c6 as int) Duration, _c7 LearningActivity, '' PersonId from Activity")

    if max_date:
        df_Activity = df_Activity.filter(df_Activity.BinDate > max_date)

    # One row is enough to know whether there is anything to load
    if not df_Activity.head(1):
        print('No new activity data to load')
        return

    if max_date:
        print('Adding activity data later than: ' + str(max_date))
    else:
        # Initial load: there is no cut-off date to report
        print('Adding all activity data (initial load)')

    # The assumption here is that there will always be data in these inbound files
    df_Activity.createOrReplaceTempView('Activity')
    spark.read.format('parquet').load(f'{stage2}/m365/PersonIdentifier').createOrReplaceTempView('PersonIdentifier')
    spark.read.format('parquet').load(f'{stage2}/m365/RefDefinition').createOrReplaceTempView('RefDefinition')

    # Activity rows resolved to a person through the 'username' identifier (Upn)
    activity_by_upn = spark.sql(
    "select act.BinDate, act.Upn, act.UserId, act.Application, act.SumNumberOfSignals, act.Client, act.Duration, act.LearningActivity, pi.PersonId \
    from PersonIdentifier pi, RefDefinition rd, Activity act \
    where \
        pi.RefIdentifierTypeId = rd.Id \
        and rd.RefType = 'RefIdentifierType' \
        and rd.Code = 'username' \
        and pi.Identifier = act.Upn \
        and act.Upn <> '' \
    ")

    # Activity rows resolved to a person through the 'ActiveDirectoryId' identifier (UserId)
    activity_by_user_id = spark.sql(
    "select act.BinDate, act.Upn, act.UserId, act.Application, act.SumNumberOfSignals, act.Client, act.Duration, act.LearningActivity, pi.PersonId \
    from PersonIdentifier pi, RefDefinition rd, Activity act\
    where \
        pi.RefIdentifierTypeId = rd.Id\
        and rd.RefType = 'RefIdentifierType'\
        and rd.Code = 'ActiveDirectoryId'\
        and pi.Identifier = act.UserId\
        and act.UserId is not null\
    ")

    # NOTE(review): a row whose Upn and UserId both resolve to a PersonIdentifier is appended
    # by both writes - confirm whether that can happen in the inbound data.
    activity_by_upn.write.format("parquet").mode("append").save(f'{stage2}/m365/Activity')
    activity_by_user_id.write.format("parquet").mode("append").save(f'{stage2}/m365/Activity')

# Determine the latest BinDate already stored in stage2, then append whatever is newer.
# Only the lookup is guarded: a failure inside append_to_activity_table must not fall back to a
# full reload, because that would append rows to stage2 that may already have been written.
from pyspark.sql.utils import AnalysisException

try:
    df = spark.read.format('parquet').load(f'{stage2}/m365/Activity')
    max_date = df.agg({'BinDate': 'max'}).first()[0]
except AnalysisException:
    # Raised by the parquet read when the Activity folder cannot be loaded (e.g. it does not exist yet)
    print("No Activity data has been loaded into stage2 data lake yet.")
    max_date = False
else:
    print(max_date)

append_to_activity_table(max_date)



# Anonymize the data from edu_dl (stage2) and load into anon_edu_dl (stage3)
# - redact columns from Person table
# - apply a hash to every occurrence of PersonId
# - redact UPN and UserId from Activity table
# - bring in PersonIdentifier with Identifier and ExternalId redacted
# - don't bring in Student, Staff and other tables not needed or empty (some tables are not being populated by EDP)

from pyspark.sql.functions import sha2, lit
# Activity: hash PersonId, redact Upn and UserId
activity_stage2 = spark.read.format('parquet').load(f'{stage2}/m365/Activity')
activity_anon = (
    activity_stage2
    .withColumn('PersonId', sha2(activity_stage2.PersonId, 256))
    .withColumn('Upn', lit('*'))
    .withColumn('UserId', lit('*'))
)
activity_anon.write.format('parquet').mode('overwrite').save(f'{stage3}/m365/Activity')
# Calendar, Course, Org: copied to stage3 unchanged
for copied_table in ('Calendar', 'Course', 'Org'):
    spark.read.format('parquet').load(f'{stage2}/m365/{copied_table}').write.format('parquet').mode('overwrite').save(f'{stage3}/m365/{copied_table}')
# Person: hash Id and ExternalId, redact the name columns
person_stage2 = spark.read.format('parquet').load(f'{stage2}/m365/Person')
person_anon = (
    person_stage2
    .withColumn('Id', sha2(person_stage2.Id, 256))
    .withColumn('FirstName', lit('*'))
    .withColumn("MiddleName", lit('*'))
    .withColumn('LastName', lit('*'))
    .withColumn('ExternalId', sha2(person_stage2.ExternalId, 256))
)
person_anon.write.format('parquet').mode('overwrite').save(f'{stage3}/m365/Person')
# PersonIdentifier
# PersonId is hashed from the PersonId column itself (not from the row's own Id), the same way
# Person.Id is hashed, so that the two can still be matched in stage3.
df = spark.read.format('parquet').load(f'{stage2}/m365/PersonIdentifier')
df = df.withColumn('PersonId', sha2(df.PersonId, 256)).withColumn('Identifier', lit('*')).withColumn("ExternalId", lit('*'))
df.write.format('parquet').mode('overwrite').save(f'{stage3}/m365/PersonIdentifier')
# RefDefinition, Section, Session: copied to stage3 unchanged
for copied_table in ('RefDefinition', 'Section', 'Session'):
    spark.read.format('parquet').load(f'{stage2}/m365/{copied_table}').write.format('parquet').mode('overwrite').save(f'{stage3}/m365/{copied_table}')

# StaffOrgAffiliation, StaffSectionMembership, StudentOrgAffiliation, StudentSectionMembership:
# hash PersonId and redact ExternalId
for membership_table in (
    'StaffOrgAffiliation',
    'StaffSectionMembership',
    'StudentOrgAffiliation',
    'StudentSectionMembership',
):
    membership_stage2 = spark.read.format('parquet').load(f'{stage2}/m365/{membership_table}')
    membership_anon = membership_stage2.withColumn('PersonId', sha2(membership_stage2.PersonId, 256)).withColumn('ExternalId', lit('*'))
    membership_anon.write.format('parquet').mode('overwrite').save(f'{stage3}/m365/{membership_table}')


# Create spark db to allow for access to the data in the delta-lake via SQL on-demand.
# This is only creating metadata for SQL on-demand, pointing to the data in the delta-lake.
# This also makes it possible to connect in Power BI via the azure sql data source connector.
def create_spark_db(db_name, source_path):
    """Expose the M365 parquet data through a Spark database.

    Creates the database `db_name` if needed, then one table per M365 data set, each
    located at `source_path`/<table name>. Existing tables are left untouched.
    """
    m365_tables = (
        'Activity',
        'Calendar',
        'Course',
        'Org',
        'Person',
        'PersonIdentifier',
        'RefDefinition',
        'Section',
        'Session',
        'StaffOrgAffiliation',
        'StaffSectionMembership',
        'StudentOrgAffiliation',
        'StudentSectionMembership',
    )
    spark.sql(f'CREATE DATABASE IF NOT EXISTS {db_name}')
    for table in m365_tables:
        spark.sql(f"create table if not exists {db_name}.{table} using PARQUET location '{source_path}/{table}'")

# Databases created for the test environment are prefixed with 'test_'
db_prefix = 'test_' if use_test_env else ''
for stage_tag, stage_root in (('s2', stage2), ('s3', stage3)):
    create_spark_db(f'{db_prefix}{stage_tag}_m365', stage_root + '/m365')

No Activity data has been loaded into stage2 data lake yet.
Adding activity data later than: False

In [8]:
# Extracted from Contoso_ISD_setup
from pyspark.sql.functions import sha2, lit

sis_stage2 = stage2 + '/contoso_sis'
sis_stage3 = stage3 + '/contoso_sis'

# Process studentsectionmark and studentattendance: CSV in stage1 -> parquet in stage2,
# with id and student_id cast to string
marks_csv = spark.read.csv(stage1 + '/contoso_sis/studentsectionmark.csv', header='true', inferSchema='true')
marks_typed = marks_csv.withColumn('id', marks_csv['id'].cast('string')).withColumn('student_id', marks_csv['student_id'].cast('string'))
marks_typed.write.format('parquet').mode('overwrite').save(sis_stage2 + '/studentsectionmark')

attendance_csv = spark.read.csv(stage1 + '/contoso_sis/studentattendance.csv', header='true', inferSchema='true')
attendance_typed = attendance_csv.withColumn('id', attendance_csv['id'].cast('string')).withColumn('student_id', attendance_csv['student_id'].cast('string'))
attendance_typed.write.format('parquet').mode('overwrite').save(sis_stage2 + '/studentattendance')

# Anonymize data and load into stage3: id and student_id are replaced by their SHA-256 hash
marks_stage2 = spark.read.format('parquet').load(sis_stage2 + '/studentsectionmark')
marks_anon = marks_stage2.withColumn('id', sha2(marks_stage2['id'], 256)).withColumn('student_id', sha2(marks_stage2['student_id'], 256))
marks_anon.write.format('parquet').mode('overwrite').save(sis_stage3 + '/studentsectionmark')

attendance_stage2 = spark.read.format('parquet').load(sis_stage2 + '/studentattendance')
attendance_anon = attendance_stage2.withColumn('id', sha2(attendance_stage2['id'], 256)).withColumn('student_id', sha2(attendance_stage2['student_id'], 256))
attendance_anon.write.format('parquet').mode('overwrite').save(sis_stage3 + '/studentattendance')

# Create spark db to allow for access to the data in the delta-lake via SQL on-demand.
def create_spark_db(db_name, source_path):
    """Expose the contoso_sis parquet data through a Spark database.

    Creates the database `db_name` if it is missing, then the tables studentsectionmark
    and studentattendance, located at `source_path`/studentsectionmark and
    `source_path`/studentattendance.
    """
    create_table = "create table if not exists {db}.{table} using PARQUET location '{root}/{table}'"
    spark.sql('CREATE DATABASE IF NOT EXISTS ' + db_name)
    spark.sql(create_table.format(db=db_name, table='studentsectionmark', root=source_path))
    spark.sql(create_table.format(db=db_name, table='studentattendance', root=source_path))

# Databases created for the test environment are prefixed with 'test_'
db_prefix = 'test_' if use_test_env else ''
sis_db_roots = {'s2_contoso_sis': stage2, 's3_contoso_sis': stage3}
for sis_db, sis_root in sis_db_roots.items():
    create_spark_db(db_prefix + sis_db, sis_root + '/contoso_sis')

In [9]:
# Extracted from Contoso_ISD_setup_and_update

# Process sectionmark data
# Convert id values to use the Person.Id and Section.Id values set in the Education Data Platform.
from pyspark.sql.functions import sha2, lit

# Register the stage2 inputs as temp views for the query below
for view_name, parquet_path in (
    ('SectionMark', stage2 + '/contoso_sis/studentsectionmark'),
    ('Person', stage2 + '/m365/Person'),
    ('Section', stage2 + '/m365/Section'),
):
    sqlContext.registerDataFrameAsTable(spark.read.format('parquet').load(parquet_path), view_name)

# Join on the ExternalId columns to swap the SIS ids for the Person.Id / Section.Id values
section_mark = spark.sql(
    "select sm.id Id, p.Id PersonId, s.Id SectionId, cast(sm.numeric_grade_earned as int) NumericGrade, "
    "sm.alpha_grade_earned AlphaGrade, sm.is_final_grade IsFinalGrade, cast(sm.credits_attempted as int) CreditsAttempted, cast(sm.credits_earned as int) CreditsEarned, "
    "sm.grad_credit_type GraduationCreditType, sm.id ExternalId, CURRENT_TIMESTAMP CreateDate, CURRENT_TIMESTAMP LastModifiedDate, true IsActive "
    "from SectionMark sm, Person p, Section s "
    "where sm.student_id = p.ExternalId "
    "and sm.section_id = s.ExternalId"
)

# The same result is stored twice in stage2, as SectionMark and as SectionMark2
for mark_table in ('SectionMark', 'SectionMark2'):
    section_mark.write.format('parquet').mode('overwrite').save(stage2 + '/ContosoISD/' + mark_table)
# Add SectionMark data to stage3 (anonymized parquet lake): PersonId is hashed
section_mark_anon = section_mark.withColumn('PersonId', sha2(section_mark.PersonId, 256))
for mark_table in ('SectionMark', 'SectionMark2'):
    section_mark_anon.write.format('parquet').mode('overwrite').save(stage3 + '/ContosoISD/' + mark_table)

# Repeat the above process, this time for student attendance
# Convert id values to use the Person.Id, Org.Id and Section.Id values
# Register the additional stage2 inputs needed for the attendance query
attendance_stage2 = spark.read.format('parquet').load(stage2 + '/contoso_sis/studentattendance')
org_stage2 = spark.read.format('parquet').load(stage2 + '/m365/Org')
sqlContext.registerDataFrameAsTable(attendance_stage2, 'Attendance')
sqlContext.registerDataFrameAsTable(org_stage2, 'Org')

# Join on the ExternalId columns to swap the SIS ids for the Person.Id / Org.Id / Section.Id values
attendance = spark.sql(
    "select att.id Id, p.Id PersonId, att.school_year SchoolYear, o.Id OrgId, to_date(att.attendance_date,'MM/dd/yyyy') AttendanceDate, "
    "att.all_day AllDay, att.Period Period, s.Id SectionId, att.AttendanceCode AttendanceCode, att.PresenceFlag PresenceFlag, "
    "att.attendance_status AttendanceStatus, att.attendance_type AttendanceType, att.attendance_sequence AttendanceSequence "
    "from Attendance att, Org o, Person p, Section s "
    "where att.student_id = p.ExternalId "
    "and att.school_id = o.ExternalId "
    "and att.section_id = s.ExternalId"
)

attendance.write.format('parquet').mode('overwrite').save(stage2 +'/ContosoISD/Attendance')
# Add Attendance data to stage3 (anonymized parquet lake): PersonId is hashed
attendance_anon = attendance.withColumn('PersonId', sha2(attendance.PersonId, 256))
attendance_anon.write.format('parquet').mode('overwrite').save(stage3 + '/ContosoISD/Attendance')

# Add 'Department' column to Course (hardcoded to "Math" for this Contoso example)
course_stage2 = spark.read.format('parquet').load(stage2 + '/m365/Course')
sqlContext.registerDataFrameAsTable(course_stage2, 'Course')
# All Course columns plus a constant 'Math' Department column
course_with_department = spark.sql("select Id, Name, Code, Description, ExternalId, CreateDate, LastModifiedDate, IsActive, CalendarId, 'Math' Department from Course")
# The same data is written to both stage2 and stage3
for stage_root in (stage2, stage3):
    course_with_department.write.format('parquet').mode('overwrite').save(stage_root + '/ContosoISD/Course')

# Create spark db to allow for access to the data in the delta-lake via SQL on-demand.
# This is only creating metadata for SQL on-demand, pointing to the data in the delta-lake.
# This also makes it possible to connect in Power BI via the azure sql data source connector.
def create_spark_db(db_name, source_path):
    """Expose the combined M365 and ContosoISD parquet data through one Spark database.

    Creates the database `db_name` if needed. The M365 tables are located at
    `source_path`/m365/<table name>; Course, Attendance, SectionMark and SectionMark2 are
    located at `source_path`/ContosoISD/<table name>. Existing tables are left untouched.
    """
    tables_by_folder = (
        ('m365', (
            'Activity',
            'Calendar',
            'Org',
            'Person',
            'PersonIdentifier',
            'RefDefinition',
            'Section',
            'Session',
            'StaffOrgAffiliation',
            'StaffSectionMembership',
            'StudentOrgAffiliation',
            'StudentSectionMembership',
        )),
        ('ContosoISD', (
            'Course',
            'Attendance',
            'SectionMark',
            'SectionMark2',
        )),
    )
    spark.sql(f'CREATE DATABASE IF NOT EXISTS {db_name}')
    for folder, tables in tables_by_folder:
        for table in tables:
            spark.sql(f"create table if not exists {db_name}.{table} using PARQUET location '{source_path}/{folder}/{table}'")

# Databases created for the test environment are prefixed with 'test_'
db_prefix = 'test_' if use_test_env else ''
for stage_tag, stage_root in (('s2', stage2), ('s3', stage3)):
    create_spark_db(f'{db_prefix}{stage_tag}_ContosoISD', stage_root)