## Contoso ISD solution package
This notebook is for creating a consolidated view over the data from each of the source systems.






In [1]:
# Name of the storage account used to build the abfss:// stage paths below.
# The '__update_this' suffix marks the value as a placeholder to be replaced.
storage_account = 'steduanalytics__update_this'
# When True, stage paths point at the 'test-env' container and the Spark
# databases created at the end of the notebook get a 'test_' name prefix.
use_test_env = True

In [2]:
# Build the root path of each stage (stage1, stage2, stage3).
# Test runs share one 'test-env' container with a folder per stage;
# otherwise every stage lives in its own container.
dfs_host = storage_account + '.dfs.core.windows.net'
stage_names = ('stage1', 'stage2', 'stage3')
if use_test_env:
    stage1, stage2, stage3 = [f'abfss://test-env@{dfs_host}/{name}' for name in stage_names]
else:
    stage1, stage2, stage3 = [f'abfss://{name}@{dfs_host}' for name in stage_names]

In [3]:
# Process sectionmark data
# Convert id values to use the Person.Id and Section.Id values set in the Education Data Platform.
from pyspark.sql.functions import sha2, lit

# Register the stage2 source data as temp views for the Spark SQL join below.
# DataFrame.createOrReplaceTempView is used instead of the legacy
# sqlContext.registerDataFrameAsTable wrapper (SQLContext is deprecated in Spark 3).
# The Person and Section views are reused by the attendance query later in this cell.
for view_name, source_path in (
    ('SectionMark', '/contoso_sis/studentsectionmark'),
    ('Person', '/m365/Person'),
    ('Section', '/m365/Section'),
):
    spark.read.format('parquet').load(stage2 + source_path).createOrReplaceTempView(view_name)

# Inner joins: section marks whose student_id or section_id has no matching
# Person/Section ExternalId are dropped from the result.
section_mark = spark.sql("""
    select sm.id Id, p.Id PersonId, s.Id SectionId,
           cast(sm.numeric_grade_earned as int) NumericGrade,
           sm.alpha_grade_earned AlphaGrade, sm.is_final_grade IsFinalGrade,
           cast(sm.credits_attempted as int) CreditsAttempted,
           cast(sm.credits_earned as int) CreditsEarned,
           sm.grad_credit_type GraduationCreditType, sm.id ExternalId,
           CURRENT_TIMESTAMP CreateDate, CURRENT_TIMESTAMP LastModifiedDate, true IsActive
    from SectionMark sm
    join Person p on sm.student_id = p.ExternalId
    join Section s on sm.section_id = s.ExternalId
""")

# SectionMark2 is a second copy of the same data; create_spark_db registers a table over each.
# NOTE(review): every save re-runs the query, so CURRENT_TIMESTAMP (CreateDate /
# LastModifiedDate) may differ slightly between the outputs - confirm this is acceptable.
section_mark_outputs = ('/ContosoISD/SectionMark', '/ContosoISD/SectionMark2')
for output_path in section_mark_outputs:
    section_mark.write.format('parquet').mode('overwrite').save(stage2 + output_path)

# Add SectionMark data to stage3 (anonymized parquet lake)
# Only PersonId is replaced by its SHA-256 hash; all other columns are written as-is.
section_mark_anon = section_mark.withColumn('PersonId', sha2(section_mark.PersonId, 256))
for output_path in section_mark_outputs:
    section_mark_anon.write.format('parquet').mode('overwrite').save(stage3 + output_path)

# Repeat the above process, this time for student attendance
# Convert id values to use the Person.Id, Org.Id and Section.Id values
# Relies on the Person and Section temp views (and the sha2 import) set up earlier in this cell.
# DataFrame.createOrReplaceTempView is used instead of the legacy
# sqlContext.registerDataFrameAsTable wrapper (SQLContext is deprecated in Spark 3).
for view_name, source_path in (
    ('Attendance', '/contoso_sis/studentattendance'),
    ('Org', '/m365/Org'),
):
    spark.read.format('parquet').load(stage2 + source_path).createOrReplaceTempView(view_name)

# Inner joins: attendance rows without a matching Org, Person or Section ExternalId are dropped.
# NOTE(review): Period, AttendanceCode and PresenceFlag are referenced in PascalCase while the
# other source columns are snake_case - confirm against the studentattendance schema.
attendance = spark.sql("""
    select att.id Id, p.Id PersonId, att.school_year SchoolYear, o.Id OrgId,
           to_date(att.attendance_date,'MM/dd/yyyy') AttendanceDate,
           att.all_day AllDay, att.Period Period, s.Id SectionId,
           att.AttendanceCode AttendanceCode, att.PresenceFlag PresenceFlag,
           att.attendance_status AttendanceStatus, att.attendance_type AttendanceType,
           att.attendance_sequence AttendanceSequence
    from Attendance att
    join Org o on att.school_id = o.ExternalId
    join Person p on att.student_id = p.ExternalId
    join Section s on att.section_id = s.ExternalId
""")

attendance.write.format('parquet').mode('overwrite').save(stage2 + '/ContosoISD/Attendance')

# Add Attendance data to stage3 (anonymized parquet lake)
# Only PersonId is replaced by its SHA-256 hash; all other columns are written as-is.
attendance_anon = attendance.withColumn('PersonId', sha2(attendance.PersonId, 256))
attendance_anon.write.format('parquet').mode('overwrite').save(stage3 + '/ContosoISD/Attendance')

# Add 'Department' column to Course (hardcoded to "Math" for this Contoso example)
# DataFrame.createOrReplaceTempView is used instead of the legacy
# sqlContext.registerDataFrameAsTable wrapper (SQLContext is deprecated in Spark 3).
spark.read.format('parquet').load(stage2 + '/m365/Course').createOrReplaceTempView('Course')
df = spark.sql("select Id, Name, Code, Description, ExternalId, CreateDate, LastModifiedDate, IsActive, CalendarId, 'Math' Department from Course")

# The selected columns include no PersonId, so the same frame is written
# unchanged to both the stage2 and the stage3 lake.
for stage_root in (stage2, stage3):
    df.write.format('parquet').mode('overwrite').save(stage_root + '/ContosoISD/Course')

# Create spark db to allow for access to the data in the lake via SQL on-demand.
# This is only creating metadata for SQL on-demand, pointing to the parquet data in the lake.
# This also makes it possible to connect in Power BI via the azure sql data source connector.
def create_spark_db(db_name, source_path):
    """Create a Spark database with one parquet-backed table per data set.

    Only metadata is created: each table is declared with
    ``create table if not exists ... using PARQUET location ...`` so it points
    at files that already exist under ``source_path``. Existing databases and
    tables are left untouched.

    Args:
        db_name: Name of the Spark database to create (if it does not exist).
        source_path: Root path of the stage; tables are located at
            ``<source_path>/<folder>/<table name>``.
    """
    spark.sql('CREATE DATABASE IF NOT EXISTS ' + db_name)

    # Folder under source_path -> tables stored in it (table name == sub-folder name).
    tables_by_folder = (
        ('m365', (
            'Activity',
            'Calendar',
            'Org',
            'Person',
            'PersonIdentifier',
            'RefDefinition',
            'Section',
            'Session',
            'StaffOrgAffiliation',
            'StaffSectionMembership',
            'StudentOrgAffiliation',
            'StudentSectionMembership',
        )),
        ('ContosoISD', (
            'Course',
            'Attendance',
            'SectionMark',
            'SectionMark2',
        )),
    )

    for folder, table_names in tables_by_folder:
        for table_name in table_names:
            spark.sql(
                "create table if not exists " + db_name + "." + table_name
                + " using PARQUET location '" + source_path + "/" + folder + "/" + table_name + "'"
            )

# Databases for the test environment are kept apart by a 'test_' name prefix.
if use_test_env:
    db_prefix = 'test_'
else:
    db_prefix = ''

# One database per lake: s2 over the stage2 data, s3 over the stage3 data.
for db_suffix, stage_root in (('s2_ContosoISD', stage2), ('s3_ContosoISD', stage3)):
    create_spark_db(db_prefix + db_suffix, stage_root)