## OpenEduAnalytics
This notebook is for creating a consolidated view over the data from each of the source systems.






In [None]:
# Name of the storage account that hosts the lake containers used by this notebook.
storage_account = 'saeduanalytics'

# ABFS root URI for each stage; note that stage1 lives in the 'm365share' container.
stage1, stage2, stage3 = (
    f'abfss://{container}@{storage_account}.dfs.core.windows.net'
    for container in ('m365share', 'stage2', 'stage3')
)

In [None]:
# Build the consolidated SectionMark table from the datasense studentsectionmark
# parquet data in stage2, then publish it to stage2 and stage3.
# The source student_id / section_id values are replaced by the Id values of the
# matching m365 Person and Section rows (matched on ExternalId).
from pyspark.sql.functions import sha2, lit

# Expose the stage2 parquet folders as temp views so they can be joined in SQL.
for view_name, folder in (
    ('SectionMark', 'datasense/studentsectionmark'),
    ('Person', 'm365/Person'),
    ('Section', 'm365/Section'),
):
    spark.read.format('parquet').load(f'{stage2}/{folder}').createOrReplaceTempView(view_name)

# Rows without a matching Person or Section are dropped by the join conditions.
section_mark_query = (
    "select sm.id Id, p.Id PersonId, s.Id SectionId, cast(sm.numeric_grade_earned as int) NumericGrade, "
    "sm.alpha_grade_earned AlphaGrade, sm.is_final_grade IsFinalGrade, cast(sm.credits_attempted as int) CreditsAttempted, cast(sm.credits_earned as int) CreditsEarned, "
    "sm.grad_credit_type GraduationCreditType, sm.id ExternalId, CURRENT_TIMESTAMP CreateDate, CURRENT_TIMESTAMP LastModifiedDate, true IsActive "
    "from SectionMark sm, Person p, Section s "
    "where sm.student_id = p.ExternalId "
    "and sm.section_id = s.ExternalId"
)
df = spark.sql(section_mark_query)

# The same result is written twice in stage2, as SectionMark and SectionMark2.
for table_name in ('SectionMark', 'SectionMark2'):
    df.write.format('parquet').mode('overwrite').save(f'{stage2}/OpenEduAnalytics/{table_name}')

# stage3 copy: PersonId is replaced by its SHA-256 hash before writing.
df = df.withColumn('PersonId', sha2(df['PersonId'], 256))
for table_name in ('SectionMark', 'SectionMark2'):
    df.write.format('parquet').mode('overwrite').save(f'{stage3}/OpenEduAnalytics/{table_name}')

In [None]:
# Build the consolidated Attendance table from the datasense studentattendance
# parquet data in stage2, then publish it to stage2 and stage3.
# The source student_id / school_id / section_id values are replaced by the Id values
# of the matching m365 Person, Org and Section rows (matched on ExternalId).
from pyspark.sql.functions import sha2, lit

# Expose the stage2 parquet folders as temp views so they can be joined in SQL.
for view_name, folder in (
    ('Attendance', 'datasense/studentattendance'),
    ('Org', 'm365/Org'),
    ('Person', 'm365/Person'),
    ('Section', 'm365/Section'),
):
    spark.read.format('parquet').load(f'{stage2}/{folder}').createOrReplaceTempView(view_name)

# attendance_date is parsed with the 'MM/dd/yyyy' pattern; rows without a matching
# Person, Org or Section are dropped by the join conditions.
attendance_query = (
    "select att.id Id, p.Id PersonId, att.school_year SchoolYear, o.Id OrgId, to_date(att.attendance_date,'MM/dd/yyyy') AttendanceDate, "
    "att.all_day AllDay, att.Period Period, s.Id SectionId, att.AttendanceCode AttendanceCode, att.PresenceFlag PresenceFlag, "
    "att.attendance_status AttendanceStatus, att.attendance_type AttendanceType, att.attendance_sequence AttendanceSequence "
    "from Attendance att, Org o, Person p, Section s "
    "where att.student_id = p.ExternalId "
    "and att.school_id = o.ExternalId "
    "and att.section_id = s.ExternalId"
)
df = spark.sql(attendance_query)

attendance_subpath = 'OpenEduAnalytics/Attendance'
df.write.format('parquet').mode('overwrite').save(f'{stage2}/{attendance_subpath}')

# stage3 copy: PersonId is replaced by its SHA-256 hash before writing.
df = df.withColumn('PersonId', sha2(df['PersonId'], 256))
df.write.format('parquet').mode('overwrite').save(f'{stage3}/{attendance_subpath}')

In [None]:
# Extend the m365 Course table with the 'Department' column taken from the datasense
# course data (matched on Course.ExternalId = dsCourse.id).
for view_name, folder in (
    ('Course', 'm365/Course'),
    ('dsCourse', 'datasense/course'),
):
    spark.read.format('parquet').load(f'{stage2}/{folder}').createOrReplaceTempView(view_name)

course_query = (
    "select c1.Id, c1.Name, c1.Code, c1.Description, c1.ExternalId, c1.CreateDate, "
    "c1.LastModifiedDate, c1.IsActive, c1.CalendarId, c2.department Department "
    "from Course c1, dsCourse c2 "
    "where c1.ExternalId = c2.id"
)
df = spark.sql(course_query)

# The identical result is written to both stage2 and stage3.
for lake_root in (stage2, stage3):
    df.write.format('parquet').mode('overwrite').save(f'{lake_root}/OpenEduAnalytics/Course')

In [None]:
def create_spark_db(db_name, source_path):
    """Create a spark database whose tables point at the parquet folders under source_path.

    Only metadata is created (every statement is "if not exists"); the tables reference
    the existing parquet data in place. Per the original notebook notes, this is what makes
    the data reachable via SQL on-demand and from Power BI through the azure sql data
    source connector.

    db_name     -- name of the spark database to create.
    source_path -- root path that contains the 'm365' and 'OpenEduAnalytics' folders.
    """
    # Table names grouped by the folder (under source_path) that holds their data.
    tables_by_folder = (
        ('m365', (
            'Activity',
            'Calendar',
            'Org',
            'Person',
            'PersonIdentifier',
            'RefDefinition',
            'Section',
            'Session',
            'StaffOrgAffiliation',
            'StaffSectionMembership',
            'StudentOrgAffiliation',
            'StudentSectionMembership',
        )),
        ('OpenEduAnalytics', (
            'Course',
            'Attendance',
            'SectionMark',
            'SectionMark2',
        )),
    )

    spark.sql(f'CREATE DATABASE IF NOT EXISTS {db_name}')
    for folder, table_names in tables_by_folder:
        for table in table_names:
            spark.sql(f"create table if not exists {db_name}.{table} using PARQUET location '{source_path}/{folder}/{table}'")

# One database per published stage.
create_spark_db('s2_OpenEduAnalytics', stage2)
create_spark_db('s3_OpenEduAnalytics', stage3)

In [None]:
def drop_db(db_name):
    """Drop every table that SHOW TABLES lists for db_name, then drop the database.

    The final DROP DATABASE has no IF EXISTS clause.
    """
    listed_tables = spark.sql('SHOW TABLES FROM ' + db_name).collect()
    for entry in listed_tables:
        spark.sql(f"DROP TABLE IF EXISTS {db_name}.{entry['tableName']}")
    spark.sql(f"DROP DATABASE {db_name}")

# NOTE(review): 'db_name' looks like a placeholder rather than a real database name;
# confirm the intended database before running this cell.
drop_db('db_name')