# Student demographics table

### Notebook purpose:
- Create student table containing all student demographic information.

### Tables used: 
- Spark DB: s3_m365 (stage 3 m365 feed)
    - Table: person (student PersonId and ExternalId relationship)
- Spark DB: PLISQL (stage 3 PLI student and teacher data)
    - Table: tblPLIMSProfileStu (student SIS data and demographics)
- Spark DB: datasense (stage 3 SIS data)
    - Table: school (school ID to school name mapping)
    - Table: studentattendance (student attendance by date, school, and course section)

### Tables created:
- Spark DB: ds3_main (stage 3 data science main)
    - Table: Student (combined student demographic data)

In [1]:
from pyspark.sql.functions import *
from pyspark.sql.window import Window

# Data lake configuration: the storage account name and the ABFSS root URL of
# each stage container (stage1, stage2, stage3) inside that account.
storage_account = 'saeduanalytics'
stage1, stage2, stage3 = (
    f'abfss://stage1@{storage_account}.dfs.core.windows.net',
    f'abfss://stage2@{storage_account}.dfs.core.windows.net',
    f'abfss://stage3@{storage_account}.dfs.core.windows.net',
)

StatementMeta(spark3, 112, 1, Finished, Available)



## Raw Data from Lake


In [3]:
# Read the four source tables from parquet folders under the stage-3 container.
# m365 person records:
dfPersonRaw = spark.read.load(f'{stage3}/m365/Person', format='parquet')
# PLI student profile records:
dfStudentRaw = spark.read.load(f'{stage3}/PLISQL/Year4/dbo.tblPLIMSProfileStu', format='parquet')
# datasense snapshot (folder name carries the snapshot date): attendance and schools
dfStudAttendanceRaw = spark.read.load(f'{stage3}/datasense_2021_06_11/studentattendance', format='parquet')
dfSchoolRaw = spark.read.load(f'{stage3}/datasense_2021_06_11/school', format='parquet')

StatementMeta(spark3, 111, 3, Finished, Available)



## Clean and Subset Data

In [4]:
# Keep only rows flagged IsActive == 'True', reduce them to the two id columns,
# and expose 'Id' under the name 'PersonId'.
dfPerson = (
    dfPersonRaw
    .where(dfPersonRaw['IsActive'] == 'True')
    .select('Id', 'ExternalId')
    .withColumnRenamed('Id', 'PersonId')
)

StatementMeta(spark3, 111, 4, Finished, Available)



In [5]:
# Use the shared key name: StudentID -> ExternalId, so this table can be joined
# with the other sources on 'ExternalId'.
dfStudent = dfStudentRaw.withColumnRenamed('StudentID', 'ExternalId')
dfStudent.show(n=2, vertical=True)

StatementMeta(spark3, 111, 5, Finished, Available)

-RECORD 0------------------------------------
 ExternalId           | 330e8381d384fa727... 
 SchoolYear           | 124                  
 GraduationYear       | 2017                 
 Grade                | 12                   
 GradeName            | 12th                 
 GradeDescription     | Twelfth Grade        
 ELL_Status           | English Learner      
 Homeless             | null                 
 FosterYouth          | null                 
 SpecialEducation     | Special Day Class    
 SpecialEducationType | SDC                  
 GATELabel            | null                 
 Migrant              | null                 
 ParentEducation      | Declined to state    
 LowIncome            | No                   
-RECORD 1------------------------------------
 ExternalId           | e6a0ccce3f5d9ee2c... 
 SchoolYear           | 124                  
 GraduationYear       | 2020                 
 Grade                | 12                   
 GradeName            | 12th      

In [6]:
# Keep only the attendance columns needed downstream, then rename the keys so
# they match the other tables (student_id -> ExternalId, school_id -> School_ID).
attendance_columns = [
    'student_id',
    'school_id',
    'attendance_date',
    'Period',
    'section_id',
    'PresenceFlag',
    'attendance_status',
]
dfStudAttendance = (
    dfStudAttendanceRaw
    .select(*attendance_columns)
    .withColumnRenamed('school_id', 'School_ID')
    .withColumnRenamed('student_id', 'ExternalId')
)
dfStudAttendance.show(n=1, vertical=True)

StatementMeta(spark3, 111, 6, Finished, Available)

-RECORD 0---------------------------------
 ExternalId        | abdf2ab11f6ed9f20... 
 School_ID         | 95                   
 attendance_date   | 2021-03-16 00:00:00  
 Period            | 05                   
 section_id        | 1023760              
 PresenceFlag      | 1                    
 attendance_status | Present              
only showing top 1 row

In [7]:
# School lookup table: keep id and name, renamed to School_ID / School_Name so
# the id matches the attendance table's join key.
dfSchool = (
    dfSchoolRaw
    .select('id', 'name')
    .withColumnRenamed('id', 'School_ID')
    .withColumnRenamed('name', 'School_Name')
)

StatementMeta(spark3, 111, 7, Finished, Available)



In [8]:
# Attach School_Name to every attendance row via an inner join on School_ID.
dfStudAttendance = dfStudAttendance.join(dfSchool, on='School_ID', how='inner')
dfStudAttendance.show(n=1, truncate=False, vertical=True)

StatementMeta(spark3, 111, 8, Finished, Available)

-RECORD 0-----------------------------------------------------------------------------
 School_ID         | 95                                                               
 ExternalId        | abdf2ab11f6ed9f202df67f62cc312f7ab760e0ae3ef6293dfa70904a07894d7 
 attendance_date   | 2021-03-16 00:00:00                                              
 Period            | 05                                                               
 section_id        | 1023760                                                          
 PresenceFlag      | 1                                                                
 attendance_status | Present                                                          
 School_Name       | Hoover High                                                      
only showing top 1 row

## Find Primary School

This creates a one-to-one relationship between student and school, which enables Power BI dashboard functionality.

In [9]:
# Find each student's primary school: the school where the student has the
# highest summed PresenceFlag across all attendance rows.
df = (dfStudAttendance
    .groupBy('ExternalId', 'School_ID', 'School_Name')
    .agg(sum('PresenceFlag').alias('Present_Count')))

# Rank a student's schools by Present_Count (highest first). School_ID is the
# tie-breaker so that exactly ONE school is kept per student even when two
# schools share the same count; filtering on "count == max" would keep every
# tied school and produce duplicate students downstream.
w = Window.partitionBy('ExternalId').orderBy(col('Present_Count').desc(), col('School_ID').asc())
dfStudSchoolPrimary = (
    df
    # Groups whose count is null never matched the previous equality filter;
    # keep excluding them so only the tie handling changes.
    .where(col('Present_Count').isNotNull())
    .withColumn('presenceRank', row_number().over(w))
    .where(col('presenceRank') == 1)
    .drop('presenceRank')
    .drop('Present_Count')
)

# show() prints the rows itself and returns None, so it is not wrapped in print().
dfStudSchoolPrimary.show(10, vertical=True)

StatementMeta(spark3, 111, 9, Finished, Available)

-RECORD 0---------------------------
 ExternalId  | 002ec057c07a9b1b8... 
 School_ID   | 78                   
 School_Name | Bullard Talent K-8   
-RECORD 1---------------------------
 ExternalId  | 020ce7c083af2686d... 
 School_ID   | 41                   
 School_Name | Thomas Elementary    
-RECORD 2---------------------------
 ExternalId  | 028dc842ba82365bd... 
 School_ID   | 17                   
 School_Name | Ewing Elementary     
-RECORD 3---------------------------
 ExternalId  | 04c3641e7e3510679... 
 School_ID   | 64                   
 School_Name | Storey Elementary    
-RECORD 4---------------------------
 ExternalId  | 04eabd1cca8e90a70... 
 School_ID   | 16                   
 School_Name | Ericson Elementary   
-RECORD 5---------------------------
 ExternalId  | 066eb80bd1af4a559... 
 School_ID   | 31                   
 School_Name | Manchester Gate E... 
-RECORD 6---------------------------
 ExternalId  | 06de2e7133146af12... 
 School_ID   | 74                   
 

In [10]:
# Suffix the school columns with '_Primary' to mark them as the student's
# primary school.
dfStudSchoolPrimary = (
    dfStudSchoolPrimary
    .withColumnRenamed('School_Name', 'School_Name_Primary')
    .withColumnRenamed('School_ID', 'School_ID_Primary')
)

StatementMeta(spark3, 111, 10, Finished, Available)



## Combine tables

In [11]:
# Build the final student table: person ids + student profile + primary school,
# all joined on ExternalId (default inner joins, so a student must appear in
# all three inputs to be kept).
dfStudentFinal = (
    dfPerson
    .join(dfStudent, on='ExternalId')
    .join(dfStudSchoolPrimary, on='ExternalId')
)
dfStudentFinal.show(n=1, truncate=False, vertical=True)

StatementMeta(spark3, 111, 11, Finished, Available)

-RECORD 0--------------------------------------------------------------------------------
 ExternalId           | 002ec057c07a9b1b8823570a80d974ec21b34f8e0dd59424ee28ed6224f25088 
 PersonId             | 01e0084cdd9d55f45986b616a6af4a8886450855bb2d0e0ce77a4e48f0a8a603 
 SchoolYear           | 124                                                              
 GraduationYear       | null                                                             
 Grade                | 01                                                               
 GradeName            | 1st                                                              
 GradeDescription     | First Grade                                                      
 ELL_Status           | null                                                             
 Homeless             | null                                                             
 FosterYouth          | null                                                             
 SpecialEd

## Write Data Back to Lake

In [12]:
# Persist the combined student table as parquet in stage 3, replacing any
# previous output in that folder.
# NOTE(review): the Spark table registered further down should point at this
# same folder — confirm the two locations agree.
student_output_path = stage3 + '/ds3_main/Studentv2'
dfStudentFinal.write.format('parquet').save(student_output_path, mode='overwrite')

StatementMeta(spark3, 111, 12, Finished, Available)



## Load to Spark DB

In [13]:
# Create a Spark database/table so the data in the lake can be queried via SQL on-demand.
# This only creates metadata pointing at the Parquet files already in the data lake.
# This also makes it possible to connect in Power BI via the azure sql data source connector.
def create_spark_db(db_name, source_path, table_name='Student', folder=None):
    """Register a Parquet folder in the data lake as a Spark table.

    Args:
        db_name: Spark database to create (if missing) and register the table in.
        source_path: Lake path of the directory that contains the table folder.
        table_name: Name of the table inside `db_name`. Defaults to 'Student'.
        folder: Folder under `source_path` holding the Parquet files.
            Defaults to `table_name`.

    Any existing table of the same name is dropped first so the table always
    points at the requested location.
    """
    if folder is None:
        folder = table_name
    spark.sql(f'CREATE DATABASE IF NOT EXISTS {db_name}')
    spark.sql(f"DROP TABLE IF EXISTS {db_name}.{table_name}")
    spark.sql(f"create table if not exists {db_name}.{table_name} using PARQUET location '{source_path}/{folder}'")

# The student data is written to '<stage3>/ds3_main/Studentv2' earlier in this
# notebook, so the Student table has to point at that folder; the default
# '<source_path>/Student' is not the location this notebook writes to.
create_spark_db('ds3_main', stage3 + '/ds3_main', folder='Studentv2')

StatementMeta(spark3, 111, 13, Finished, Available)

