# Tests for processing SIF data

In [3]:
%run OEA_py

StatementMeta(, 11, -1, Finished, Available)

2022-11-27 20:27:18,357 - OEA - INFO - Now using workspace: dev
2022-11-27 20:27:18,358 - OEA - INFO - OEA initialized.


In [4]:
# 1) Set the workspace (this determines where in the data lake you'll be writing to and reading from).
# You can work in 'dev', 'prod', or a sandbox with any name you choose.
# For example, Sam the developer can create a 'sam' workspace and expect to find his datasets in the data lake under oea/sandboxes/sam
oea.set_workspace('sam')

StatementMeta(spark3p2sm, 11, 3, Finished, Available)

2022-11-27 20:27:20,540 - OEA - INFO - Now using workspace: sam


In [17]:
# 2) Now land batch data files into stage1 of the data lake.
# In this example we pull test json data files from github and land them in oea/sandboxes/sam/stage1/Transactional/sif/v3.4.9
def _fetch_text(url, timeout=30):
    """Download `url` and return the response body as text.

    Raises requests.HTTPError on a 4xx/5xx response, so that an error page is
    never landed in the data lake as if it were test data. The timeout (in
    seconds) keeps the cell from hanging indefinitely on a stalled connection.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text

data = _fetch_text('https://raw.githubusercontent.com/microsoft/OpenEduAnalytics/main/modules/module_catalog/SIF/test_data/StudentPersonal.json')
oea.land(data, 'sif/v3.4.9/StudentPersonal', 'StudentPersonal.json', oea.SNAPSHOT_BATCH_DATA)
data = _fetch_text('https://raw.githubusercontent.com/microsoft/OpenEduAnalytics/main/modules/module_catalog/SIF/test_data/StudentDailyAttendance.json')
# Kept as the last expression so the notebook still displays the landed path.
oea.land(data, 'sif/v3.4.9/StudentDailyAttendance', 'StudentDailyAttendance.json', oea.SNAPSHOT_BATCH_DATA)

StatementMeta(spark3p2sm, 11, 16, Finished, Available)

'stage1/Transactional/sif/v3.4.9/StudentDailyAttendance/snapshot_batch_data/rundate=2022-11-27 20-52-47/StudentDailyAttendance.json'

In [18]:
# 3) The next step is to ingest the batch data into stage2
# Note that when you run this the first time, you'll see an info message like "Number of new inbound rows processed: <n>" for each entity.
# If you run this a second time, the number of inbound rows processed will be 0 because the ingestion uses spark structured streaming to keep track of what data has already been processed.
# 'multiline' is passed through to the ingestion as a read option for the landed json files.
options = {'multiline':True}
oea.ingest('sif/v3.4.9/StudentPersonal', 'RefId', options)
# Kept as the last expression so the notebook still displays the value returned by the final ingest call.
oea.ingest('sif/v3.4.9/StudentDailyAttendance', 'RefId', options)

StatementMeta(spark3p2sm, 11, 17, Finished, Available)

2022-11-27 20:53:44,302 - OEA - INFO - Ingesting from: stage1/Transactional/sif/v3.4.9/StudentPersonal, batch type of: snapshot, source data format of: json
source_path is: abfss://oea@stoeacisd31118b.dfs.core.windows.net/sandboxes/sam/stage1/Transactional/sif/v3.4.9/StudentPersonal/snapshot_batch_data/rundate=2022-11-27 20-52-47
2022-11-27 20:53:46,449 - py4j.java_gateway - INFO - Callback Connection ready to receive messages
2022-11-27 20:53:46,449 - py4j.java_gateway - INFO - Received command c on object id p4
2022-11-27 20:53:53,821 - OEA - INFO - Number of new inbound rows processed: 4
2022-11-27 20:53:55,938 - OEA - INFO - Ingesting from: stage1/Transactional/sif/v3.4.9/StudentDailyAttendance, batch type of: snapshot, source data format of: json
source_path is: abfss://oea@stoeacisd31118b.dfs.core.windows.net/sandboxes/sam/stage1/Transactional/sif/v3.4.9/StudentDailyAttendance/snapshot_batch_data/rundate=2022-11-27 20-52-47
2022-11-27 20:53:57,850 - py4j.java_gateway - INFO - Rec

4

In [None]:
# 4) The ingested sif data can now be queried through the auto-generated "lake database".
ingested_students_query = "select * from ldb_sam_s2i_sif_v3p4p9.studentpersonal"
df = spark.sql(ingested_students_query)
display(df)

In [None]:
# 5) Once the data is ingested, refine it using metadata (this is the step where the data gets pseudonymized).
# NOTE(review): the metadata file is read from the 'gene/sif_dev' branch, whereas the test data in step 2 is read from 'main'.
# Confirm whether this should instead point at:
#   https://raw.githubusercontent.com/microsoft/OpenEduAnalytics/main/modules/module_catalog/SIF/test_data/metadata.csv
metadata_url = 'https://raw.githubusercontent.com/microsoft/OpenEduAnalytics/gene/sif_dev/modules/module_catalog/SIF/test_data/metadata.csv'
metadata = oea.get_metadata_from_url(metadata_url)
# The metadata is keyed by lower-cased entity name; 'RefId' is passed as the key column, as in the ingest step.
oea.refine('sif/v3.4.9/StudentPersonal', metadata['studentpersonal'], 'RefId')
oea.refine('sif/v3.4.9/StudentDailyAttendance', metadata['studentdailyattendance'], 'RefId')


In [28]:
# 6) The refined tables in the lake db can now be queried.
# Show the contents and schema of the refined table and of its "lookup" companion table.
refined_table_queries = (
    "select * from ldb_sam_s2r_sif_v3p4p9.studentpersonal",
    "select * from ldb_sam_s2r_sif_v3p4p9.studentpersonal_lookup",
)
for table_query in refined_table_queries:
    df = spark.sql(table_query)
    display(df)
    df.printSchema()

# The "lookup" table can be used for joins. People with restricted access won't be able to run this query,
# because they won't have access to the data in the "sensitive" folder in the data lake.
lookup_join_query = "select sl.LocalId, s.Disability, s.ESL from ldb_sam_s2r_sif_v3p4p9.studentpersonal_lookup sl, ldb_sam_s2r_sif_v3p4p9.studentpersonal s where sl.RefId_pseudonym = s.RefId_pseudonym"
df = spark.sql(lookup_join_query)
display(df)

StatementMeta(spark3p2sm, 11, 27, Finished, Available)

SynapseWidget(Synapse.DataFrame, 01a762fc-5dad-4134-91ca-e4250fc8e22a)

root
 |-- AcceptableUsePolicy: string (nullable = true)
 |-- AlertMessages: string (nullable = true)
 |-- Disability: string (nullable = true)
 |-- ESL: string (nullable = true)
 |-- ESLDateAssessed: string (nullable = true)
 |-- EconomicDisadvantage: string (nullable = true)
 |-- EducationSupport: string (nullable = true)
 |-- ElectronicIdList: string (nullable = true)
 |-- HomeSchooledStudent: string (nullable = true)
 |-- IndependentStudent: string (nullable = true)
 |-- IntegrationAide: string (nullable = true)
 |-- LocalCodeList: string (nullable = true)
 |-- LocalId_pseudonym: string (nullable = true)
 |-- MedicalAlertMessages: string (nullable = true)
 |-- MostRecent: string (nullable = true)
 |-- NationalUniqueStudentIdentifier: string (nullable = true)
 |-- OnTimeGraduationYear: string (nullable = true)
 |-- OtherIdList: string (nullable = true)
 |-- PersonInfo: string (nullable = true)
 |-- ProjectedGraduationYear: string (nullable = true)
 |-- RefId_pseudonym: string (nullab

SynapseWidget(Synapse.DataFrame, fabc1e1c-79ff-4702-bb46-c33c4c0c5a93)

root
 |-- LocalId: string (nullable = true)
 |-- PersonInfo: string (nullable = true)
 |-- RefId: string (nullable = true)
 |-- LocalId_pseudonym: string (nullable = true)
 |-- RefId_pseudonym: string (nullable = true)



SynapseWidget(Synapse.DataFrame, 13c040a3-16e2-4339-8a15-b555fb2abea2)

In [16]:
# Reset this example: run this cell to delete all of the example sif data in your workspace.
# Note: this drops ldb_sam_s2i_sif_v3p4p9, which the Appendix cell below references, so re-run the steps above before using the Appendix.
for sif_folder in ('stage1/Transactional/sif', 'stage2/Ingested/sif', 'stage2/Refined/sif'):
    oea.rm_if_exists(sif_folder)

# The lake databases are dropped explicitly (the last call stays a bare expression so its result is displayed).
oea.drop_lake_db('ldb_sam_s2i_sif_v3p4p9')
oea.drop_lake_db('ldb_sam_s2r_sif_v3p4p9')

StatementMeta(spark3p2sm, 11, 15, Finished, Available)

2022-11-27 20:52:32,227 - OEA - INFO - Database dropped: ldb_sam_s2i_sif_v3p4p9
2022-11-27 20:52:32,335 - OEA - INFO - Database dropped: ldb_sam_s2r_sif_v3p4p9


'Database dropped: ldb_sam_s2r_sif_v3p4p9'

# Appendix

In [19]:
# Produce a starting-point metadata file from the lake db, intended to be edited by hand afterwards.
metadata = oea.create_metadata_from_lake_db('ldb_sam_s2i_sif_v3p4p9')
# The file is written as metadata.csv under the sif folder in stage1/Transactional.
metadata_folder_url = oea.to_url('stage1/Transactional/sif')
dlw = DataLakeWriter(metadata_folder_url)
dlw.write('metadata.csv', metadata)

StatementMeta(spark3p2sm, 11, 18, Finished, Available)

In [None]:
# Create a sql db for the ingested SIF data
# NOTE(review): this path uses upper-case 'SIF', while every other path in this notebook uses lower-case 'sif'
# (e.g. 'stage2/Ingested/sif' in the reset cell) - confirm the casing (and whether a version folder such as
# 'v3.4.9' is expected) against where the ingested data actually lives in the data lake.
oea.create_sql_db('stage2/Ingested/SIF')