# Example data processing
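This example demonstrates how a data engineer uses the Open Education Analytics (OEA) framework to work with data from a new data source: landing batch data in stage1 of the data lake, ingesting it into stage2, and refining it with metadata.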

In [None]:
%run OEA_py

In [None]:
# 1) Choose the workspace - this determines where in the data lake you'll be reading from and writing to.
# Valid choices are 'dev', 'prod', or a sandbox with any name you like.
# Example: Sam the developer creates a 'sam' workspace, so his datasets are found in the data lake under oea/sandboxes/sam
oea.set_workspace('sam')

In [None]:
# 2) Land a batch data file into stage1 of the data lake.
# A test csv data file is pulled from github and landed in
# oea/sandboxes/sam/stage1/Transactional/contoso/v0.1/students/delta_batch_data/rundate=<utc datetime>
# NOTE(review): `requests` is not imported in this notebook; presumably it is brought into scope by `%run OEA_py` - confirm.
response = requests.get('https://raw.githubusercontent.com/microsoft/OpenEduAnalytics/main/framework/test_data/contoso_sis/day1/students/part1.csv')
# Fail fast on an HTTP error (e.g. 404) so that an error page is never landed in the data lake as if it were csv data.
response.raise_for_status()
data = response.text
oea.land(data, 'contoso/v0.1/students', 'students.csv', oea.DELTA_BATCH_DATA)

In [None]:
# 3) Verify that the data is in stage1 by reading it into a dataframe.
# Note that a "rundate" column has been added - it represents the datetime at which the batch data was landed in the data lake.
# (The path is a plain string literal; it has no placeholders, so no f-string is needed.)
df = oea.load_csv('stage1/Transactional/contoso/v0.1/students')
display(df)

In [None]:
# 4) Ingest the batch data into stage2.
# The first time you run this, you'll see an info message like "Number of new inbound rows processed: 2".
# If you run it a second time, the number of inbound rows processed will be 0, because the ingestion uses
# spark structured streaming to keep track of what data has already been processed.
oea.ingest('contoso/v0.1/students', 'SIS ID')

In [None]:
# 5) Ingesting data into stage2 causes OEA to create a lake db (a logical db that points to the data in the data lake).
# Because you are working in the 'sam' workspace, that lake db is called 'ldb_sam_s2i_contoso_v0p1'
# (click on Data in the left nav and you'll see the db listed under 'Lake database').
students_ingested_df = spark.sql("select * from ldb_sam_s2i_contoso_v0p1.students")
display(students_ingested_df)
students_ingested_df.printSchema()

In [None]:
# 6) Land some additional inbound batch data - with new and modified rows.
response = requests.get('https://raw.githubusercontent.com/microsoft/OpenEduAnalytics/main/framework/test_data/contoso_sis/day2/students/part1.csv')
# Fail fast on an HTTP error (e.g. 404) so that an error page is never landed in the data lake as if it were csv data.
response.raise_for_status()
data = response.text
oea.land(data, 'contoso/v0.1/students', 'students.csv', oea.DELTA_BATCH_DATA)

In [None]:
# 7) Ingest this latest batch of data.
# You don't have to specify which batch of data to process; OEA uses spark structured streaming to determine what files are new.
oea.ingest('contoso/v0.1/students', 'SIS ID')

In [None]:
# 8) Verify that the batch data was ingested and correctly merged with the previous data.

# Option A: load the ingested data into a dataframe directly from its path in the data lake...
students_by_path_df = oea.load('stage2/Ingested/contoso/v0.1/students')
display(students_by_path_df)

# Option B: ...or query the automatically created "Lake database".
students_by_lake_db_df = spark.sql("select * from ldb_sam_s2i_contoso_v0p1.students")
display(students_by_lake_db_df)
# Both options query the same data - the data stored at
# oea/sandboxes/sam/stage2/Ingested/contoso/v0.1/students in your data lake.

In [None]:
# 9) Once data has been ingested, the next step is to refine it through the use of metadata.
# `metadata` is kept as a notebook-level name because later cells index into it for the other data sets.
metadata_url = 'https://raw.githubusercontent.com/microsoft/OpenEduAnalytics/main/framework/test_data/contoso_sis/metadata.csv'
metadata = oea.get_metadata_from_url(metadata_url)
oea.refine('contoso/v0.1/students', metadata['students'], 'SIS ID')

In [None]:
# 10) Query the refined data tables in the lake db.
# Show the contents and the schema of each refined table in turn.
for refined_table in ('students', 'students_lookup'):
    refined_df = spark.sql(f"select * from ldb_sam_s2r_contoso_v0p1.{refined_table}")
    display(refined_df)
    refined_df.printSchema()

# The "lookup" table can be used for joins (people with restricted access won't be able to perform this query
# because they won't have access to data in the "sensitive" folder in the data lake).
lookup_join_query = (
    "select sl.Username, s.Grade "
    "from ldb_sam_s2r_contoso_v0p1.students_lookup sl, ldb_sam_s2r_contoso_v0p1.students s "
    "where sl.SIS_ID_pseudonym = s.SIS_ID_pseudonym"
)
display(spark.sql(lookup_join_query))

In [None]:
# 11) Land, ingest, and refine additional data sets (day 1).
# These data sets demonstrate the 2 other types of batch data - additive and snapshot.
# Each download is checked with raise_for_status() so that an HTTP error page (e.g. a 404)
# is never landed in the data lake as if it were csv data.
# This cell relies on `metadata`, which is loaded in step 9.
response = requests.get('https://raw.githubusercontent.com/microsoft/OpenEduAnalytics/main/framework/test_data/contoso_sis/day1/studentattendance/part1.csv')
response.raise_for_status()
data = response.text
oea.land(data, 'contoso/v0.1/studentattendance', 'part1.csv', oea.ADDITIVE_BATCH_DATA)

response = requests.get('https://raw.githubusercontent.com/microsoft/OpenEduAnalytics/main/framework/test_data/contoso_sis/day1/studentsectionmark/part1.csv')
response.raise_for_status()
data = response.text
oea.land(data, 'contoso/v0.1/studentsectionmark', 'part1.csv', oea.SNAPSHOT_BATCH_DATA)

oea.ingest('contoso/v0.1/studentattendance', 'id')
oea.ingest('contoso/v0.1/studentsectionmark', 'id')

oea.refine('contoso/v0.1/studentattendance', metadata['studentattendance'], 'id')
oea.refine('contoso/v0.1/studentsectionmark', metadata['studentsectionmark'], 'id')

In [None]:
# 12) Land, ingest, and refine additional data sets (day 2).
# This lands the day 2 batches for the same additive (studentattendance) and snapshot (studentsectionmark) data sets.
# Each download is checked with raise_for_status() so that an HTTP error page (e.g. a 404)
# is never landed in the data lake as if it were csv data.
# This cell relies on `metadata`, which is loaded in step 9.
response = requests.get('https://raw.githubusercontent.com/microsoft/OpenEduAnalytics/main/framework/test_data/contoso_sis/day2/studentattendance/part1.csv')
response.raise_for_status()
data = response.text
oea.land(data, 'contoso/v0.1/studentattendance', 'part1.csv', oea.ADDITIVE_BATCH_DATA)

response = requests.get('https://raw.githubusercontent.com/microsoft/OpenEduAnalytics/main/framework/test_data/contoso_sis/day2/studentsectionmark/part1.csv')
response.raise_for_status()
data = response.text
oea.land(data, 'contoso/v0.1/studentsectionmark', 'part1.csv', oea.SNAPSHOT_BATCH_DATA)

oea.ingest('contoso/v0.1/studentattendance', 'id')
oea.ingest('contoso/v0.1/studentsectionmark', 'id')

oea.refine('contoso/v0.1/studentattendance', metadata['studentattendance'], 'id')
oea.refine('contoso/v0.1/studentsectionmark', metadata['studentsectionmark'], 'id')

In [None]:
# 13) Reset this example: delete the contoso/v0.1 dataset, then drop both lake dbs
# (the "s2i" db queried after ingestion and the "s2r" db queried after refinement).
oea.delete_dataset('contoso/v0.1')
for lake_db_name in ('ldb_sam_s2i_contoso_v0p1', 'ldb_sam_s2r_contoso_v0p1'):
    oea.drop_lake_db(lake_db_name)

# Appendix

In [None]:
# The contents of a folder in the data lake can be listed with oea.ls, like this:
folders_to_list = (
    'stage1/Transactional/contoso/v0.1/students/delta_batch_data',
    'stage2/Ingested/contoso/v0.1/students',
)
for folder in folders_to_list:
    print(oea.ls(folder))

In [None]:
# You can also open a Delta table directly by its path in the data lake and display its contents.
# NOTE(review): `DeltaTable` is not imported in this notebook; presumably it is brought into scope by `%run OEA_py` - confirm.
lookup_table_url = oea.to_url('stage2/Refined/contoso/v0.1/sensitive/students_lookup')
dtbl = DeltaTable.forPath(spark, lookup_table_url)
display(dtbl.toDF())
# Optional example (left disabled on purpose): delete the rows matching a condition, e.g. those landed after a given rundate.
#dtbl.delete("rundate > '2022-11-04T14:39:51'")