# Example data processing
This example demonstrates how a data engineer uses the Open Education Analytics (OEA) framework to land, ingest, and refine data from a new data source.

In [2]:
%run OEA_py

StatementMeta(, 23, -1, Finished, Available)

2022-11-15 13:02:22,909 - OEA - INFO - Now using workspace: dev
2022-11-15 13:02:22,909 - OEA - INFO - OEA initialized.


In [3]:
# 1) Set the workspace (this determines where in the data lake you'll be writing to and reading from).
# You can work in 'dev', 'prod', or a sandbox with any name you choose.
# For example, Sam the developer can create a 'sam' workspace and expect to find his datasets in the data lake under oea/sandboxes/sam
# Note: the lake database names queried in later steps (e.g. 'ldb_sam_s2i_contoso_v0p1') contain this workspace name,
# so if you pick a different workspace here, adjust those queries to match.
oea.set_workspace('sam')

StatementMeta(spark3p2sm, 23, 3, Finished, Available)

2022-11-15 13:02:26,530 - OEA - INFO - Now using workspace: sam


In [5]:
# 2) Land a batch data file into stage1 of the data lake.
# A test csv data file is pulled from github and landed under
# oea/sandboxes/sam/stage1/Transactional/contoso/v0.1/students/delta_batch_data/rundate=<utc datetime>
# (the call to oea.land is the last expression, so the landed path is shown as the cell output).
students_day1_url = 'https://raw.githubusercontent.com/microsoft/OpenEduAnalytics/main/framework/test_data/contoso_sis/day1/students/part1.csv'
data = oea.get_text_from_url(students_day1_url)
oea.land(data, 'contoso/v0.1/students', 'students.csv', oea.DELTA_BATCH_DATA)

StatementMeta(spark3p2sm, 23, 5, Finished, Available)

'stage1/Transactional/contoso/v0.1/students/delta_batch_data/rundate=2022-11-15 13:03:25/students.csv'

In [6]:
# 3) You can verify that the data is in stage1 by reading it into a dataframe.
# Note that a "rundate" column has been added - representing the datetime that the batch data was landed in the data lake.
df = oea.load_csv('stage1/Transactional/contoso/v0.1/students')
display(df)

StatementMeta(spark3p2sm, 23, 6, Finished, Available)

SynapseWidget(Synapse.DataFrame, 76e539ce-6408-4e5d-abd7-f4a5572bdfb1)

In [7]:
# 4) The next step is to ingest the batch data into stage2
# Note that when you run this the first time, you'll see an info message like "Number of new inbound rows processed: 2".
# If you run this a second time, the number of inbound rows processed will be 0 because the ingestion uses spark structured streaming to keep track of what data has already been processed.
# 'SIS ID' is passed as the second argument here and again to oea.refine in step 9 - presumably it names the
# primary key column of the students data (confirm against OEA_py).
oea.ingest('contoso/v0.1/students', 'SIS ID')

StatementMeta(spark3p2sm, 23, 7, Finished, Available)

2022-11-15 13:04:15,268 - OEA - INFO - Ingesting from: stage1/Transactional/contoso/v0.1/students, batch type of: delta, source data format of: csv
2022-11-15 13:04:16,077 - py4j.java_gateway - INFO - Callback Server Starting
2022-11-15 13:04:16,078 - py4j.java_gateway - INFO - Socket listening on ('127.0.0.1', 40449)
2022-11-15 13:04:18,892 - py4j.java_gateway - INFO - Callback Connection ready to receive messages
2022-11-15 13:04:18,893 - py4j.java_gateway - INFO - Received command c on object id p0
2022-11-15 13:04:40,727 - OEA - INFO - Number of new inbound rows processed: 2


2

In [8]:
# 5) When data is ingested into stage2 of the data lake, OEA creates a lake db (which is a logical db that points to the data in the data lake).
# In this example, since you are working in the 'sam' workspace, the lake db created is called 'ldb_sam_s2i_contoso_v0p1' (if you click on Data in the left nav, you'll see the db listed under 'Lake database' )
df = spark.sql("select * from ldb_sam_s2i_contoso_v0p1.students")
display(df)
# Tip: uncomment the next line to print the column names and types of the ingested table.
#df.printSchema()

StatementMeta(spark3p2sm, 23, 8, Finished, Available)

SynapseWidget(Synapse.DataFrame, 6b22f793-8d23-482d-88a8-2ff27e83d560)

In [9]:
# 6) Land some additional inbound batch data - the day2 file contains new and modified rows.
# As in step 2, the landed path is shown as the cell output.
students_day2_url = 'https://raw.githubusercontent.com/microsoft/OpenEduAnalytics/main/framework/test_data/contoso_sis/day2/students/part1.csv'
data = oea.get_text_from_url(students_day2_url)
oea.land(data, 'contoso/v0.1/students', 'students.csv', oea.DELTA_BATCH_DATA)

StatementMeta(spark3p2sm, 23, 9, Finished, Available)

'stage1/Transactional/contoso/v0.1/students/delta_batch_data/rundate=2022-11-15 13:04:57/students.csv'

In [11]:
# 7) Ingest this latest batch of data.
# Note that you don't have to specify what batch of data to process; OEA uses spark structured streaming to determine what files are new.
# If this reports "Number of new inbound rows processed: 0", the batch had already been ingested by an earlier run of this cell.
oea.ingest('contoso/v0.1/students', 'SIS ID')

StatementMeta(spark3p2sm, 23, 11, Finished, Available)

2022-11-15 13:05:26,613 - OEA - INFO - Ingesting from: stage1/Transactional/contoso/v0.1/students, batch type of: delta, source data format of: csv
2022-11-15 13:05:27,811 - OEA - INFO - Number of new inbound rows processed: 0


0

In [12]:
# 8) Verify that the batch data was ingested and correctly merged with the previous data.
# Both approaches below query the same data - the data stored at
# oea/sandboxes/sam/stage2/Ingested/contoso/v0.1/students in your data lake.

# Approach 1: load the ingested data into a dataframe directly from its path...
ingested_students_path = 'stage2/Ingested/contoso/v0.1/students'
df = oea.load(ingested_students_path)
display(df)

# Approach 2: ...or go through the automatically created "Lake database".
ingested_students_query = "select * from ldb_sam_s2i_contoso_v0p1.students"
df = spark.sql(ingested_students_query)
display(df)

StatementMeta(spark3p2sm, 23, 12, Finished, Available)

SynapseWidget(Synapse.DataFrame, e10b1c1e-ea51-41de-b6f1-920647c7e8f6)

SynapseWidget(Synapse.DataFrame, 1ac1e1d5-5380-4d6a-9ddf-88390ecb86be)

In [13]:
# 9) After ingesting data, the next step is to refine the data through the use of metadata
# The metadata is downloaded from a csv file on github and looked up by entity name ('students' here).
# On success OEA logs how many rows were processed from stage2/Ingested into stage2/Refined.
# 'SIS ID' is the same value that was passed to oea.ingest for this data set (presumably the key column - confirm against OEA_py).
metadata = oea.get_metadata_from_url('https://raw.githubusercontent.com/microsoft/OpenEduAnalytics/main/framework/test_data/contoso_sis/metadata.csv')
oea.refine('contoso/v0.1/students', metadata['students'], 'SIS ID')

StatementMeta(spark3p2sm, 23, 13, Finished, Available)

2022-11-15 13:06:57,299 - OEA - INFO - Processed 3 updated rows from stage2/Ingested/contoso/v0.1/students into stage2/Refined


3

In [14]:
# 10) Now you can query the refined data tables in the lake db
# The refined lake db for the 'sam' workspace is 'ldb_sam_s2r_contoso_v0p1'; it is queried here for a "students" table and a "students_lookup" table.
df = spark.sql("select * from ldb_sam_s2r_contoso_v0p1.students")
display(df)
df = spark.sql("select * from ldb_sam_s2r_contoso_v0p1.students_lookup")
display(df)
# You can use the "lookup" table for joins (people with restricted access won't be able to perform this query because they won't have access to data in the "sensitive" folder in the data lake)
# The join below matches the two tables on their shared SIS_ID_pseudonym column.
df = spark.sql("select sl.Username, s.Grade from ldb_sam_s2r_contoso_v0p1.students_lookup sl, ldb_sam_s2r_contoso_v0p1.students s where sl.SIS_ID_pseudonym = s.SIS_ID_pseudonym")
display(df)

StatementMeta(spark3p2sm, 23, 14, Finished, Available)

SynapseWidget(Synapse.DataFrame, 924a0bb4-69c8-4cc7-9aa8-5b0b3ad361e4)

SynapseWidget(Synapse.DataFrame, 4123933f-0954-45c3-a154-29f3d2c711aa)

SynapseWidget(Synapse.DataFrame, ac2ecca3-e71e-4dda-9d78-04d133992a19)

In [19]:
# 11) Land, ingest, and refine additional data sets.
# These data sets demonstrate the 2 other types of batch data - additive and snapshot.
# NOTE(review): the output saved with this notebook shows this cell failing with a Delta Lake error ("Cannot perform Merge
# as multiple source rows matched and attempted to modify the same target row") after both ingest calls had already logged
# their row counts - so the failure appears to come from the refine calls. Presumably more than one ingested row shares
# the same 'id' value; TODO confirm the cause and fix it before relying on this step.

# Additive batch data: land the day1 and day2 student attendance files.
data = oea.get_text_from_url('https://raw.githubusercontent.com/microsoft/OpenEduAnalytics/main/framework/test_data/contoso_sis/day1/studentattendance/part1.csv')
oea.land(data, 'contoso/v0.1/studentattendance', 'part1.csv', oea.ADDITIVE_BATCH_DATA)
data = oea.get_text_from_url('https://raw.githubusercontent.com/microsoft/OpenEduAnalytics/main/framework/test_data/contoso_sis/day2/studentattendance/part1.csv')
oea.land(data, 'contoso/v0.1/studentattendance', 'part1.csv', oea.ADDITIVE_BATCH_DATA)

# Snapshot batch data: land the day1 and day2 student section mark files.
data = oea.get_text_from_url('https://raw.githubusercontent.com/microsoft/OpenEduAnalytics/main/framework/test_data/contoso_sis/day1/studentsectionmark/part1.csv')
oea.land(data, 'contoso/v0.1/studentsectionmark', 'part1.csv', oea.SNAPSHOT_BATCH_DATA)
data = oea.get_text_from_url('https://raw.githubusercontent.com/microsoft/OpenEduAnalytics/main/framework/test_data/contoso_sis/day2/studentsectionmark/part1.csv')
oea.land(data, 'contoso/v0.1/studentsectionmark', 'part1.csv', oea.SNAPSHOT_BATCH_DATA)

# Ingest both data sets into stage2; 'id' is passed as the second argument for both.
oea.ingest('contoso/v0.1/studentattendance', 'id')
oea.ingest('contoso/v0.1/studentsectionmark', 'id')

# Refine both data sets using the metadata entry that matches each entity name.
# The metadata is fetched again here so that this cell does not depend on step 9 having been run.
metadata = oea.get_metadata_from_url('https://raw.githubusercontent.com/microsoft/OpenEduAnalytics/main/framework/test_data/contoso_sis/metadata.csv')
oea.refine('contoso/v0.1/studentattendance', metadata['studentattendance'], 'id')
oea.refine('contoso/v0.1/studentsectionmark', metadata['studentsectionmark'], 'id')

StatementMeta(spark3p2sm, 23, 19, Finished, Available)

2022-11-15 13:13:25,935 - OEA - INFO - Ingesting from: stage1/Transactional/contoso/v0.1/studentattendance, batch type of: additive, source data format of: csv
2022-11-15 13:13:27,802 - py4j.java_gateway - INFO - Callback Connection ready to receive messages
2022-11-15 13:13:27,803 - py4j.java_gateway - INFO - Received command c on object id p10
2022-11-15 13:13:30,160 - OEA - INFO - Number of new inbound rows processed: 4392
2022-11-15 13:13:30,429 - OEA - INFO - Ingesting from: stage1/Transactional/contoso/v0.1/studentsectionmark, batch type of: snapshot, source data format of: csv
2022-11-15 13:13:31,918 - py4j.java_gateway - INFO - Received command c on object id p11
2022-11-15 13:13:34,297 - OEA - INFO - Number of new inbound rows processed: 24


Py4JJavaError: An error occurred while calling o3251.execute.
: java.lang.UnsupportedOperationException: Cannot perform Merge as multiple source rows matched and attempted to modify the same
target row in the Delta table in possibly conflicting ways. By SQL semantics of Merge,
when multiple source rows match on the same target row, the result may be ambiguous
as it is unclear which source row should be used to update or delete the matching
target row. You can preprocess the source table to eliminate the possibility of
multiple matches. Please refer to
https://docs.delta.io/latest/delta-update.html#upsert-into-a-table-using-merge
	at org.apache.spark.sql.delta.DeltaErrors$.multipleSourceRowMatchingTargetRowInMergeException(DeltaErrors.scala:718)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.$anonfun$findTouchedFiles$1(MergeIntoCommand.scala:423)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.recordMergeOperation(MergeIntoCommand.scala:735)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.findTouchedFiles(MergeIntoCommand.scala:367)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.$anonfun$run$2(MergeIntoCommand.scala:313)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.$anonfun$run$2$adapted(MergeIntoCommand.scala:295)
	at org.apache.spark.sql.delta.DeltaLog.withNewTransaction(DeltaLog.scala:226)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.$anonfun$run$1(MergeIntoCommand.scala:295)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile(DeltaLogging.scala:121)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile$(DeltaLogging.scala:119)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.recordFrameProfile(MergeIntoCommand.scala:210)
	at org.apache.spark.sql.delta.metering.DeltaLogging.$anonfun$recordDeltaOperation$5(DeltaLogging.scala:115)
	at com.microsoft.spark.telemetry.delta.SynapseLoggingShim.recordOperation(SynapseLoggingShim.scala:95)
	at com.microsoft.spark.telemetry.delta.SynapseLoggingShim.recordOperation$(SynapseLoggingShim.scala:81)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.recordOperation(MergeIntoCommand.scala:210)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordDeltaOperation(DeltaLogging.scala:114)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordDeltaOperation$(DeltaLogging.scala:99)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.recordDeltaOperation(MergeIntoCommand.scala:210)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.run(MergeIntoCommand.scala:293)
	at io.delta.tables.DeltaMergeBuilder.$anonfun$execute$1(DeltaMergeBuilder.scala:231)
	at org.apache.spark.sql.delta.util.AnalysisHelper.improveUnsupportedOpError(AnalysisHelper.scala:105)
	at org.apache.spark.sql.delta.util.AnalysisHelper.improveUnsupportedOpError$(AnalysisHelper.scala:91)
	at io.delta.tables.DeltaMergeBuilder.improveUnsupportedOpError(DeltaMergeBuilder.scala:123)
	at io.delta.tables.DeltaMergeBuilder.execute(DeltaMergeBuilder.scala:207)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:750)


In [17]:
# 12) Reset this example
oea.delete('stage1/Transactional/contoso')
oea.delete('stage2/Ingested/contoso')
oea.delete('stage2/Refined/contoso')
oea.drop_lake_db('ldb_sam_s2i_contoso_v0p1')
oea.drop_lake_db('ldb_sam_s2r_contoso_v0p1')

StatementMeta(spark3p2sm, 23, 17, Finished, Available)

2022-11-15 13:12:02,837 - OEA - INFO - Database dropped: ldb_sam_s2i_contoso_v0p1
2022-11-15 13:12:04,020 - OEA - INFO - Database dropped: ldb_sam_s2r_contoso_v0p1


'Database dropped: ldb_sam_s2r_contoso_v0p1'

# Appendix

In [None]:
# You can list the contents of a folder in the data lake with oea.ls.
# Here: the landed stage1 delta batches, followed by the ingested stage2 students data.
for lake_folder in ('stage1/Transactional/contoso/v0.1/students/delta_batch_data',
                    'stage2/Ingested/contoso/v0.1/students'):
    print(oea.ls(lake_folder))

In [None]:
# Open the refined "students_lookup" data (stored under the "sensitive" folder) directly as a Delta table and display its contents.
# NOTE(review): DeltaTable is not imported anywhere in this notebook - presumably it is made available by the `%run OEA_py` cell; confirm.
dtbl = DeltaTable.forPath(spark, oea.to_url('stage2/Refined/contoso/v0.1/sensitive/students_lookup'))
display(dtbl.toDF())
# Uncomment the next line to delete the rows whose rundate is later than the given timestamp.
#dtbl.delete("rundate > '2022-11-04T14:39:51'")