# OEA Demo
This notebook demonstrates the batch processing features of the OEA framework.

In [None]:
%run /OEA_py

# Incremental batches

In [None]:
# Reset this example before running it again
# (deletes data in stage1np/example, stage2np/example, and stage2p/example)
oea.delete_data_source('example')

In [None]:
# Land the first batch of test data: two student records for the 'example' data source
df1 = spark.createDataFrame(
    [
        (1, 'Joe', 'English', '2021'),
        (2, 'Helen', 'English', '2021'),
    ],
    schema=['id', 'name', 'language', 'school_year'],
)
oea.land('example', 'student', df1)

# Read back what has been landed in stage1 and display it
df = oea.load_csv('stage1np/example/student')
df.show()

In [None]:
# Schema for the student entity, one entry per column.
# NOTE(review): entries look like [column name, data type, operation] - confirm against OEA_py
example_schema = [
    ['id', 'string', 'hash'],
    ['name', 'string', 'mask'],
    ['language', 'string', 'no-op'],
    ['school_year', 'string', 'partition-by'],
]

# Ingest the first batch of test data into stage2
oea.ingest_incremental_data('example', 'student', example_schema, 'school_year', 'id')

# Show what's in stage2: the student_lookup table first, then the student_pseudo table
for stage2_path in ('stage2np/example/student_lookup', 'stage2p/example/student_pseudo'):
    df = oea.load_delta(stage2_path)
    df.show()

In [None]:
# Land the second batch of test data: two additional student records
df2 = spark.createDataFrame(
    [
        (3, 'Elisa', 'Spanish', '2021'),
        (4, 'Lily', 'English', '2021'),
    ],
    schema=['id', 'name', 'language', 'school_year'],
)
oea.land('example', 'student', df2)

# Display the comprehensive set of data landed in stage1 so far
df = oea.load_csv('stage1np/example/student')
df.show()

In [None]:
# Ingest the second batch of test data into stage2
# (reuses example_schema defined in the earlier ingest cell)
oea.ingest_incremental_data('example', 'student', example_schema, 'school_year', 'id')

# Show what's in stage2: the student_lookup table first, then the student_pseudo table
for stage2_path in ('stage2np/example/student_lookup', 'stage2p/example/student_pseudo'):
    df = oea.load_delta(stage2_path)
    df.show()

# df still refers to the student_pseudo table here; print its schema
df.printSchema()

# Delta batches

In [None]:
# Reset this example before running it again
# (deletes data in stage1np/delta_example, stage2np/delta_example, and stage2p/delta_example)
oea.delete_data_source('delta_example')

In [None]:
# Land the first batch of test data: two student records for the 'delta_example' data source
df1 = spark.createDataFrame(
    [
        (1, 'Joseph', 'English', '2021'),
        (2, 'Helen', 'English', '2021'),
    ],
    schema=['id', 'name', 'language', 'school_year'],
)
oea.land('delta_example', 'student', df1)

# Read back what has been landed in stage1 and display it
df = oea.load_csv('stage1np/delta_example/student')
df.show()

In [None]:
# Schema for the student entity, one entry per column.
# NOTE(review): entries look like [column name, data type, operation] - confirm against OEA_py
example_schema = [
    ['id', 'string', 'hash'],
    ['name', 'string', 'mask'],
    ['language', 'string', 'no-op'],
    ['school_year', 'string', 'partition-by'],
]

# Ingest the first batch of test data into stage2
oea.ingest_delta_data('delta_example', 'student', example_schema, 'school_year')

# Show what's in stage2: the student_lookup table first, then the student_pseudo table
for stage2_path in ('stage2np/delta_example/student_lookup', 'stage2p/delta_example/student_pseudo'):
    df = oea.load_delta(stage2_path)
    df.show()

In [None]:
# Land the second batch of test data: id 1 reappears with a different language, id 3 is new
df2 = spark.createDataFrame(
    [
        (1, 'Joseph', 'Spanish', '2021'),
        (3, 'Elisa', 'Spanish', '2021'),
    ],
    schema=['id', 'name', 'language', 'school_year'],
)
oea.land('delta_example', 'student', df2)

# Read back what has been landed in stage1 and display it
df = oea.load_csv('stage1np/delta_example/student')
df.show()

In [None]:
# Ingest the second batch of test data into stage2
# (reuses example_schema defined in the earlier ingest cell)
oea.ingest_delta_data('delta_example', 'student', example_schema, 'school_year')

# Show what's in stage2: the student_lookup table first, then the student_pseudo table
for stage2_path in ('stage2np/delta_example/student_lookup', 'stage2p/delta_example/student_pseudo'):
    df = oea.load_delta(stage2_path)
    df.show()

# df still refers to the student_pseudo table here; print its schema
df.printSchema()

# Snapshot batches

In [None]:
# Reset this example before running it again
# (deletes data in stage1np/snapshot_example, stage2np/snapshot_example, and stage2p/snapshot_example)
oea.delete_data_source('snapshot_example')

In [None]:
# Land the first batch of test data in stage1: two student records for 'snapshot_example'
df1 = spark.createDataFrame(
    [
        (1, 'Joseph', 'English', '2021'),
        (2, 'Helen', 'English', '2021'),
    ],
    schema=['id', 'name', 'language', 'school_year'],
)
oea.land('snapshot_example', 'student', df1)

# Read back what has been landed in stage1 and display it
df = oea.load_csv('stage1np/snapshot_example/student')
df.show()

In [None]:
# Schema for the student entity, one entry per column.
# NOTE(review): entries look like [column name, data type, operation] - confirm against OEA_py
example_schema = [
    ['id', 'string', 'hash'],
    ['name', 'string', 'mask'],
    ['language', 'string', 'no-op'],
    ['school_year', 'string', 'partition-by'],
]

# Process the landed data from stage1 into stage2
oea.ingest_snapshot_data('snapshot_example', 'student', example_schema, 'school_year')

# Show what's in stage2: the student_lookup table first, then the student_pseudo table
for stage2_path in ('stage2np/snapshot_example/student_lookup', 'stage2p/snapshot_example/student_pseudo'):
    df = oea.load_delta(stage2_path)
    df.show()

# df still refers to the student_pseudo table here; print its schema
df.printSchema()

In [None]:
# Land the second test data batch in stage1: id 1 reappears with a different language, id 3 is new
df2 = spark.createDataFrame(
    [
        (1, 'Joseph', 'Spanish', '2021'),
        (3, 'Elisa', 'Spanish', '2021'),
    ],
    schema=['id', 'name', 'language', 'school_year'],
)
oea.land('snapshot_example', 'student', df2)

# Read back what has been landed in stage1 and display it
df = oea.load_csv('stage1np/snapshot_example/student')
df.show()

In [None]:
# Schema for the student entity (same definition as in the first snapshot ingest cell,
# repeated here so this cell does not depend on that assignment)
example_schema = [
    ['id', 'string', 'hash'],
    ['name', 'string', 'mask'],
    ['language', 'string', 'no-op'],
    ['school_year', 'string', 'partition-by'],
]

# Process the second landed batch from stage1 into stage2
oea.ingest_snapshot_data('snapshot_example', 'student', example_schema, 'school_year')

# Show what's in stage2: the student_lookup table first, then the student_pseudo table
for stage2_path in ('stage2np/snapshot_example/student_lookup', 'stage2p/snapshot_example/student_pseudo'):
    df = oea.load_delta(stage2_path)
    df.show()

# df still refers to the student_pseudo table here; print its schema
df.printSchema()