# Tests for the OEA framework

In [1]:
%run OEA_0p7_py

StatementMeta(, 32, -1, Finished, Available)

In [2]:
# Create the OEA helper used by every test cell below and point it at the 'sandbox1' workspace.
# NOTE(review): both constructor arguments are hard-coded literals; if the second one is a
# secret or salt, it should be read from a secure store rather than committed - confirm.
oea = OEA('stoeacisd3927b', '1234')
oea.use_workspace('sandbox1')

StatementMeta(spark3p2sm, 32, 2, Finished, Available)

2022-10-10 03:39:59,700 - OEA - INFO - OEA initialized.
2022-10-10 03:39:59,700 - OEA - INFO - Now using workspace: sandbox1


In [18]:
def reset_additive_data_tests():
    """Call oea.rm_if_exists on the stage1 and stage2 studentattendance folders used by the additive tests."""
    entity_paths = (
        'stage1/Transactional/contoso_sis/v0.1/studentattendance',
        'stage2/Ingested/contoso_sis/v0.1/studentattendance',
    )
    for entity_path in entity_paths:
        oea.rm_if_exists(entity_path)

def land_studentattendance_day1(expected_record_count):
    """Land the day1 studentattendance test file in stage1 and verify the landed record count.

    Args:
        expected_record_count: number of records expected when the path returned by
            oea.land_data_from_url is read back with oea.load_csv.

    Raises:
        AssertionError: if the record count differs from expected_record_count.
    """
    path = oea.land_data_from_url('https://raw.githubusercontent.com/microsoft/OpenEduAnalytics/gene/v0.7dev/framework/test_data/contoso_sis/day1/studentattendance/part1.csv', 'stage1/Transactional/contoso_sis/v0.1/studentattendance/additive_batch_data')
    # Count once: the compared and the reported number are then the same value, and
    # count() is not executed a second time just to build the failure message.
    actual_record_count = oea.load_csv(path).count()
    assert actual_record_count == expected_record_count, f'Expected {expected_record_count} records in landed data, but found {actual_record_count}'

def land_studentattendance_day2(expected_record_count):
    """Land the day2 studentattendance test file in stage1 and verify the landed record count.

    Args:
        expected_record_count: number of records expected when the path returned by
            oea.land_data_from_url is read back with oea.load_csv.

    Raises:
        AssertionError: if the record count differs from expected_record_count.
    """
    path = oea.land_data_from_url('https://raw.githubusercontent.com/microsoft/OpenEduAnalytics/gene/v0.7dev/framework/test_data/contoso_sis/day2/studentattendance/part1.csv', 'stage1/Transactional/contoso_sis/v0.1/studentattendance/additive_batch_data')
    # Count once: the compared and the reported number are then the same value, and
    # count() is not executed a second time just to build the failure message.
    actual_record_count = oea.load_csv(path).count()
    assert actual_record_count == expected_record_count, f'Expected {expected_record_count} records in landed data, but found {actual_record_count}'

def ingest_studentattendance(expected_record_count):
    """Ingest studentattendance from stage1 and verify the record count of the stage2 ingested data.

    Args:
        expected_record_count: number of records expected in
            'stage2/Ingested/contoso_sis/v0.1/studentattendance' after oea.ingest returns.

    Raises:
        AssertionError: if the ingested record count differs from expected_record_count.
    """
    oea.ingest('stage1/Transactional/contoso_sis/v0.1/studentattendance')
    # Count once so the compared and the reported number are the same value.
    actual_record_count = oea.load('stage2/Ingested/contoso_sis/v0.1/studentattendance').count()
    # The check is on the stage2 (ingested) data, so the message says so instead of "landed data".
    assert actual_record_count == expected_record_count, f'Expected {expected_record_count} records in ingested data, but found {actual_record_count}'

# Additive-data test sequence for studentattendance. The steps are order-dependent:
# each expected count assumes the previous steps have already run.
reset_additive_data_tests()
# test1 - Land the first batch of studentattendance data
land_studentattendance_day1(1464)
# test2 - Ingest the data from stage1 into stage2
ingest_studentattendance(1464)
# test3 - run the same ingestion a second time and verify that it doesn't change what was ingested (ingestion is idempotent via use of _checkpoints)
ingest_studentattendance(1464)
# test4 - Land the second batch of studentattendance data (expected landed count is 2928, twice the day1 count)
land_studentattendance_day2(2928)
# test5 - Ingest the data from stage1 into stage2 (expected stage2 count is 4392 = 1464 + 2928)
ingest_studentattendance(4392)



In [19]:
def reset_delta_data_tests():
    """Call oea.rm_if_exists on the stage1 and stage2 students folders used by the delta tests."""
    entity_paths = (
        'stage1/Transactional/contoso_sis/v0.1/students',
        'stage2/Ingested/contoso_sis/v0.1/students',
    )
    for entity_path in entity_paths:
        oea.rm_if_exists(entity_path)

def land_students_day1(expected_record_count):
    """Land the day1 students test file in stage1 and verify the landed record count.

    Args:
        expected_record_count: number of records expected when the path returned by
            oea.land_data_from_url is read back with oea.load_csv.

    Raises:
        AssertionError: if the record count differs from expected_record_count.
    """
    path = oea.land_data_from_url('https://raw.githubusercontent.com/microsoft/OpenEduAnalytics/gene/v0.7dev/framework/test_data/contoso_sis/day1/students/part1.csv', 'stage1/Transactional/contoso_sis/v0.1/students/delta_batch_data')
    # Count once: the compared and the reported number are then the same value, and
    # count() is not executed a second time just to build the failure message.
    actual_record_count = oea.load_csv(path).count()
    assert actual_record_count == expected_record_count, f'Expected {expected_record_count} records in landed data, but found {actual_record_count}'

def land_students_day2(expected_record_count):
    """Land the day2 students test file in stage1 and verify the landed record count.

    Args:
        expected_record_count: number of records expected when the path returned by
            oea.land_data_from_url is read back with oea.load_csv.

    Raises:
        AssertionError: if the record count differs from expected_record_count.
    """
    path = oea.land_data_from_url('https://raw.githubusercontent.com/microsoft/OpenEduAnalytics/gene/v0.7dev/framework/test_data/contoso_sis/day2/students/part1.csv', 'stage1/Transactional/contoso_sis/v0.1/students/delta_batch_data')
    # Count once: the compared and the reported number are then the same value, and
    # count() is not executed a second time just to build the failure message.
    actual_record_count = oea.load_csv(path).count()
    assert actual_record_count == expected_record_count, f'Expected {expected_record_count} records in landed data, but found {actual_record_count}'

def ingest_students(expected_record_count):
    """Ingest students from stage1 (passing 'SIS_ID' to oea.ingest) and verify the stage2 record count.

    Args:
        expected_record_count: number of records expected in
            'stage2/Ingested/contoso_sis/v0.1/students' after oea.ingest returns.

    Raises:
        AssertionError: if the ingested record count differs from expected_record_count.
    """
    # 'SIS_ID' is presumably the primary-key column used to merge delta batches - confirm against oea.ingest.
    oea.ingest('stage1/Transactional/contoso_sis/v0.1/students', 'SIS_ID')
    # Count once so the compared and the reported number are the same value.
    actual_record_count = oea.load('stage2/Ingested/contoso_sis/v0.1/students').count()
    # The check is on the stage2 (ingested) data, so the message says so instead of "landed data".
    assert actual_record_count == expected_record_count, f'Expected {expected_record_count} records in ingested data, but found {actual_record_count}'

# Delta-data test sequence for students. The steps are order-dependent:
# each expected count assumes the previous steps have already run.
reset_delta_data_tests()
# test1 - Land the first batch of students data
land_students_day1(1)
# test2 - Ingest the data from stage1 into stage2
ingest_students(1)
# test3 - run the same ingestion a second time and verify that it doesn't change what was ingested (ingestion is idempotent via use of _checkpoints)
ingest_students(1)
# test4 - Land the second batch of students data
land_students_day2(2)
# test5 - Ingest the data from stage1 into stage2
ingest_students(2)

In [20]:
def reset_snapshot_data_tests():
    """Call oea.rm_if_exists on the stage1 and stage2 studentsectionmark folders used by the snapshot tests."""
    entity_paths = (
        'stage1/Transactional/contoso_sis/v0.1/studentsectionmark',
        'stage2/Ingested/contoso_sis/v0.1/studentsectionmark',
    )
    for entity_path in entity_paths:
        oea.rm_if_exists(entity_path)

def land_studentsectionmark_day1(expected_record_count):
    """Land the day1 studentsectionmark test file in stage1 and verify the landed record count.

    NOTE(review): this helper belongs to the snapshot tests but lands into a
    'delta_batch_data' folder - confirm whether 'snapshot_batch_data' was intended.

    Args:
        expected_record_count: number of records expected when the path returned by
            oea.land_data_from_url is read back with oea.load_csv.

    Raises:
        AssertionError: if the record count differs from expected_record_count.
    """
    path = oea.land_data_from_url('https://raw.githubusercontent.com/microsoft/OpenEduAnalytics/gene/v0.7dev/framework/test_data/contoso_sis/day1/studentsectionmark/part1.csv', 'stage1/Transactional/contoso_sis/v0.1/studentsectionmark/delta_batch_data')
    # Count once: the compared and the reported number are then the same value, and
    # count() is not executed a second time just to build the failure message.
    actual_record_count = oea.load_csv(path).count()
    assert actual_record_count == expected_record_count, f'Expected {expected_record_count} records in landed data, but found {actual_record_count}'

def land_studentsectionmark_day2(expected_record_count):
    """Land the day2 studentsectionmark test file in stage1 and verify the landed record count.

    NOTE(review): this helper belongs to the snapshot tests but lands into a
    'delta_batch_data' folder - confirm whether 'snapshot_batch_data' was intended.

    Args:
        expected_record_count: number of records expected when the path returned by
            oea.land_data_from_url is read back with oea.load_csv.

    Raises:
        AssertionError: if the record count differs from expected_record_count.
    """
    path = oea.land_data_from_url('https://raw.githubusercontent.com/microsoft/OpenEduAnalytics/gene/v0.7dev/framework/test_data/contoso_sis/day2/studentsectionmark/part1.csv', 'stage1/Transactional/contoso_sis/v0.1/studentsectionmark/delta_batch_data')
    # Count once: the compared and the reported number are then the same value, and
    # count() is not executed a second time just to build the failure message.
    actual_record_count = oea.load_csv(path).count()
    assert actual_record_count == expected_record_count, f'Expected {expected_record_count} records in landed data, but found {actual_record_count}'

def ingest_studentsectionmark(expected_record_count):
    """Ingest studentsectionmark from stage1 and verify the record count of the stage2 ingested data.

    Args:
        expected_record_count: number of records expected in
            'stage2/Ingested/contoso_sis/v0.1/studentsectionmark' after oea.ingest returns.

    Raises:
        AssertionError: if the ingested record count differs from expected_record_count.
    """
    oea.ingest('stage1/Transactional/contoso_sis/v0.1/studentsectionmark')
    # Count once so the compared and the reported number are the same value.
    actual_record_count = oea.load('stage2/Ingested/contoso_sis/v0.1/studentsectionmark').count()
    # The check is on the stage2 (ingested) data, so the message says so instead of "landed data".
    assert actual_record_count == expected_record_count, f'Expected {expected_record_count} records in ingested data, but found {actual_record_count}'

# Snapshot-data test sequence for studentsectionmark. The steps are order-dependent:
# each expected count assumes the previous steps have already run.
reset_snapshot_data_tests()
# test1 - Land the first batch of studentsectionmark data
land_studentsectionmark_day1(12)
# test2 - Ingest the data from stage1 into stage2
ingest_studentsectionmark(12)
# test3 - run the same ingestion a second time and verify that it doesn't change what was ingested (ingestion is idempotent via use of _checkpoints)
ingest_studentsectionmark(12)
# test4 - Land the second batch of studentsectionmark data
land_studentsectionmark_day2(24)
# test5 - Ingest the data from stage1 into stage2
ingest_studentsectionmark(24)

In [None]:
def refine_contoso_sis(df_source):
    """Pseudonymize a studentattendance dataframe and upsert both results under stage2/Refined.

    The metadata is fetched with oea.get_metadata_from_url and its 'studentattendance'
    entry is passed to oea.pseudonymize. The first returned dataframe is upserted to the
    'general' path and the second to the 'sensitive' path.

    Args:
        df_source: the dataframe to refine; this function is passed to oea.process as a callback.
    """
    metadata_url = 'https://raw.githubusercontent.com/microsoft/OpenEduAnalytics/gene/v0.7dev/modules/module_catalog/Student_and_School_Data_Systems/metadata.csv'
    entity_metadata = oea.get_metadata_from_url(metadata_url)['studentattendance']
    pseudonymized, lookup = oea.pseudonymize(df_source, entity_metadata)
    for frame, destination in (
        (pseudonymized, 'stage2/Refined/contoso_sis/v0.1/general/studentattendance'),
        (lookup, 'stage2/Refined/contoso_sis/v0.1/sensitive/studentattendance'),
    ):
        oea.upsert(frame, destination)

# Run refine_contoso_sis via oea.process against the ingested studentattendance data.
oea.process('stage2/Ingested/contoso_sis/v0.1/studentattendance', refine_contoso_sis)

# Display the refined output written under stage2/Refined: the 'general' path first, then the 'sensitive' path.
oea.display('stage2/Refined/contoso_sis/v0.1/general/studentattendance')
oea.display('stage2/Refined/contoso_sis/v0.1/sensitive/studentattendance')