# Reading Progress Module Ingestion - Schema Correction

This notebook demonstrates the utility of the OEA_py class notebook, while correcting module tables that were initially ingested without headers and with incorrect data types.

Tables are read from ```stage2/Ingested/reading_progress/v0.1``` and written out, with the corrected schema, to ```stage2/Ingested_Corrected/reading_progress/v0.1```

The steps outlined below describe how this notebook is used to correct the Microsoft Education Insights module tables:
- Set the workspace for where the table schemas are to be corrected. 
- 4 functions are defined and used:
   1. **_extract_element**: uses the Insights metadata to extract the correct column names.
   2. **_dtype_config**: uses the Insights metadata to extract the correct column dtypes.
   3. **correct_insights_table_schema**: uses the corrected column names and dtypes to correct the schema per table given to the function.
   4. **correct_reading_progress_dataset**: lists the table folders currently stored in stage2/Ingested/reading_progress/v0.1 (skipping metadata.csv), corrects the schema of each table using the function above, and writes the corrected tables to stage2/Ingested_Corrected/reading_progress/v0.1.
   

In [None]:
# Name of the OEA workspace to operate in; it is passed to oea.set_workspace() further down.
workspace = 'dev'

In [None]:
%run OEA_py

In [None]:
# 1) Set the workspace (this determines where in the data lake you will be reading from and writing to).
# You can work in 'dev', 'prod', or a sandbox with any name you choose.
# For example, Sam the developer can create a 'sam' workspace and expect to find his datasets in the data lake under oea/sandboxes/sam
oea.set_workspace(workspace)

In [None]:
# 2) schema correction, since Insights data initially landed doesn't have column headers
def _extract_element(lst, element_num=0):
    return [item[element_num] for item in lst]

def _dtype_config(dtype_lst):
    return [item.capitalize() + 'Type()' for item in dtype_lst]

def _spark_type_for(dtype_name):
    """Map a '<Name>Type()' string to a Spark type instance.

    Returns None for 'StringType()' and for any name that is not handled,
    meaning the column is left un-cast. The Spark type names are resolved
    branch by branch, so only the type actually requested has to be in scope.
    """
    if dtype_name == 'IntegerType()':
        return IntegerType()
    if dtype_name == 'TimestampType()':
        return TimestampType()
    if dtype_name == 'ShortType()':
        return ShortType()
    if dtype_name == 'LongType()':
        return LongType()
    if dtype_name == 'DoubleType()':
        return DoubleType()
    if dtype_name == 'DateType()':
        return DateType()
    if dtype_name == 'BooleanType()':
        return BooleanType()
    return None


def correct_insights_table_schema(df, table_name):
    """Rename and re-type the columns of ``df`` using the metadata for ``table_name``.

    The global ``metadata[table_name]`` is read as a sequence of entries whose
    element 0 is the column name and element 1 is the dtype name. Every column
    of ``df`` other than 'rundate' is renamed to the metadata name found at the
    same position, and cast when the dtype is one of integer, timestamp, short,
    long, double, date or boolean. String (and unrecognised) dtypes are only
    renamed.

    Returns the DataFrame with the corrected schema; ``df`` itself is not
    modified.
    """
    list_of_column_names = _extract_element(metadata[table_name])
    list_of_column_dtypes = _dtype_config(_extract_element(metadata[table_name], 1))

    df_updatedColumns = df
    # NOTE(review): the position advances for every column, 'rundate' included,
    # so names line up only if 'rundate' is the last column or the metadata has
    # a placeholder entry at its position - confirm against the ingested tables.
    for n, c in enumerate(df.columns):
        if c == 'rundate':
            continue
        new_col_name = list_of_column_names[n]
        df_updatedColumns = df_updatedColumns.withColumnRenamed(c, new_col_name)
        # Fix: the short-type check previously compared the whole dtype list to
        # a string, so ShortType columns were never cast.
        target_type = _spark_type_for(list_of_column_dtypes[n])
        if target_type is not None:
            df_updatedColumns = df_updatedColumns.withColumn(
                new_col_name, df_updatedColumns[new_col_name].cast(target_type)
            )
    return df_updatedColumns

In [None]:
def correct_reading_progress_dataset(tables_source, write_destination):
    """Stream every table under ``tables_source`` to ``write_destination`` with a corrected schema.

    For each entry returned by ``oea.get_folders(tables_source)``:
      * 'metadata.csv' is logged and skipped;
      * any other entry is read as a Delta stream, passed through
        ``correct_insights_table_schema`` (using the entry name as the table
        name), and appended as Delta to ``write_destination/<entry>`` with a
        run-once trigger. The checkpoint is kept under the source table's
        URL in '_checkpoints'.

    The function blocks until each streaming query terminates before moving on.
    """
    for folder_name in oea.get_folders(tables_source):
        if folder_name == 'metadata.csv':
            logger.info('ignore metadata processing, since this is not a table to be ingested')
            continue

        table_path = '/'.join((tables_source, folder_name))
        spark.sql("set spark.sql.streaming.schemaInference=true")

        source_stream = spark.readStream.format('delta').load(oea.to_url(table_path))
        corrected_stream = correct_insights_table_schema(source_stream, table_name=folder_name)

        writer = corrected_stream.writeStream.format('delta')
        writer = writer.outputMode('append')
        writer = writer.trigger(once=True)
        writer = writer.option('checkpointLocation', oea.to_url(table_path) + '/_checkpoints')

        destination_url = oea.to_url('/'.join((write_destination, folder_name)))
        running_query = writer.start(destination_url)
        running_query.awaitTermination()

        logger.info(f'Successfully corrected the schema for table: {folder_name} from: {table_path}')

In [None]:
# 3) Load the Insights metadata (read as a global by correct_insights_table_schema),
# then correct every Reading Progress table from the Ingested folder into Ingested_Corrected.
metadata = oea.get_metadata_from_url(
    'https://raw.githubusercontent.com/microsoft/OpenEduAnalytics/main/modules/module_catalog/Microsoft_Education_Insights/test_data/metadata.csv'
)
correct_reading_progress_dataset(
    tables_source='stage2/Ingested/reading_progress/v0.1',
    write_destination='stage2/Ingested_Corrected/reading_progress/v0.1',
)