# Student Attrition - Pre-Processing

This notebook demonstrates the utility of the OEA_py class notebook by flattening the .json files landed in the Azure Machine Learning Data Lake Storage and adjusting the primary keys to accommodate the OEA workspace.

The steps outlined below describe how this notebook is used to flatten and clean the JSON tables:

1. Set the workspace for where the Student Attrition tables are to be converted.
2. Run the process model functions (which handle test.json, train.json, predict.json, predict_proba.json, global_imp.json, and local_imp.json) to pull the files from stage1/Transactional/attrition_raw and use DataFrame functions to flatten the original JSON structure.
3. Run the pre-process attrition data function to land the flattened JSON files in the stage1/Transactional/attrition folder, where they can be ingested by the 0_main_attrition pipeline into Stage 2.

In [12]:
# Workspace name handed to oea.set_workspace() further down in this notebook.
workspace = 'dev'
# Version string interpolated into the stage1 attrition paths (".../attrition_raw/v{version}" and ".../attrition/v{version}").
version = '0.1'

StatementMeta(spark3p3sm, 108, 12, Finished, Available)

In [13]:
%run OEA_py

StatementMeta(, 108, -1, Finished, Available)

2023-07-27 03:13:30,771 - OEA - INFO - Now using workspace: dev
2023-07-27 03:13:30,772 - OEA - INFO - OEA initialized.


In [14]:
# 1) set the workspace (this determines where in the data lake you'll be writing to and reading from).
# You can work in 'dev', 'prod', or a sandbox with any name you choose.
# `workspace` is assigned in the first code cell of this notebook.

oea.set_workspace(workspace)

StatementMeta(spark3p3sm, 108, 14, Finished, Available)

2023-07-27 03:13:31,352 - OEA - INFO - Now using workspace: dev


In [29]:
# 2) run process model data functions to flatten JSON files and prepare them for landing in Stage 1

from pyspark.sql.functions import lit, concat, explode, col, collect_list, expr, monotonically_increasing_id
import ast

# helper functions to process raw data to tabular CSV
def process_model_data(table_path, id_prefix):
    """Flatten a columns/index/data JSON table into one row per index entry.

    The JSON read from ``table_path`` is expected to expose four fields:
    ``columns`` (array of column names), ``index`` (array of row labels),
    ``data`` (array of row arrays) and ``rundate``.

    Args:
        table_path: lake-relative path of the raw table; resolved with ``oea.to_url``.
        id_prefix: string prepended to every index value to build the ``id`` column.

    Returns:
        A DataFrame with ``id`` first, then one column per entry of ``columns``
        (with "-" and "/" stripped from the names), then ``rundate``.

    Raises:
        ValueError: if no JSON record is found under ``table_path``.
    """
    df = spark.read.json(oea.to_url(table_path))

    # Fetch the run date and the column names in a single job, from the same record.
    header = df.select("rundate", "columns").first()
    if header is None:
        raise ValueError(f"No JSON records found under {table_path}")
    run_date = header["rundate"]
    column_names = [name.replace("-", "").replace("/", "") for name in header["columns"]]

    # Pair each index entry with its data row by array position in one pass.
    # This replaces a join of two separately generated monotonically_increasing_id()
    # columns: those ids are only documented to be unique and increasing, and the
    # join needed a full shuffle. arrays_zip pads the shorter array with nulls, so
    # unmatched entries are kept just as the former outer join kept them.
    paired = df.select(explode(expr("arrays_zip(`index`, `data`)")).alias("pair"))

    # Column.alias() takes the name verbatim, so names that would need quoting in a
    # SQL expression string no longer break the projection.
    df_flat = paired.select(
        concat(lit(id_prefix), col("pair.index")).alias("id"),
        *[col("pair.data")[position].alias(name) for position, name in enumerate(column_names)],
    )

    return df_flat.withColumn("rundate", lit(run_date))


def process_model_predictions(table_path):
    """Turn the predictions file into a two-column DataFrame.

    The first row of the text read from ``table_path`` is parsed with
    ``ast.literal_eval`` and must evaluate to a sequence of predictions.

    Args:
        table_path: lake-relative path of the raw predictions; resolved with ``oea.to_url``.

    Returns:
        A DataFrame with columns ``id`` (the string 'test' followed by the
        zero-based position of the prediction) and ``prediction``.
    """
    df = spark.read.text(oea.to_url(table_path))

    # Only the first row of the text DataFrame is parsed.
    preds = ast.literal_eval(df.select('value').collect()[0][0])

    # Plain tuples plus explicit column names avoid pyspark's Row class, which this
    # notebook never imports. The 'test' prefix is applied here in Python and yields
    # the same strings as concat(lit('test'), id) did.
    rows = [('test' + str(position), value) for position, value in enumerate(preds)]

    return spark.createDataFrame(rows, ["id", "prediction"])


def process_model_probs(table_path):
    """Turn the predicted-probabilities file into a three-column DataFrame.

    The first row of the text read from ``table_path`` is parsed with
    ``ast.literal_eval`` and must evaluate to a sequence of pairs.

    Args:
        table_path: lake-relative path of the raw probabilities; resolved with ``oea.to_url``.

    Returns:
        A DataFrame with columns ``id`` (the string 'test' followed by the
        zero-based position of the pair), ``attrition_prob`` (first element of
        the pair) and ``retain_prob`` (second element).
    """
    df = spark.read.text(oea.to_url(table_path))

    # Only the first row of the text DataFrame is parsed.
    probs = ast.literal_eval(df.select('value').collect()[0][0])

    # Number the rows in Python. monotonically_increasing_id() is unique but not
    # consecutive once the data spans more than one partition, so ids built from it
    # stop matching the 'test<position>' ids produced for the predictions.
    rows = [('test' + str(position), pair[0], pair[1]) for position, pair in enumerate(probs)]

    return spark.createDataFrame(rows, ["id", "attrition_prob", "retain_prob"])


def process_model_global_imp(table_path):
    """Pair each global feature-importance value with its feature name.

    Importance values come from the ``data`` array of the JSON under
    ``table_path``; feature names come from the ``data`` array of the JSON in
    the sibling ``model_features`` folder (same parent path).

    Args:
        table_path: lake-relative path of the raw importance table; resolved with ``oea.to_url``.

    Returns:
        A DataFrame with columns ``id`` (zero-based position inside the arrays),
        ``feature`` and ``feature_imp``. Positions present on only one side are
        kept with a null on the other side (outer join).
    """
    # posexplode yields the true array position of each element. The two inputs are
    # read independently, so monotonically_increasing_id() (unique, but not
    # guaranteed consecutive or aligned across DataFrames) is not a safe join key.
    # NOTE(review): assumes one JSON record per table; with several records the
    # positions repeat and the join would multiply rows -- TODO confirm.
    df_feature_imp = spark.read.json(oea.to_url(table_path)).selectExpr(
        "posexplode(`data`) as (id, feature_imp)")

    # model_features lives next to this table: swap the last path segment.
    folders = table_path.split("/")
    features_path = "/".join(folders[:-1]) + '/' + 'model_features'
    df_feature_names = spark.read.json(oea.to_url(features_path)).selectExpr(
        "posexplode(`data`) as (id, feature)")

    df_feature_imp = df_feature_imp.join(df_feature_names, "id", "outer")

    return df_feature_imp.select(["id", "feature", "feature_imp"])

def process_model_local_imp(table_path):
    """Flatten the local feature-importance JSON into one row per innermost array.

    Reads ``data`` and ``rundate`` from the JSON under ``table_path`` and the
    feature names from the ``data`` field of the first record in the sibling
    ``model_features`` folder. ``data`` is exploded twice, and each element of
    the resulting arrays becomes a column named after the feature at the same
    position (with "-" and "/" stripped from the names).

    Args:
        table_path: lake-relative path of the raw importance table; resolved with ``oea.to_url``.

    Returns:
        A DataFrame with ``id`` ('test' + monotonically_increasing_id) first,
        then the feature columns, then ``rundate``, limited to the first
        ``count // 2`` rows.
    """
    importance_raw = spark.read.json(oea.to_url(table_path)).select(["data", "rundate"])
    run_date = importance_raw.select("rundate").collect()[0][0]

    # model_features lives next to this table: swap the last path segment.
    parent_path = "/".join(table_path.split("/")[:-1])
    features_raw = spark.read.json(oea.to_url(parent_path + '/' + 'model_features'))
    raw_names = features_raw.select("data").collect()[0].data
    feature_names = [name.replace("-", "").replace("/", "") for name in raw_names]

    # Two explodes peel off the two outer array levels of `data`.
    outer_level = importance_raw.select(explode(importance_raw["data"]).alias("data"))
    inner_level = outer_level.select(explode(outer_level["data"]).alias("data"))

    feature_columns = [
        col("data").getItem(position).alias(name)
        for position, name in enumerate(feature_names)
    ]
    flat = (
        inner_level.select(feature_columns)
        .withColumn("rundate", lit(run_date))
        .withColumn("id", monotonically_increasing_id())
    )
    flat = flat.withColumn("id", concat(lit('test'), flat["id"]))

    # "id" was appended last, so columns[:-1] is every other column in original order.
    flat = flat.select(["id", *flat.columns[:-1]])

    # NOTE(review): only the first half of the rows is kept; presumably the outer
    # array holds two equally sized groups and only the first is wanted -- confirm.
    half_count = flat.count() // 2
    return flat.limit(half_count)


StatementMeta(spark3p3sm, 98, 30, Finished, Available)

In [27]:
# 3) this step pre-processes the attrition data by reading in the JSONs as records, correcting any schema discrepancies, and then writing out the df as a CSV in stage1
# there is no data transformation happening in this step besides properly reading in the column dtypes

def _save_attrition_csv(df, new_table_path):
    """Write ``df`` to ``new_table_path`` as a headered CSV from a single partition, then delete the ``_SUCCESS`` marker."""
    df.coalesce(1).write.save(oea.to_url(new_table_path), format='csv', mode='overwrite', header='true', mergeSchema='true')
    # remove the _SUCCESS file
    oea.rm_if_exists(new_table_path + '/_SUCCESS', False)


def preprocess_attrition_data(tables_source):
    """Flatten every known raw attrition table under ``tables_source`` and land it as CSV in stage1.

    For each folder returned by ``oea.get_folders(tables_source)`` that has a
    registered processing function, the table is flattened and written to
    ``stage1/Transactional/attrition/v{version}/{item}/{batch_type}/rundate={latest_dt}``,
    where ``batch_type`` is the first sub-folder of the raw table and
    ``latest_dt`` comes from ``oea.get_latest_runtime``. Folders without a
    registered function are logged and skipped before any further lake lookups.

    Args:
        tables_source: lake-relative path that contains one folder per raw table.
    """
    # table folder name -> function turning the raw table path into a flat DataFrame
    processors = {
        'data_train': lambda path: process_model_data(path, 'train'),
        'data_test': lambda path: process_model_data(path, 'test'),
        'predictions_predict': process_model_predictions,
        'predictions_predict_proba': process_model_probs,
        'model_global_importance_values': process_model_global_imp,
        'model_local_importance_values': process_model_local_imp,
    }

    for item in oea.get_folders(tables_source):
        table_path = tables_source + '/' + item

        processor = processors.get(item)
        if processor is None:
            logger.info(f'no ad hoc processing needed for the Attrition {item} table.')
            continue

        # find the batch data type of the table
        batch_type = oea.get_folders(table_path)[0]
        # grab only the latest folder in stage1, so the CSV lands under the same rundate
        # folder timestamp and mirrors the directory structure of the raw tables
        latest_dt = oea.get_latest_runtime(f'{table_path}/{batch_type}', "rundate=%Y-%m-%d %H:%M:%S")

        df = processor(table_path)
        # create the new location for the converted CSVs, and write back to stage1
        new_table_path = f'stage1/Transactional/attrition/v{version}/{item}/{batch_type}/rundate={latest_dt}'
        _save_attrition_csv(df, new_table_path)
        logger.info('Pre-processed table: ' + item + ' from: ' + table_path)
    logger.info('Finished pre-processing Attrition tables')

StatementMeta(spark3p3sm, 98, 28, Finished, Available)

In [30]:
# pre-process the raw attrition tables for the `version` assigned in the first code cell
preprocess_attrition_data(f'stage1/Transactional/attrition_raw/v{version}')

StatementMeta(spark3p3sm, 98, 31, Finished, Available)

data_test
2023-07-27 01:48:27
2023-07-27 02:16:20,184 - OEA - INFO - Pre-processed table: data_test from: stage1/Transactional/attrition_raw/v0.1/data_test
data_train
2023-07-27 01:48:26
2023-07-27 02:16:23,430 - OEA - INFO - Pre-processed table: data_train from: stage1/Transactional/attrition_raw/v0.1/data_train
model_features
2023-07-27 01:48:26
2023-07-27 02:16:23,495 - OEA - INFO - no ad hoc processing needed for the Attrition model_features table.
model_global_importance_values
2023-07-27 01:48:27
2023-07-27 02:16:25,917 - OEA - INFO - Pre-processed table: model_global_importance_values from: stage1/Transactional/attrition_raw/v0.1/model_global_importance_values
model_local_importance_values
2023-07-27 01:48:26
2023-07-27 02:16:29,835 - OEA - INFO - Pre-processed table: model_local_importance_values from: stage1/Transactional/attrition_raw/v0.1/model_local_importance_values
predictions_predict
2023-07-27 01:48:26
2023-07-27 02:16:36,635 - OEA - INFO - Pre-processed table: predicti

## Test Section

The cells below were used while developing and testing the notebook above.

In [17]:
# used for testing
# NOTE(review): kept commented out on purpose. Judging by the helper names, uncommenting these
# lines removes the stage1/stage2 attrition folders and drops the two attrition lake databases
# for the dev workspace -- confirm before running.

#oea.rm_if_exists('stage1/Transactional/attrition')
#oea.rm_if_exists('stage1/Transactional/attrition_raw')
#oea.rm_if_exists('stage2/Ingested/attrition')
#oea.rm_if_exists('stage2/Refined/attrition')
#oea.drop_lake_db('ldb_dev_s2i_attrition_v0p1')
#oea.drop_lake_db('ldb_dev_s2r_attrition_v0p1')

StatementMeta(spark3p3sm, 108, 17, Finished, Available)

2023-07-27 03:25:50,239 - OEA - INFO - Database dropped: ldb_dev_s2i_attrition_v0p1
2023-07-27 03:25:50,314 - OEA - INFO - Database dropped: ldb_dev_s2r_attrition_v0p1


'Database dropped: ldb_dev_s2r_attrition_v0p1'

In [16]:
# used for testing
# NOTE(review): kept commented out on purpose. When uncommented, this presumably builds the
# metadata listing from the ldb_dev_s2i_attrition_v0p1 lake database, prints it, and writes it
# as metadata.csv under stage1/Transactional/attrition -- confirm against OEA_py.

#metadata = oea.create_metadata_from_lake_db('ldb_dev_s2i_attrition_v0p1')
#print(metadata)
#dlw = DataLakeWriter(oea.to_url('stage1/Transactional/attrition'))
#dlw.write('metadata.csv', metadata)


StatementMeta(spark3p3sm, 108, 16, Finished, Available)

Entity Name,Attribute Name,Attribute Data Type,Pseudonymizationmodel_global_importance_values,,,
,id,string,no-op
,feature,string,no-op
,feature_imp,string,no-op
data_train,,,
,id,string,no-op
,FirstGenerationinCollegeFlag,string,no-op
,Gender,string,no-op
,Race,string,no-op
,HSGraduateorGED,string,no-op
,Age_Term_Min,string,no-op
,Age_Term_Max,string,no-op
,Total_Terms,string,no-op
,Entry_Type_DualEnrollment,string,no-op
,Entry_Type_EarlyAdmission,string,no-op
,Entry_Type_FirstTimeinCollege,string,no-op
,Entry_Type_ReEntry,string,no-op
,Entry_Type_Transfer,string,no-op
,AcademicProbation,string,no-op
,AcademicSuspension,string,no-op
,GoodAcademicStanding,string,no-op
,ProbationAfterSuspenDismiss,string,no-op
,TransferedToNonBusiness,string,no-op
,CumulativeGPA,string,no-op
,CumulativeCreditHoursEarnedPerTerm,string,no-op
,Blended,string,no-op
,FullyOnline,string,no-op
,RemoteLearning,string,no-op
,RemoteLearningBlended,string,no-op
,Traditional,string,no-op
,Adjunct,string,no-op
,Facu