# Delta for ML Practitioners

## Setup & Cleanup

In [0]:
# Clean prior run data files
dbutils.fs.rm('/tmp/ch-9/', True)

# Drop & recreate database
spark.sql("DROP DATABASE IF EXISTS ch_9 CASCADE")
spark.sql("CREATE DATABASE ch_9 ")
spark.sql("USE ch_9")

# Configure Path
DELTALAKE_BRONZE_PATH = "/tmp/ch-9/bronze/loan_raw"
DELTALAKE_SILVER_PATH = "/tmp/ch-9/silver/loan_refined"
# Remove table if it exists
dbutils.fs.rm(DELTALAKE_BRONZE_PATH, recurse=True)
dbutils.fs.rm(DELTALAKE_SILVER_PATH, recurse=True)

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

schema = StructType([ \
    StructField("client_id", StringType(),True), \
    StructField("loan_status", StringType(),True), \
    StructField("int_rate",StringType(),True), \
    StructField("issue_dt",StringType(),True), \
    StructField("payment_amt", FloatType(), True), \
    StructField("loan_amnt", FloatType(), True), \
    StructField("addr_state", StringType(), True),
    StructField("term", StringType(), True), \
    StructField("ownership", StringType(), True) \
  ])
strutured_data = [('A123', 'Paid', '13.3%', 'Feb-2020', 2000.0, 2000.0, 'MA', '36 months', 'MORTGAGE'),
                ('B678', 'Default', '15.0%', 'Nov-2022', 2000.0, 5000.0, 'DE', '16 months','RENT'),
                ('C566', 'Not-Paid', '10.5%', 'Dec-2022', 0.0, 7000.0, 'WA', '24 months', 'MORTGAGE'),
                ('Z111', 'Paid', '1.3%', 'Oct-2020', 2000.0, 2000.0, 'MA', '36 months', 'MORTGAGE'),
                ('L231', 'Default', '5.0%', 'Oct-2020', 2000.0, 5000.0, 'DE', '16 months','RENT'),
                ('C890','Not-Paid', '10.1%', 'Aug-2020', 0.0, 7000.0, 'WA', '24 months', 'MORTGAGE')
               ]
df = spark.createDataFrame(data=strutured_data,schema=schema)
df.printSchema()
df.write.format("delta").mode("append").save(DELTALAKE_BRONZE_PATH)
spark.sql("CREATE TABLE loan_raw_data USING DELTA LOCATION '" + DELTALAKE_BRONZE_PATH + "'")

In [0]:
strutured_data = [('P123', 'Paid', '13.3%', 'Feb-2021', 1000.0, 2000.0, 'MA', '12 months', 'MORTGAGE'),
                ('Q678', 'Default', '15.0%', 'Nov-2021', 3000.0, 5000.0, 'DE', '12 months','RENT'),
                ('R566', 'Not-Paid', '10.5%', 'Dec-2021', 0.0, 7000.0, 'WA', '12 months', 'MORTGAGE'),
                ('S111', 'Paid', '1.3%', 'Oct-2021', 8000.0, 2000.0, 'MA', '12 months', 'MORTGAGE'),
               ]
df = spark.createDataFrame(data=strutured_data,schema=schema)
df.write.format("delta").mode("append").save(DELTALAKE_BRONZE_PATH)

# Delta backed Model Management

## 1. Data Preparation
* EDA
* Featurization

In [0]:
df = spark.sql('SELECT * FROM loan_raw_data VERSION AS OF 0')
display(df.summary())

summary,client_id,loan_status,int_rate,issue_dt,payment_amt,loan_amnt,addr_state,term,ownership
count,6,6,6,6,6.0,6.0,6,6,6
mean,,,,,1333.3333333333333,4666.666666666667,,,
stddev,,,,,1032.7955589886444,2250.925735484551,,,
min,A123,Default,1.3%,Aug-2020,0.0,2000.0,DE,16 months,MORTGAGE
25%,,,,,0.0,2000.0,,,
50%,,,,,2000.0,5000.0,,,
75%,,,,,2000.0,7000.0,,,
max,Z111,Paid,5.0%,Oct-2020,2000.0,7000.0,WA,36 months,RENT


In [0]:
# Select only the columns needed
loan_stats = df.select("loan_status", "int_rate", "issue_dt", "payment_amt", "loan_amnt", "addr_state", "term", "ownership")

### Delta backed Feature Store table
* Stores expensive features that are computed once and used across use cases
* Used at both training and inferencing time to prevent drift
* Allows for feature trend comparison
* Allows for rollback to a prior version

In [0]:
from pyspark.sql.functions import *

print("------------------------------------------------------------------------------------------------")
print("Create bad loan label, this will include charged off, defaulted, and late repayments on loans...")
loan_stats = loan_stats.filter(loan_stats.loan_status.isin(["Default", "Not-Paid", "Paid"]))\
                       .withColumn("bad_loan", (~(loan_stats.loan_status == "Paid")).cast("string"))

print("------------------------------------------------------------------------------------------------")
print("Turning string interest rate and revoling util columns into numeric columns...")
loan_stats = loan_stats.withColumn('int_rate', regexp_replace('int_rate', '%', '').cast('float')) \
                       .withColumn('issue_year',  substring(loan_stats.issue_dt, 5, 4).cast('double') ) 

In [0]:
# Save table as Delta Lake
loan_stats.write.format("delta").mode("overwrite").save(DELTALAKE_SILVER_PATH)

# Re-read as Delta Lake
loan_stats = spark.read.format("delta").load(DELTALAKE_SILVER_PATH)

# Review data
display(loan_stats)

loan_status,int_rate,issue_dt,payment_amt,loan_amnt,addr_state,term,ownership,bad_loan,issue_year
Paid,1.3,Oct-2020,2000.0,2000.0,MA,36 months,MORTGAGE,False,2020.0
Not-Paid,10.1,Aug-2020,0.0,7000.0,WA,24 months,MORTGAGE,True,2020.0
Not-Paid,10.5,Dec-2022,0.0,7000.0,WA,24 months,MORTGAGE,True,2022.0
Paid,13.3,Feb-2020,2000.0,2000.0,MA,36 months,MORTGAGE,False,2020.0
Default,5.0,Oct-2020,2000.0,5000.0,DE,16 months,RENT,True,2020.0
Default,15.0,Nov-2022,2000.0,5000.0,DE,16 months,RENT,True,2022.0


## 2. Model Preparation
* Training data/version
* Model Metrics
* Drift Thresholds

### Consistent dataset for all ML Experiments to ensure fair comparison
* Use Delta Time Travel capabilities instead of making multiple versions of the data
* Log the data version used for each training run
  * MLFlow is a good option to track model management: https://mlflow.org/

### Building a ML Pipeline with Delta
* Multi-hop
* Combine real-time streaming data with historical data in the same Delta table to be used for training
* Combine structured, semi-structured and un-structurd data into same Delta table for training
* Data Cleansing, transformations on Delta table using Spark APIs
* Schema Enforcement and Schemaa Evolution

In [0]:
spark.sql("DROP TABLE IF EXISTS loan_stats")
spark.sql("CREATE TABLE loan_stats USING DELTA LOCATION '" + DELTALAKE_SILVER_PATH + "'")

In [0]:
# Add the mergeSchema option
loan_stats.write.option("mergeSchema","true").format("delta").mode("overwrite").save(DELTALAKE_SILVER_PATH)

In [0]:
myY = "bad_loan"
categoricals = ["term", "ownership", "addr_state"]
numerics = ["loan_amnt","payment_amt"]
myX = categoricals + numerics

loan_stats2 = loan_stats.select(myX + [myY, "int_rate", "issue_year"])
train = loan_stats2.filter(loan_stats2.issue_year <= 2020).cache()
valid = loan_stats2.filter(loan_stats2.issue_year > 2020).cache()

In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder
from pyspark.ml.feature import StandardScaler, Imputer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

## Current possible ways to handle categoricals in string indexer is 'error', 'keep', and 'skip'
indexers = map(lambda c: StringIndexer(inputCol=c, outputCol=c+"_idx", handleInvalid = 'keep'), categoricals)
ohes = map(lambda c: OneHotEncoder(inputCol=c + "_idx", outputCol=c+"_class"),categoricals)
imputers = Imputer(inputCols = numerics, outputCols = numerics)

# Establish features columns
featureCols = list(map(lambda c: c+"_class", categoricals)) + numerics

# Build the stage for the ML pipeline
# Build the stage for the ML pipeline
model_matrix_stages = list(indexers) + list(ohes) + [imputers] + \
                     [VectorAssembler(inputCols=featureCols, outputCol="features"), StringIndexer(inputCol="bad_loan", outputCol="label")]

# Apply StandardScaler to create scaledFeatures
scaler = StandardScaler(inputCol="features",
                        outputCol="scaledFeatures",
                        withStd=True,
                        withMean=True)

# Use logistic regression 
lr = LogisticRegression(maxIter=10, elasticNetParam=0.5, featuresCol = "scaledFeatures")

# Build our ML pipeline
pipeline = Pipeline(stages=model_matrix_stages+[scaler]+[lr])

# Build the parameter grid for model tuning
paramGrid = ParamGridBuilder() \
              .addGrid(lr.regParam, [0.1, 0.01]) \
              .build()

# Execute CrossValidator for model tuning
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=5)

# Train the tuned model and establish our best model
cvModel = crossval.fit(train)
glm_model = cvModel.bestModel

# Return ROC
lr_summary = glm_model.stages[len(glm_model.stages)-1].summary
display(lr_summary.roc)

FPR,TPR
0.0,0.0
0.0,1.0
0.3333333333333333,1.0
1.0,1.0
1.0,1.0


## 3. Model Serving
* Testing and comparison against different architecture types
* Deployment/Serving
* Inferencing

In [0]:
display(glm_model.transform(valid))

term,ownership,addr_state,loan_amnt,payment_amt,bad_loan,int_rate,issue_year,term_idx,ownership_idx,addr_state_idx,term_class,ownership_class,addr_state_class,features,label,scaledFeatures,rawPrediction,probability,prediction
16 months,RENT,DE,5000.0,2000.0,True,15.0,2022.0,2.0,1.0,2.0,"Map(vectorType -> sparse, length -> 3, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 3, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 10, indices -> List(2, 4, 7, 8, 9), values -> List(1.0, 1.0, 1.0, 5000.0, 2000.0))",0.0,"Map(vectorType -> dense, length -> 10, values -> List(-0.7302967433402215, -0.7302967433402215, 1.788854381999832, -1.788854381999832, 1.788854381999832, -0.7302967433402215, -0.7302967433402215, 1.788854381999832, 0.15936381457791912, 0.7302967433402214))","Map(vectorType -> dense, length -> 2, values -> List(2.0477153332452165, -2.0477153332452165))","Map(vectorType -> dense, length -> 2, values -> List(0.8857165622211622, 0.11428343777883776))",0.0


In [0]:
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.ml.linalg import Vectors

def extract(row):
  return  tuple(row.probability.toArray().tolist()) +  (row.label,) + (row.prediction,)

def score(model,data):
  pred = model.transform(data).select( "probability", "label", "prediction")
  pred = pred.rdd.map(extract).toDF(["p0", "p1", "label", "prediction"])
  return pred 

def auc(pred):
  metric = BinaryClassificationMetrics(pred.select("p1", "label").rdd)
  return metric.areaUnderROC

glm_train = score(glm_model, train)
glm_valid = score(glm_model, valid)

glm_train.createOrReplaceTempView("glm_train")
glm_valid.createOrReplaceTempView("glm_valid")

print ("GLM Training AUC:" + str(auc(glm_train)))
print ("GLM Validation AUC :" + str(auc(glm_valid)))

## 4. Model Monitoring
* Drift Detection
  * Data/Feature
  * Model

In [0]:
df0 = spark.sql('SELECT * FROM loan_raw_data VERSION AS OF 0')
display(df0.summary())

summary,client_id,loan_status,int_rate,issue_dt,payment_amt,loan_amnt,addr_state,term,ownership
count,6,6,6,6,6.0,6.0,6,6,6
mean,,,,,1333.3333333333333,4666.666666666667,,,
stddev,,,,,1032.7955589886444,2250.925735484551,,,
min,A123,Default,1.3%,Aug-2020,0.0,2000.0,DE,16 months,MORTGAGE
25%,,,,,0.0,2000.0,,,
50%,,,,,2000.0,5000.0,,,
75%,,,,,2000.0,7000.0,,,
max,Z111,Paid,5.0%,Oct-2020,2000.0,7000.0,WA,36 months,RENT


In [0]:
df1 = spark.sql('SELECT * FROM loan_raw_data VERSION AS OF 1')
display(df1.summary())

summary,client_id,loan_status,int_rate,issue_dt,payment_amt,loan_amnt,addr_state,term,ownership
count,10,10,10,10,10.0,10.0,10,10,10
mean,,,,,2000.0,4400.0,,,
stddev,,,,,2357.0226039551585,2221.1108331943574,,,
min,A123,Default,1.3%,Aug-2020,0.0,2000.0,DE,12 months,MORTGAGE
25%,,,,,0.0,2000.0,,,
50%,,,,,2000.0,5000.0,,,
75%,,,,,2000.0,7000.0,,,
max,Z111,Paid,5.0%,Oct-2021,8000.0,7000.0,WA,36 months,RENT
