In [0]:
%run ./transform_data

In [0]:
# The following imports are not used in this notebook:
# from pyspark.sql.functions import lit
# from pyspark.mllib.evaluation import MulticlassMetrics
# from pyspark.ml.tuning import TrainValidationSplit
# from pyspark.sql import functions as F

from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, GBTClassificationModel, RandomForestClassificationModel
from pyspark.ml.feature import VectorAssembler, OneHotEncoder
import matplotlib.pyplot as plt
import seaborn as sns

In [0]:
# Databricks needs SPARKML_TEMP_DFS_PATH (and likewise MLFLOW_DFS_TMP) to be set.
# On serverless compute this cannot be done through the Spark config, so both
# are exported as process environment variables as a workaround.
import os

ML_TEMP_DFS_PATH = "/Volumes/pophealthrisk/pophealthrisk/pophealthrisk/ml-temp"
os.environ.update(
    {
        "SPARKML_TEMP_DFS_PATH": ML_TEMP_DFS_PATH,
        "MLFLOW_DFS_TMP": ML_TEMP_DFS_PATH,
    }
)

# Hyperparameter tuning approach
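As we have a large amount of data, we will take a train, validate, test approach, since we can assume that sampling effects will be minimal. For smaller data sets, where sampling bias could have an impact, a cross-validation approach would be more appropriate. Note: the tuning cells below currently use 3-fold cross-validation (`CrossValidator`) rather than a single validation split.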

# Model choice

In [0]:
# Balance the dataset by stratified sampling on the label: keep 25% of the rows
# with RFHLTH_adj == 0 and all rows with RFHLTH_adj == 1 (fixed seed).
label_fractions = {0: 0.25, 1: 1}
balanced_df = df_min.sampleBy("RFHLTH_adj", fractions=label_fractions, seed=10)

# Print the row count per label after sampling.
label_counts = balanced_df.groupBy('RFHLTH_adj').count()
label_counts.show()

In [0]:
# 80/20 random split of the balanced data into train and test sets (fixed seed).
split_weights = [0.8, 0.2]
train, test = balanced_df.randomSplit(weights=split_weights, seed=200)

# Report the size of each split: test first, then train.
for split_df in (test, train):
    print(split_df.count())

In [0]:
# Input columns that are packed into the single "features" vector column.
feature_cols = [
    "_AGEG5YR_clean",
    "EDUCA_clean",
    '_BMI5',
    '_SMOKER3_clean',
    'DRNKANY6_clean',
    'INCOME3_clean',
    'num_conditions',
    'income_adj_pov',
]

assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Training split with the assembled "features" column added.
train_assembled = assembler.transform(train)

In [0]:
# Single evaluator reused by the tuning and scoring cells.
# F1 is used instead of accuracy; the original rationale is that the classes
# aren't balanced.
evaluator = MulticlassClassificationEvaluator(
    metricName="f1",
    labelCol="RFHLTH_adj",
    predictionCol="prediction",
)

In [0]:
# Random forest estimator (seeded) predicting the RFHLTH_adj label.
rf = RandomForestClassifier(labelCol="RFHLTH_adj", seed=42)

# 3 x 3 search grid: number of trees x maximum tree depth.
rf_grid_builder = ParamGridBuilder()
rf_grid_builder.addGrid(rf.numTrees, [100, 200, 300])
rf_grid_builder.addGrid(rf.maxDepth, [4, 6, 8])
paramGrid = rf_grid_builder.build()

In [0]:
# Gradient-boosted trees estimator (seeded) predicting the RFHLTH_adj label.
gbt = GBTClassifier(labelCol="RFHLTH_adj", seed=42)

# 3 x 3 search grid: number of boosting iterations x maximum tree depth.
gbt_grid_builder = ParamGridBuilder()
gbt_grid_builder.addGrid(gbt.maxIter, [10, 20, 30])
gbt_grid_builder.addGrid(gbt.maxDepth, [4, 6, 8])
paramGridGbt = gbt_grid_builder.build()

In [0]:
# 3-fold cross-validated grid search over the GBT parameter grid, scored with
# `evaluator`; up to 4 candidate models are fitted in parallel.
# The explicit seed pins the fold assignment: the estimator is seeded, but
# without a CrossValidator seed the folds themselves are not fixed between runs.
cv_gbt = CrossValidator(
    estimator=gbt,
    estimatorParamMaps=paramGridGbt,
    evaluator=evaluator,
    numFolds=3,  # or 5
    parallelism=4,
    seed=42,
)
cv_gbt_fit = cv_gbt.fit(train_assembled)

In [0]:
# 3-fold cross-validated grid search over the random forest parameter grid,
# scored with `evaluator`; up to 4 candidate models are fitted in parallel.
# The explicit seed pins the fold assignment: the estimator is seeded, but
# without a CrossValidator seed the folds themselves are not fixed between runs.
cv_rf = CrossValidator(
    estimator=rf,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3,  # or 5
    parallelism=4,
    seed=42,
)
cv_rf_fit = cv_rf.fit(train_assembled)

In [0]:
# pandas is not imported anywhere else in this notebook; import it here so the
# cell does not depend on `pd` leaking in from the %run'd notebook.
import pandas as pd

# Shared column layout so the two result frames can be concatenated.
# "Trees_or_Iters" holds numTrees for the random forest and maxIter for GBT.
result_columns = ["Trees_or_Iters", "maxDepth", "Mean_F1"]

# --- Random forest results ---
# One row per grid point; avgMetrics is zipped with the param grid, i.e. it is
# assumed to be in the same order as the grid passed to the CrossValidator.
rf_results = [
    (param[rf.numTrees], param[rf.maxDepth], metric)
    for param, metric in zip(paramGrid, cv_rf_fit.avgMetrics)
]
rf_df = pd.DataFrame(rf_results, columns=result_columns)
rf_df["Model"] = "RandomForest"

# --- GBT results ---
gbt_results = [
    (param[gbt.maxIter], param[gbt.maxDepth], metric)
    for param, metric in zip(paramGridGbt, cv_gbt_fit.avgMetrics)
]
gbt_df = pd.DataFrame(gbt_results, columns=result_columns)
gbt_df["Model"] = "GBT"

# Combine results
results_df = pd.concat([rf_df, gbt_df], ignore_index=True)

In [0]:
# Line plot of mean cross-validated F1 against ensemble size: colour
# distinguishes the model family, marker style distinguishes maxDepth.
fig, ax = plt.subplots(figsize=(8, 5))
sns.lineplot(
    data=results_df,
    x="Trees_or_Iters",
    y="Mean_F1",
    hue="Model",
    style="maxDepth",
    markers=True,
    dashes=False,
    ax=ax,
)
ax.set_title("Cross-validated F1 Comparison: Random Forest vs GBT")
ax.set_ylabel("Mean F1 Score")
ax.set_xlabel("numTrees (RF) / maxIter (GBT)")
ax.legend(title="Model / maxDepth")
plt.show()

In [0]:
# Train the final random forest on the assembled training split.
# Hyperparameters are fixed here rather than read from the CV result
# (presumably chosen from the comparison above; TODO confirm).
final_rf_params = {"numTrees": 200, "maxDepth": 8}
rf_optimal = RandomForestClassifier(
    labelCol="RFHLTH_adj",
    seed=42,
    **final_rf_params,
)
final_model = rf_optimal.fit(train_assembled)

In [0]:
# Quick save of the fitted model so it does not have to be retrained on every
# run; any model already stored at this path is overwritten.
rf_model_path = "/Volumes/pophealthrisk/pophealthrisk/pophealthrisk/models/rf-1"
model_writer = final_model.write().overwrite()
model_writer.save(rf_model_path)

In [0]:
# Reload the saved random forest model from the volume; this rebinds
# `final_model` to the loaded copy.
rf_model_path = "/Volumes/pophealthrisk/pophealthrisk/pophealthrisk/models/rf-1"
final_model = RandomForestClassificationModel.load(rf_model_path)

In [0]:
# Run the test split through the same VectorAssembler as the training split so
# it gets the "features" column the model reads.
test_assembled = assembler.transform(test)

In [0]:
# Score the final model on the held-out test split.
# `evaluator` is configured with metricName="f1", so the value reported here is
# an F1 score on the test set, not a validation AUC.
preds = final_model.transform(test_assembled)
test_f1 = evaluator.evaluate(preds)
auc = test_f1  # legacy name kept so any later cell that reads `auc` still works
print("Test F1:", test_f1)

In [0]:
# Confusion matrix as a long table: one row per (actual label, predicted label)
# pair with its row count, sorted by actual then predicted label.
actual_and_predicted = ["RFHLTH_adj", "prediction"]
pair_counts = preds.groupBy(*actual_and_predicted).count()
confusion_matrix = pair_counts.orderBy(*actual_and_predicted)

display(confusion_matrix)

# Summary
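Predictive performance is limited but good enough for this exercise. Because the classes were balanced by down-sampling before the train/test split, the predictions are fairly evenly distributed across the two classes rather than biased toward one of them.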

In [0]:
# Log the final model with MLflow for deployment, together with a signature
# inferred from a one-row input/output example.
import mlflow
import mlflow.spark
from mlflow.models import infer_signature

# One-row sample of the assembled test data. It stays a lazy Spark DataFrame,
# so each use below triggers its own evaluation.
single_row = test_assembled.limit(1)

example_df = single_row.toPandas()
predicted_df = final_model.transform(single_row).toPandas()
signature = infer_signature(example_df, predicted_df)

mlflow.spark.log_model(final_model, "RF", signature=signature)