In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import time
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix, classification_report
import mlflow.sklearn
import mlflow.pyfunc
from mlflow.models.signature import infer_signature
from mlflow.tracking import MlflowClient

In [None]:
# Read the heart-disease table through Spark, then convert it to a pandas
# DataFrame so the scikit-learn code below can work on it directly.
data_df = spark.table("db_pegah.default.heart_disease")
df = data_df.toPandas()

# Quick sanity check: dimensions first, then a preview of the first rows.
print(f"data shape: {df.shape}")
df.head()

data shape: (303, 14)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [None]:
# Split the dataset into training, validation and test sets.

# Separate the label column from the feature columns.
y = df['target']
X = df.drop(columns='target')

# Step 1: keep 60% of the rows for training.
# Step 2: halve the remainder into validation and test sets.
# Both calls use the same fixed seed so the partition is reproducible.
x_train, x_rem, y_train, y_rem = train_test_split(
    X, y, train_size=0.6, random_state=42
)
x_val, x_test, y_val, y_test = train_test_split(
    x_rem, y_rem, test_size=0.5, random_state=42
)

# Report the shape of every partition.
print("X shape: {}, y shape: {}".format(X.shape, y.shape))
print("X_train shape: {}, y_train shape: {}".format(x_train.shape, y_train.shape))
print("x_val shape: {} , y_val shape: {}".format(x_val.shape, y_val.shape))
print("X_test shape: {}, y_test shape: {}".format(x_test.shape, y_test.shape))

X shape: (303, 13), y shape: (303,)
X_train shape: (181, 13), y_train shape: (181,)
x_val shape: (61, 13) , y_val shape: (61,)
X_test shape: (61, 13), y_test shape: (61,)


In [None]:
# Save [x_test, y_test] as a Delta table for future batch inference.
# This dataset will be loaded later to simulate new data for model predictions in production.

# Put features and label side by side so the table carries both.
df_test = pd.concat([x_test, y_test], axis=1)
spark_df = spark.createDataFrame(df_test)

# Use overwrite mode: without an explicit mode, saveAsTable raises an error
# when the table already exists, so re-running this cell (for example after
# Restart & Run All) would fail. The split above is seeded, so overwriting
# simply rewrites the same held-out rows.
(
    spark_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("db_pegah.default.heart_disease_test")
)

In [None]:
# Helper for computing the evaluation metrics used throughout the notebook.
def calc_metric(y_actual, y_pred, y_prob):
    """Compute classification metrics for one set of predictions.

    Args:
        y_actual: Ground-truth labels.
        y_pred: Predicted class labels, compared against ``y_actual`` for
            accuracy, precision, recall and F1.
        y_prob: Scores handed to ``roc_auc_score`` together with ``y_actual``.

    Returns:
        A 5-tuple ``(accuracy, precision, recall, f1, roc_auc)``.
    """
    # The label-based scorers share the same call signature, so evaluate them
    # in the order in which they appear in the returned tuple.
    label_scorers = (accuracy_score, precision_score, recall_score, f1_score)
    label_metrics = tuple(scorer(y_actual, y_pred) for scorer in label_scorers)

    # ROC AUC is the only metric computed from scores rather than hard labels.
    return (*label_metrics, roc_auc_score(y_actual, y_prob))

### Experiment with a Random Forest model with default hyperparameters

In [None]:
# Train a baseline Random Forest with default hyperparameters and record the
# validation metrics, the model parameters and the fitted model in MLflow.
with mlflow.start_run(run_name="RF_untuned"):
    rf_model = RandomForestClassifier(random_state=42)
    rf_model.fit(x_train, y_train)

    # Evaluate on the validation split. Column 1 of predict_proba holds the
    # probability of the second class in rf_model.classes_, which is used as
    # the score for ROC AUC.
    y_pred_val = rf_model.predict(x_val)
    y_prob_val = rf_model.predict_proba(x_val)[:, 1]

    accuracy, precision, recall, f1, roc_auc = calc_metric(y_val, y_pred_val, y_prob_val)

    # Log every validation metric in one call instead of five separate ones.
    mlflow.log_metrics({
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'roc_auc': roc_auc,
    })

    # Record the full (default) hyperparameter set for later comparison.
    default_params = rf_model.get_params()
    mlflow.log_params(default_params)

    # Attach an input/output signature inferred from the training data.
    signature = infer_signature(x_train, rf_model.predict(x_train))
    mlflow.sklearn.log_model(rf_model, artifact_path="random_forest_model", signature=signature)

# No explicit mlflow.end_run() is needed: leaving the `with` block already
# closes the run, and a stray end_run() here would terminate any enclosing
# active run instead.

In [None]:
# Show the validation metrics of the baseline run, one per line.
for metric_label, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
    ("ROC AUC", roc_auc),
):
    print(f"{metric_label}:", metric_value)

Accuracy: 0.8524590163934426
Precision: 0.9
Recall: 0.8181818181818182
F1 Score: 0.8571428571428572
ROC AUC: 0.9594155844155845


In [None]:
# Pair each training column with the importance the fitted forest assigned to
# it, then display the features from most to least important.
importance_by_feature = pd.Series(
    rf_model.feature_importances_,
    index=x_train.columns.tolist(),
    name='importance',
)
feature_importances = importance_by_feature.to_frame()
feature_importances.sort_values(by='importance', ascending=False)

Unnamed: 0,importance
cp,0.140372
ca,0.135461
thalach,0.114911
thal,0.10311
oldpeak,0.099011
age,0.083403
trestbps,0.078869
chol,0.068868
slope,0.06137
exang,0.052132


In [None]:
# Register this baseline model in the MLflow Model Registry.

# Look up the most recent run named "RF_untuned". The ordering is requested
# explicitly and the result is capped at one row, so the choice of run does
# not depend on the default sort order of search_runs.
baseline_runs = mlflow.search_runs(
    filter_string='tags.mlflow.runName = "RF_untuned"',
    order_by=["attributes.start_time DESC"],
    max_results=1,
)
if baseline_runs.empty:
    # Fail with an actionable message rather than an IndexError from .iloc[0].
    raise RuntimeError(
        'No MLflow run named "RF_untuned" was found; run the training cell first.'
    )
run_id = baseline_runs.iloc[0].run_id

model_name = "Best_Model"
# The artifact path must match the one used when the model was logged.
model_uri = f"runs:/{run_id}/random_forest_model"
model_version = mlflow.register_model(model_uri, model_name)
# NOTE(review): register_model already waits for the new version to become
# ready (see its await_registration_for argument); this extra pause is
# presumably a safety margin for later cells. Confirm whether it is still needed.
time.sleep(15)
