In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import mlflow
import kagglehub
import pandas as pd
import os
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score
from mlflow.models.signature import infer_signature
from urllib.parse import urlparse
from imblearn.over_sampling import SMOTE

mlflow.set_tracking_uri("http://127.0.0.1:5001")
mlflow.set_experiment("Model_Comparison_LC")

### Download Data

In [2]:
path = kagglehub.dataset_download("rainelai/lendingclubs-bank-loan-default-dataset")
data_path = os.path.join(path, "data")
files = os.listdir(data_path)
csv_file = [f for f in files if f.endswith("trainingset.csv")][0]

dev = pd.read_csv(os.path.join(data_path, csv_file))
dev['Bad_Flag'] = [1 if dev['loan_status'].iloc[i] == 'Charged Off' else 0 for i in range(len(dev))]
dev = dev.select_dtypes(include=['number'])
dev = dev.fillna(0)
X_train, X_test, y_train, y_test = train_test_split(dev.drop(columns=['Bad_Flag'], axis = 1), dev['Bad_Flag'])



### Imbalanced Dataset Run

In [None]:
path = kagglehub.dataset_download("rainelai/lendingclubs-bank-loan-default-dataset")
data_path = os.path.join(path, "data")
files = os.listdir(data_path)
csv_file = [f for f in files if f.endswith("trainingset.csv")][0]

dev = pd.read_csv(os.path.join(data_path, csv_file))
dev['Bad_Flag'] = [1 if dev['loan_status'].iloc[i] == 'Charged Off' else 0 for i in range(len(dev))]
dev = dev.select_dtypes(include=['number'])
dev = dev.fillna(0)
X_train, X_test, y_train, y_test = train_test_split(dev.drop(columns=['Bad_Flag'], axis = 1), dev['Bad_Flag'])

models = {
    "LogisticRegression": LogisticRegression(max_iter=2000),
    "RandomForest": RandomForestClassifier(n_estimators=200, random_state=2),
    "XGBoost": XGBClassifier(n_estimators=300, learning_rate=0.05, max_depth=5, subsample=0.8, eval_metric="logloss", random_state=2),
    "NeuralNet": MLPClassifier(hidden_layer_sizes=(128, 64), activation='relu', solver='adam', max_iter=300, random_state=2)
}

with mlflow.start_run(run_name="Model_Comparison_LC"):  # parent (NO nested flag)
    
    for model_name, model in models.items():

        with mlflow.start_run(run_name=model_name, nested=True):  # child runs

            model.fit(X_train, y_train)
            preds = model.predict(X_test)
            probs = model.predict_proba(X_test)[:, 1]

            mlflow.log_params(model.get_params())
            mlflow.log_metric("accuracy", accuracy_score(y_test, preds))
            mlflow.log_metric("f1_score", f1_score(y_test, preds))
            mlflow.log_metric("roc_auc", roc_auc_score(y_test, probs))
            mlflow.log_metric("precision", precision_score(y_test, preds))
            mlflow.log_metric("recall", recall_score(y_test, preds))

            signature = infer_signature(X_train, model.predict(X_train))
            input_example = X_train[:5]

            mlflow.sklearn.log_model(
                model,
                artifact_path=model_name,
                registered_model_name=model_name,
                signature=signature,
                input_example=input_example
            )



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
2025/10/30 23:08:36 INFO mlflow.models.model: Found the following environment variables used during model inference: [HF_API_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
Successfully registered model 'LogisticRegression'.
2025/10/30 23:08:36 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LogisticRegression, version 1
Created version '1' of model 'LogisticRegression'.


🏃 View run LogisticRegression at: http://127.0.0.1:5001/#/experiments/1/runs/25f4390de0824980baa74e177e05ed0d
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/1


Successfully registered model 'RandomForest'.
2025/10/30 23:10:15 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: RandomForest, version 1
Created version '1' of model 'RandomForest'.


🏃 View run RandomForest at: http://127.0.0.1:5001/#/experiments/1/runs/7c7f81f9a81f45aca76cac9298fa0a6d
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/1


Successfully registered model 'XGBoost'.
2025/10/30 23:10:21 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: XGBoost, version 1
Created version '1' of model 'XGBoost'.


🏃 View run XGBoost at: http://127.0.0.1:5001/#/experiments/1/runs/db27b733926242db9ec996bf344b776d
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/1


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Successfully registered model 'NeuralNet'.
2025/10/30 23:11:00 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: NeuralNet, version 1


🏃 View run NeuralNet at: http://127.0.0.1:5001/#/experiments/1/runs/21e6f28d0ef047e28d416bd8e0eed14b
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/1
🏃 View run Model_Comparison_LC at: http://127.0.0.1:5001/#/experiments/1/runs/8844a613e0dd4654958a83dba5c0d2a9
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/1


Created version '1' of model 'NeuralNet'.


### SMOTE Run

In [None]:
smote = SMOTE(random_state=2)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

with mlflow.start_run(run_name="Model_Comparison_SMOTE"):

    for model_name, model in models.items():
        with mlflow.start_run(run_name=f"{model_name}_SMOTE", nested=True):

            # Train on SMOTE-balanced data
            model.fit(X_train_sm, y_train_sm)

            # Evaluate on original (Non-SMOTE) test data
            preds = model.predict(X_test)
            probs = model.predict_proba(X_test)[:,1]

            mlflow.log_params(model.get_params())
            mlflow.log_metric("accuracy", accuracy_score(y_test, preds))
            mlflow.log_metric("f1_score", f1_score(y_test, preds))
            mlflow.log_metric("roc_auc", roc_auc_score(y_test, probs))
            mlflow.log_metric("precision", precision_score(y_test, preds))
            mlflow.log_metric("recall", recall_score(y_test, preds))

            signature = infer_signature(X_train_sm, model.predict(X_train_sm))
            input_example = X_train_sm[:5]

            mlflow.sklearn.log_model(
                model,
                artifact_path=model_name+"_SMOTE",
                registered_model_name=model_name+"_SMOTE",
                signature=signature,
                input_example=input_example
            )

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Successfully registered model 'LogisticRegression_SMOTE'.
2025/10/31 10:26:29 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LogisticRegression_SMOTE, version 1
Created version '1' of model 'LogisticRegression_SMOTE'.


🏃 View run LogisticRegression_SMOTE at: http://127.0.0.1:5001/#/experiments/1/runs/af91d79785ed4b019a6cc35ede7ed1a6
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/1


Successfully registered model 'RandomForest_SMOTE'.
2025/10/31 10:29:27 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: RandomForest_SMOTE, version 1
Created version '1' of model 'RandomForest_SMOTE'.


🏃 View run RandomForest_SMOTE at: http://127.0.0.1:5001/#/experiments/1/runs/5e7ff698c76041a0a70fbf9a266b102f
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/1


Successfully registered model 'XGBoost_SMOTE'.
2025/10/31 10:29:37 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: XGBoost_SMOTE, version 1
Created version '1' of model 'XGBoost_SMOTE'.


🏃 View run XGBoost_SMOTE at: http://127.0.0.1:5001/#/experiments/1/runs/a0a3a4b6b83746c49311bf25d63b63ce
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/1


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Successfully registered model 'NeuralNet_SMOTE'.
2025/10/31 10:30:23 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: NeuralNet_SMOTE, version 1


🏃 View run NeuralNet_SMOTE at: http://127.0.0.1:5001/#/experiments/1/runs/742a3185cb3641aa9b903e1ccf067eb3
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/1
🏃 View run Model_Comparison_SMOTE at: http://127.0.0.1:5001/#/experiments/1/runs/6e16fb69edfb4ff3bb75dd68d6f90a56
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/1


Created version '1' of model 'NeuralNet_SMOTE'.


### RF-Balanced & XGBoost-Scaled Runs
#### Executed on Non-SMOTE Data

In [9]:
# Needed for XGB
scale_pos_weight = len(y_train[y_train==0]) / len(y_train[y_train==1])

models = {
    "LogisticRegression": LogisticRegression(max_iter=2000),
    "RandomForest": RandomForestClassifier(n_estimators=200, class_weight="balanced", random_state=2),
    "XGBoost": XGBClassifier(n_estimators=300, learning_rate=0.05, max_depth=5, subsample=0.8, eval_metric="logloss", scale_pos_weight=scale_pos_weight, random_state=2),
    "NeuralNet": MLPClassifier(hidden_layer_sizes=(128, 64), activation='relu', solver='adam', max_iter=300, random_state=2)
}

with mlflow.start_run(run_name="Model_Comparison_SMOTE"):

    for model_name, model in models.items():
        with mlflow.start_run(run_name=f"{model_name}_SMOTE", nested=True):

            # Train on SMOTE-balanced data
            model.fit(X_train, y_train)

            # Evaluate on original (Non-SMOTE) test data
            preds = model.predict(X_test)
            probs = model.predict_proba(X_test)[:,1]

            mlflow.log_params(model.get_params())
            mlflow.log_metric("accuracy", accuracy_score(y_test, preds))
            mlflow.log_metric("f1_score", f1_score(y_test, preds))
            mlflow.log_metric("roc_auc", roc_auc_score(y_test, probs))
            mlflow.log_metric("precision", precision_score(y_test, preds))
            mlflow.log_metric("recall", recall_score(y_test, preds))

            signature = infer_signature(X_train, model.predict(X_train))
            input_example = X_train[:5]

            mlflow.sklearn.log_model(
                model,
                artifact_path=model_name+"_weight_scaled",
                registered_model_name=model_name+"_weight_scaled",
                signature=signature,
                input_example=input_example
            )

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Successfully registered model 'LogisticRegression_weight_scaled'.
2025/10/31 10:56:57 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LogisticRegression_weight_scaled, version 1
Created version '1' of model 'LogisticRegression_weight_scaled'.


🏃 View run LogisticRegression_SMOTE at: http://127.0.0.1:5001/#/experiments/1/runs/bce76bde999549278c1b7f051d28fee7
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/1


Successfully registered model 'RandomForest_weight_scaled'.
2025/10/31 10:58:39 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: RandomForest_weight_scaled, version 1
Created version '1' of model 'RandomForest_weight_scaled'.


🏃 View run RandomForest_SMOTE at: http://127.0.0.1:5001/#/experiments/1/runs/feb333e05e5042e5bbf257b822338f22
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/1


Successfully registered model 'XGBoost_weight_scaled'.
2025/10/31 10:58:44 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: XGBoost_weight_scaled, version 1
Created version '1' of model 'XGBoost_weight_scaled'.


🏃 View run XGBoost_SMOTE at: http://127.0.0.1:5001/#/experiments/1/runs/878ea1251235445395dabbb23b35a543
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/1


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Successfully registered model 'NeuralNet_weight_scaled'.
2025/10/31 11:02:01 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: NeuralNet_weight_scaled, version 1


🏃 View run NeuralNet_SMOTE at: http://127.0.0.1:5001/#/experiments/1/runs/c82a2915b87743c5878be812d4568ab7
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/1
🏃 View run Model_Comparison_SMOTE at: http://127.0.0.1:5001/#/experiments/1/runs/4550a3972c8b4c47bac6ded8a1c31a28
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/1


Created version '1' of model 'NeuralNet_weight_scaled'.
