## 📦 Cell 1 — Import & Load Data

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load processed data
df = pd.read_parquet("../../data/processed/processed_data.parquet")

# Separate features and target
X = df.drop(columns=["is_high_risk"])
y = df["is_high_risk"]

print("Data shape:", X.shape)
print("Target distribution:\n", y.value_counts(normalize=True))

Data shape: (76529, 24)
Target distribution:
 is_high_risk
0    0.941787
1    0.058213
Name: proportion, dtype: float64


## 📦 Cell 2 — Train-Test Split

In [20]:
# Features and target
X = df.drop(columns=["is_high_risk"])
y = df["is_high_risk"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

## 📦 Cell 3 — Logistic Regression

In [21]:
from sklearn.linear_model import LogisticRegression

# Simple baseline model
lr = LogisticRegression(
    random_state=42, 
    class_weight="balanced", 
    max_iter=500
)

lr.fit(X_train, y_train)

y_pred_lr = lr.predict(X_test)
y_prob_lr = lr.predict_proba(X_test)[:, 1]


## 📦 Cell 4 — Random Forest with Hyperparameter Search

In [25]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report,
)
import numpy as np
import mlflow
import mlflow.sklearn

# ---------------------------------------------------
# Drop leakage columns if present
# ---------------------------------------------------
cols_to_drop = ["FraudResult"]

for col in cols_to_drop:
    if col in X_train.columns:
        X_train = X_train.drop(columns=[col])
    if col in X_val.columns:
        X_val = X_val.drop(columns=[col])

# ---------------------------------------------------
# SMOTE
# ---------------------------------------------------
print("Class distribution BEFORE SMOTE:")
print(y_train.value_counts(normalize=True))

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print("Class distribution AFTER SMOTE:")
print(y_train_smote.value_counts(normalize=True))

# ---------------------------------------------------
# Random Forest Hyperparameter Tuning
# ---------------------------------------------------
param_grid = {
    "n_estimators": [100, 300, 500],
    "max_depth": [5, 10, 20, None],
    "min_samples_split": [2, 5, 10],
}

rf = RandomForestClassifier(
    random_state=42,
    class_weight="balanced"
)

search_rf = RandomizedSearchCV(
    rf,
    param_distributions=param_grid,
    n_iter=10,
    scoring="roc_auc",
    cv=3,
    n_jobs=-1,
    random_state=42,
    verbose=2
)

search_rf.fit(X_train_smote, y_train_smote)
best_rf = search_rf.best_estimator_

print("Best RF params:", search_rf.best_params_)

# ---------------------------------------------------
# Ensure columns match in validation set
# ---------------------------------------------------
missing_cols = set(X_train_smote.columns) - set(X_val.columns)
for col in missing_cols:
    X_val[col] = 0

# Reorder columns to match
X_val = X_val[X_train_smote.columns]

# ---------------------------------------------------
# Predict probabilities on Validation Set
# ---------------------------------------------------
y_prob_rf = best_rf.predict_proba(X_val)[:, 1]

# ---------------------------------------------------
# Threshold Tuning
# ---------------------------------------------------
thresholds = np.arange(0.1, 0.9, 0.05)
best_thresh = 0.5
best_f1 = 0

for t in thresholds:
    preds = (y_prob_rf >= t).astype(int)
    f1 = f1_score(y_val, preds, zero_division=0)
    if f1 > best_f1:
        best_f1 = f1
        best_thresh = t

print(f"\nOptimal Threshold found: {best_thresh:.2f}")
print(f"Best F1 score at optimal threshold: {best_f1:.4f}")

# ---------------------------------------------------
# Final predictions and metrics
# ---------------------------------------------------
y_pred_rf = (y_prob_rf >= best_thresh).astype(int)

metrics_rf = {
    "accuracy": accuracy_score(y_val, y_pred_rf),
    "precision": precision_score(y_val, y_pred_rf, zero_division=0),
    "recall": recall_score(y_val, y_pred_rf, zero_division=0),
    "f1": f1_score(y_val, y_pred_rf, zero_division=0),
    "roc_auc": roc_auc_score(y_val, y_prob_rf),
}

print("\nRandom Forest Metrics after SMOTE + Threshold Tuning:")
for k, v in metrics_rf.items():
    print(f"{k}: {v:.4f}")

print("\nClassification Report:")
print(classification_report(y_val, y_pred_rf, digits=4))

# ---------------------------------------------------
# MLflow Logging
# ---------------------------------------------------
mlflow.set_experiment("credit-risk-modeling")

with mlflow.start_run(run_name="RandomForest_SMOTE_ThresholdTuned") as run:
    mlflow.log_params(search_rf.best_params_)
    mlflow.log_metrics(metrics_rf)
    mlflow.sklearn.log_model(best_rf, "model")
    mlflow.log_param("optimal_threshold", best_thresh)
    print("Logged Random Forest model to MLflow run", run.info.run_id)


Class distribution BEFORE SMOTE:
is_high_risk
0    0.941787
1    0.058213
Name: proportion, dtype: float64
Class distribution AFTER SMOTE:
is_high_risk
0    0.5
1    0.5
Name: proportion, dtype: float64
Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END max_depth=20, min_samples_split=5, n_estimators=100; total time=  14.5s
[CV] END max_depth=20, min_samples_split=5, n_estimators=100; total time=  14.8s
[CV] END max_depth=None, min_samples_split=5, n_estimators=100; total time=  15.2s
[CV] END max_depth=None, min_samples_split=5, n_estimators=100; total time=  15.3s
[CV] END max_depth=None, min_samples_split=5, n_estimators=100; total time=  16.1s
[CV] END max_depth=10, min_samples_split=5, n_estimators=100; total time=  10.0s
[CV] END max_depth=10, min_samples_split=5, n_estimators=100; total time=  10.2s
[CV] END max_depth=10, min_samples_split=5, n_estimators=100; total time=  10.9s
[CV] END max_depth=10, min_samples_split=5, n_estimators=300; total time=  29.8s
[

## 📦 Cell 5 — Evaluation Function

In [None]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)

def evaluate_model(y_true, y_pred, y_prob):
    metrics = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "recall": recall_score(y_true, y_pred, zero_division=0),
        "f1": f1_score(y_true, y_pred, zero_division=0),
        "roc_auc": roc_auc_score(y_true, y_prob),
    }
    return metrics

lr_metrics = evaluate_model(y_test, y_pred_lr, y_prob_lr)
rf_metrics = evaluate_model(y_test, y_pred_rf, y_prob_rf)

print("Logistic Regression metrics:\n", lr_metrics)
print("Random Forest metrics:\n", rf_metrics)


Logistic Regression metrics:
 {'accuracy': 0.5715405723245786, 'precision': 0.11776608660461352, 'recall': 0.9797979797979798, 'f1': 0.21026011560693642, 'roc_auc': 0.7787303800715756}
Random Forest metrics:
 {'accuracy': 0.9315301189076179, 'precision': 0.3794162826420891, 'recall': 0.2772166105499439, 'f1': 0.3203631647211414, 'roc_auc': 0.9151988532957431}


## 📦 Cell 6 — Track in MLflow

In [None]:
import mlflow
import mlflow.sklearn

mlflow.set_tracking_uri("../../mlruns")
mlflow.set_experiment("credit-risk-modeling")

with mlflow.start_run(run_name="RandomForest") as run:
    mlflow.sklearn.log_model(best_rf, "model")
    mlflow.log_params(search.best_params_)
    mlflow.log_metrics(rf_metrics)
    
    run_id = run.info.run_id

model_uri = f"runs:/{run_id}/model"
mlflow.register_model(model_uri, "CreditRiskModel")

print("Model registered in MLflow Model Registry!")


Registered model 'CreditRiskModel' already exists. Creating a new version of this model...
2025/06/29 13:05:36 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: CreditRiskModel, version 2


Model registered in MLflow Model Registry!


Created version '2' of model 'CreditRiskModel'.


In [24]:
print("Class distribution BEFORE SMOTE:")
print(y_train.value_counts(normalize=True))

# ------------------------------------------------------------
# Apply SMOTE
# ------------------------------------------------------------

# Instantiate SMOTE
smote = SMOTE(random_state=42)

X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("Class distribution AFTER SMOTE:")
print(pd.Series(y_train_resampled).value_counts(normalize=True))

# ------------------------------------------------------------
# Train GBM with hyperparameter tuning
# ------------------------------------------------------------

param_grid = {
    "n_estimators": [100, 200],
    "learning_rate": [0.05, 0.1],
    "max_depth": [3, 5],
    "subsample": [0.8],
}

gbm = GradientBoostingClassifier(random_state=42)

search_gbm = GridSearchCV(
    gbm,
    param_grid,
    scoring="f1",
    cv=3,
    n_jobs=-1,
    verbose=2,
)

search_gbm.fit(X_train_resampled, y_train_resampled)

best_gbm = search_gbm.best_estimator_

print("Best parameters for GBM:")
print(search_gbm.best_params_)

# ------------------------------------------------------------
# Evaluate on validation set
# ------------------------------------------------------------

y_pred = best_gbm.predict(X_val)
y_proba = best_gbm.predict_proba(X_val)[:, 1]

metrics_gbm = {
    "accuracy": accuracy_score(y_val, y_pred),
    "precision": precision_score(y_val, y_pred, zero_division=0),
    "recall": recall_score(y_val, y_pred, zero_division=0),
    "f1": f1_score(y_val, y_pred, zero_division=0),
    "roc_auc": roc_auc_score(y_val, y_proba),
}

print("\nGradient Boosting Validation Metrics after SMOTE:")
for metric, value in metrics_gbm.items():
    print(f"{metric}: {value:.4f}")

# ------------------------------------------------------------
# Log to MLflow
# ------------------------------------------------------------

# Ensure MLflow tracking directory exists
import os

tracking_dir = "../../mlruns"
if not os.path.exists(tracking_dir):
    os.makedirs(tracking_dir)

mlflow.set_tracking_uri(f"file:{tracking_dir}")
mlflow.set_experiment("credit-risk-modeling")

with mlflow.start_run(run_name="GBM_with_SMOTE") as run:
    mlflow.log_params(search_gbm.best_params_)
    mlflow.log_metrics(metrics_gbm)
    mlflow.sklearn.log_model(best_gbm, "model")
    
    run_id = run.info.run_id
    print(f"Logged GBM model to MLflow run {run_id}")



Class distribution BEFORE SMOTE:
is_high_risk
0    0.941787
1    0.058213
Name: proportion, dtype: float64
Class distribution AFTER SMOTE:
is_high_risk
0    0.5
1    0.5
Name: proportion, dtype: float64
Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV] END learning_rate=0.05, max_depth=3, n_estimators=100, subsample=0.8; total time=  22.3s
[CV] END learning_rate=0.05, max_depth=3, n_estimators=100, subsample=0.8; total time=  22.6s
[CV] END learning_rate=0.1, max_depth=3, n_estimators=100, subsample=0.8; total time=  22.5s
[CV] END learning_rate=0.1, max_depth=3, n_estimators=100, subsample=0.8; total time=  22.7s
[CV] END learning_rate=0.05, max_depth=3, n_estimators=100, subsample=0.8; total time=  23.1s
[CV] END learning_rate=0.1, max_depth=3, n_estimators=100, subsample=0.8; total time=  22.8s
[CV] END learning_rate=0.1, max_depth=5, n_estimators=100, subsample=0.8; total time=  33.8s
[CV] END learning_rate=0.05, max_depth=5, n_estimators=100, subsample=0.8; total ti

In [27]:
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report,
)
import numpy as np
import mlflow
import mlflow.xgboost

# ---------------------------------------------------
# Drop leakage columns if present
# ---------------------------------------------------
cols_to_drop = ["FraudResult"]

for col in cols_to_drop:
    if col in X_train.columns:
        X_train = X_train.drop(columns=[col])
    if col in X_val.columns:
        X_val = X_val.drop(columns=[col])

# ---------------------------------------------------
# SMOTE
# ---------------------------------------------------
print("Class distribution BEFORE SMOTE:")
print(y_train.value_counts(normalize=True))

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print("Class distribution AFTER SMOTE:")
print(y_train_smote.value_counts(normalize=True))

# ---------------------------------------------------
# XGBoost Hyperparameter Tuning
# ---------------------------------------------------
param_grid = {
    "n_estimators": [100, 300, 500],
    "max_depth": [3, 6, 10],
    "learning_rate": [0.01, 0.05, 0.1],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "scale_pos_weight": [1, 2, 5],  # handle imbalance
}

xgb = XGBClassifier(
    objective="binary:logistic",
    eval_metric="auc",
    use_label_encoder=False,
    random_state=42
)

search_xgb = RandomizedSearchCV(
    xgb,
    param_distributions=param_grid,
    n_iter=10,
    scoring="roc_auc",
    cv=3,
    n_jobs=-1,
    random_state=42,
    verbose=2
)

search_xgb.fit(X_train_smote, y_train_smote)
best_xgb = search_xgb.best_estimator_

print("Best XGB params:", search_xgb.best_params_)

# ---------------------------------------------------
# Ensure columns match in validation set
# ---------------------------------------------------
missing_cols = set(X_train_smote.columns) - set(X_val.columns)
for col in missing_cols:
    X_val[col] = 0

# Reorder columns to match
X_val = X_val[X_train_smote.columns]

# ---------------------------------------------------
# Predict probabilities on Validation Set
# ---------------------------------------------------
y_prob_xgb = best_xgb.predict_proba(X_val)[:, 1]

# ---------------------------------------------------
# Threshold Tuning
# ---------------------------------------------------
thresholds = np.arange(0.1, 0.9, 0.05)
best_thresh = 0.5
best_f1 = 0

for t in thresholds:
    preds = (y_prob_xgb >= t).astype(int)
    f1 = f1_score(y_val, preds, zero_division=0)
    if f1 > best_f1:
        best_f1 = f1
        best_thresh = t

print(f"\nOptimal Threshold found: {best_thresh:.2f}")
print(f"Best F1 score at optimal threshold: {best_f1:.4f}")

# ---------------------------------------------------
# Final predictions and metrics
# ---------------------------------------------------
y_pred_xgb = (y_prob_xgb >= best_thresh).astype(int)

metrics_xgb = {
    "accuracy": accuracy_score(y_val, y_pred_xgb),
    "precision": precision_score(y_val, y_pred_xgb, zero_division=0),
    "recall": recall_score(y_val, y_pred_xgb, zero_division=0),
    "f1": f1_score(y_val, y_pred_xgb, zero_division=0),
    "roc_auc": roc_auc_score(y_val, y_prob_xgb),
}

print("\nXGBoost Metrics after SMOTE + Threshold Tuning:")
for k, v in metrics_xgb.items():
    print(f"{k}: {v:.4f}")

print("\nClassification Report:")
print(classification_report(y_val, y_pred_xgb, digits=4))

# ---------------------------------------------------
# MLflow Logging
# ---------------------------------------------------
mlflow.set_experiment("credit-risk-modeling")

with mlflow.start_run(run_name="XGBoost_SMOTE_ThresholdTuned") as run:
    mlflow.log_params(search_xgb.best_params_)
    mlflow.log_metrics(metrics_xgb)
    mlflow.xgboost.log_model(best_xgb, "model")
    mlflow.log_param("optimal_threshold", best_thresh)
    print("Logged XGBoost model to MLflow run", run.info.run_id)


Class distribution BEFORE SMOTE:
is_high_risk
0    0.941787
1    0.058213
Name: proportion, dtype: float64
Class distribution AFTER SMOTE:
is_high_risk
0    0.5
1    0.5
Name: proportion, dtype: float64
Fitting 3 folds for each of 10 candidates, totalling 30 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=6, n_estimators=100, scale_pos_weight=2, subsample=0.8; total time=   1.5s
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=6, n_estimators=100, scale_pos_weight=2, subsample=0.8; total time=   1.6s
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=6, n_estimators=100, scale_pos_weight=2, subsample=0.8; total time=   1.7s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=1.0, learning_rate=0.01, max_depth=6, n_estimators=100, scale_pos_weight=1, subsample=1.0; total time=   1.8s
[CV] END colsample_bytree=1.0, learning_rate=0.01, max_depth=6, n_estimators=100, scale_pos_weight=1, subsample=1.0; total time=   1.6s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=1.0, learning_rate=0.01, max_depth=6, n_estimators=100, scale_pos_weight=1, subsample=1.0; total time=   2.1s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.6, learning_rate=0.05, max_depth=3, n_estimators=100, scale_pos_weight=1, subsample=0.6; total time=   1.0s
[CV] END colsample_bytree=0.6, learning_rate=0.05, max_depth=3, n_estimators=100, scale_pos_weight=1, subsample=0.6; total time=   1.4s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=10, n_estimators=100, scale_pos_weight=2, subsample=1.0; total time=   3.4s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=10, n_estimators=100, scale_pos_weight=2, subsample=1.0; total time=   3.4s
[CV] END colsample_bytree=0.6, learning_rate=0.05, max_depth=3, n_estimators=100, scale_pos_weight=1, subsample=0.6; total time=   1.4s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=10, n_estimators=100, scale_pos_weight=2, subsample=1.0; total time=   3.7s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=500, scale_pos_weight=2, subsample=1.0; total time=   3.8s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=500, scale_pos_weight=2, subsample=1.0; total time=   3.9s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=500, scale_pos_weight=2, subsample=1.0; total time=   4.3s
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=3, n_estimators=500, scale_pos_weight=1, subsample=0.6; total time=   3.7s
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=3, n_estimators=500, scale_pos_weight=1, subsample=0.6; total time=   3.7s
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=3, n_estimators=500, scale_pos_weight=1, subsample=0.6; total time=   4.0s
[CV] END colsample_bytree=0.6, learning_rate=0.05, max_depth=6, n_estimators=500, scale_pos_weight=5, subsample=0.8; total time=   5.6s
[CV] END colsample_bytree=0.6, learning_rate=0.0