In [19]:
import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import fbeta_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [20]:
# --- Reproducibility / validation settings ---
RANDOM_STATE = 42   # single seed shared by the CV splitter and every estimator below
KFOLD_SPLITS = 10   # number of folds handed to StratifiedKFold
TEST_SIZE = 0.2     # presumably a hold-out fraction; not referenced by the cells visible here

# --- MLflow destination ---
MLFLOW_TRACKING_URI = "http://127.0.0.1:5000"      # tracking server address (localhost)
EXPERIMENT_NAME = "Loan-Defaulters-Models-Kfold"   # experiment that receives one run per model

# Registry of candidate classifiers, keyed by the name used for the MLflow run.
# Each entry pairs an estimator class ('model') with the keyword arguments
# ('params') it is instantiated with; the same 'params' dict is what gets
# logged to MLflow.
MODELS = {
    'RandomForest': dict(
        model=RandomForestClassifier,
        params=dict(
            n_estimators=150,
            class_weight='balanced',
            random_state=RANDOM_STATE,
            bootstrap=False,
        ),
    ),
    'LogisticRegression': dict(
        model=LogisticRegression,
        params=dict(
            solver='liblinear',
            class_weight='balanced',
            random_state=RANDOM_STATE,
        ),
    ),
    'GradientBoosting': dict(
        model=GradientBoostingClassifier,
        params=dict(
            n_estimators=100,
            learning_rate=0.1,
            max_depth=3,
            random_state=RANDOM_STATE,
        ),
    ),
    'XGBoost': dict(
        model=XGBClassifier,
        params=dict(
            n_estimators=100,
            learning_rate=0.1,
            # NOTE(review): newer xgboost releases may no longer accept this
            # flag -- confirm against the installed version.
            use_label_encoder=False,
            eval_metric='logloss',
            random_state=RANDOM_STATE,
        ),
    ),
    'SVC': dict(
        model=SVC,
        params=dict(
            probability=True,  # needed for SVC to expose predict_proba
            kernel='rbf',
            C=1.0,
            class_weight='balanced',
            random_state=RANDOM_STATE,
        ),
    ),
}

In [21]:
# Point the MLflow client at the tracking server first, then select the
# experiment that subsequent runs are recorded under. The second call is left
# as the cell's last expression so the notebook displays the Experiment object.
mlflow.set_tracking_uri(uri=MLFLOW_TRACKING_URI)
mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)

<Experiment: artifact_location='mlflow-artifacts:/608014518094895647', creation_time=1749380274966, experiment_id='608014518094895647', last_update_time=1749380274966, lifecycle_stage='active', name='Loan-Defaulters-Models-Kfold', tags={}>

In [22]:
# Read the training table, then separate the feature matrix (every column
# except 'loan_status') from the target column.
data = pd.read_csv('../training_data/balanced_training_data.csv')
X = data.drop(columns='loan_status')
y = data['loan_status']

In [23]:
# Stratified splitter reused for every model; shuffling with a fixed seed
# keeps the fold assignment reproducible between runs.
skf = StratifiedKFold(
    n_splits=KFOLD_SPLITS,
    shuffle=True,
    random_state=RANDOM_STATE,
)

In [None]:
# For every entry in MODELS: open one MLflow run, cross-validate the estimator
# with the shared `skf` splitter, log parameters and metrics computed on the
# pooled out-of-fold predictions, then refit on the full data and log that
# fitted model as the run's artifact.
for model_name, config in MODELS.items():
    print(model_name)
    with mlflow.start_run(run_name=model_name):
        ModelClass = config['model']
        params = config['params']

        # Out-of-fold results pooled across all folds. Every row lands in
        # exactly one validation fold, so each list ends up with len(y) items.
        y_true_all = []
        y_pred_all = []
        y_proba_all = []  # stays empty when the estimator has no predict_proba

        for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
            print(f'\tFold: {fold}')
            X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
            y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

            # A fresh estimator per fold so nothing fitted leaks between folds.
            model = ModelClass(**params)
            model.fit(X_train_fold, y_train_fold)

            y_true_all.extend(y_val_fold)
            y_pred_all.extend(model.predict(X_val_fold))

            if hasattr(model, "predict_proba"):
                # Column 1 is the probability of the second entry of
                # model.classes_ (label 1 for 0/1 targets).
                y_proba_all.extend(model.predict_proba(X_val_fold)[:, 1])

        report_dict = classification_report(y_true_all, y_pred_all, output_dict=True, zero_division=0)

        mlflow.log_params(params)
        metrics = {
            'accuracy': report_dict['accuracy'],
            'recall_class_0': report_dict['0']['recall'],
            'recall_class_1': report_dict['1']['recall'],
            'precision_class_0': report_dict['0']['precision'],
            'precision_class_1': report_dict['1']['precision'],
            'f1_score_macro': report_dict['macro avg']['f1-score'],
            # True macro F2: the unweighted mean of the per-class F2 scores.
            # Feeding macro-averaged precision/recall into the F-beta formula
            # (the previous approach) yields a different number.
            'f2_score_macro': float(
                fbeta_score(y_true_all, y_pred_all, beta=2, average='macro', zero_division=0)
            ),
        }

        # The pooled out-of-fold probabilities give a threshold-independent
        # score; skipped for estimators that produced no probabilities.
        if y_proba_all:
            metrics['roc_auc'] = float(roc_auc_score(y_true_all, y_proba_all))

        mlflow.log_metrics(metrics)

        # Fit final model on full data to log. The metrics above describe the
        # cross-validated fits, not this refit.
        final_model = ModelClass(**params)
        final_model.fit(X, y)
        mlflow.sklearn.log_model(final_model, artifact_path=f"{model_name}_model")

RandomForest
	Fold: 0
