# 04 - Model Selection

## 1. Load Processed Data
We load the cleaned and processed data for modeling.


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

df = pd.read_csv('../data/processed/credit_card_default_processed.csv')
X = df.drop('default_payment', axis=1)
y = df['default_payment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# scaler = StandardScaler()
# numerical_cols = [
#     'LIMIT_BAL',
#     'PAY_1', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
#     'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 
#     'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 
#     'AGE'
# ]
# X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
# X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

X_train.shape, X_test.shape

((24000, 56), (6000, 56))

In [2]:
class_weights = {0: 1.0, 1: len(y_train[y_train == 0]) / len(y_train[y_train == 1])}

## 2. Baseline Model: Logistic Regression
Train a logistic regression as a baseline and evaluate with cross-validation.


In [3]:
from sklearn.linear_model import LogisticRegression
import sys
sys.path.append('..')
from src.modeling import cross_validate_model, train_model, grid_search

# Increase max_iter and use liblinear solver which is more robust for this dataset
logreg = LogisticRegression(max_iter=5000, solver='liblinear', class_weight=class_weights)
cv_scores_logreg = cross_validate_model(logreg, X_train, y_train, cv=5, scoring='roc_auc')
print('Logistic Regression CV ROC-AUC:', cv_scores_logreg.mean())
logreg = train_model(logreg, X_train, y_train)

Logistic Regression CV ROC-AUC: 0.6441249144239286


## 3. Random Forest Classifier
Try a Random Forest and compare performance.


In [7]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight=class_weights)
cv_scores_rf = cross_validate_model(rf, X_train, y_train, cv=5, scoring='roc_auc')
print('Random Forest CV ROC-AUC:', cv_scores_rf.mean())
rf = train_model(rf, X_train, y_train)

Random Forest CV ROC-AUC: 0.7609361706270178


## 4. Hyperparameter Optimization

### Logistic Regression with Grid Search

In [None]:
param_grid_lr = {
    'C': [0.01, 0.1, 1, 10], 
    'penalty': ['l1', 'l2'], 
    'solver': ['liblinear']
}

lr_grid = grid_search(logreg, param_grid_lr, X_train, y_train)

best_model, best_params, best_score = lr_grid

print("Best params:", best_params)
print("Best ROC-AUC:", best_score) 
best_lr = best_model

In [75]:
from src.utils import save_model

# Save the best Logistic Regression model
save_model(best_lr, 'best_logistic_regression.joblib')

Model saved to ../outputs/models/best_logistic_regression.joblib


### Random Forest with Grid Search

In [8]:
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
}
rf_grid = grid_search(rf, param_grid_rf, X_train, y_train)

best_model, best_params, best_score = rf_grid

print("Best params:", best_params)
print("Best ROC-AUC:", best_score) 
best_rf = best_model


Best params: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 300}
Best ROC-AUC: 0.7797540404941591


In [9]:
# Save the best Random Forest model
save_model(best_rf, 'best_random_forest.joblib')

NameError: name 'save_model' is not defined