# 04 - Model Selection

## 1. Load Processed Data
We load the cleaned and processed data for modeling.


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

df = pd.read_csv('../data/processed/credit_card_default_processed.csv')
X = df.drop('default_payment', axis=1)
y = df['default_payment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# scaler = StandardScaler()
# numerical_cols = ['LIMIT_BAL', 'Avg_Bill', 'Avg_Payment',
#                   'Total_Bill_vs_Payment_Ratio', 'Utilization_Trend']
# X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
# X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

X_train.shape, X_test.shape

((24000, 49), (6000, 49))

In [2]:
class_weights = {0: 1.0, 1: len(y_train[y_train == 0]) / len(y_train[y_train == 1])}

## 2. Baseline Model: Logistic Regression
Train a logistic regression as a baseline and evaluate with cross-validation.


In [9]:
from sklearn.linear_model import LogisticRegression
import sys
sys.path.append('..')
from src.modeling import cross_validate_model, train_model, grid_search

# Increase max_iter and use liblinear solver which is more robust for this dataset
logreg = LogisticRegression(max_iter=5000, solver='liblinear', class_weight=class_weights)
cv_scores_logreg = cross_validate_model(logreg, X_train, y_train, cv=5, scoring='roc_auc')
print('Logistic Regression CV ROC-AUC:', cv_scores_logreg.mean())
logreg = train_model(logreg, X_train, y_train)

Logistic Regression CV ROC-AUC: 0.6724566302611834


## 3. Random Forest Classifier
Try a Random Forest and compare performance.


In [13]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight=class_weights)
cv_scores_rf = cross_validate_model(rf, X_train, y_train, cv=5, scoring='roc_auc')
print('Random Forest CV ROC-AUC:', cv_scores_rf.mean())
rf = train_model(rf, X_train, y_train)

Random Forest CV ROC-AUC: 0.7600308167213954


## 4. Hyperparameter Optimization

### Logistic Regression with Grid Search

In [10]:
param_grid_lr = {
    'C': [0.01, 0.1, 1, 10], 
    'penalty': ['l1', 'l2'], 
    'solver': ['liblinear']
}

lr_grid = grid_search(logreg, param_grid_lr, X_train, y_train)

best_model, best_params, best_score = lr_grid

print("Best params:", best_params)
print("Best ROC-AUC:", best_score) 
best_lr = best_model

Best params: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
Best ROC-AUC: 0.7645321239593781


In [11]:
from src.utils import save_model

# Save the best Logistic Regression model
save_model(best_lr, 'best_logistic_regression.joblib')

Model saved to ../outputs/models/best_logistic_regression.joblib


### Random Forest with Grid Search

In [14]:
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
}
rf_grid = grid_search(rf, param_grid_rf, X_train, y_train)

best_model, best_params, best_score = rf_grid

print("Best params:", best_params)
print("Best ROC-AUC:", best_score) 
best_rf = best_model


Best params: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 200}
Best ROC-AUC: 0.7802273876091081


In [15]:
# Save the best Random Forest model
save_model(best_rf, 'best_random_forest.joblib')

Model saved to ../outputs/models/best_random_forest.joblib


## Decision Tree Classifier with Grid Search

In [7]:
from sklearn.tree import DecisionTreeClassifier
import sys
sys.path.append('..')

from src.modeling import grid_search, train_model
from src.utils import save_model

param_grid_dt = {
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10]
}
dt = DecisionTreeClassifier(class_weight=class_weights, random_state=42)
dt_grid = grid_search(dt, param_grid_dt, X_train, y_train)
best_model, best_params, best_score = dt_grid
print(f"Best params: {best_params}")
print(f"Best ROC-AUC: {best_score}")
best_dt = train_model(best_model, X_train, y_train)
save_model(best_dt, 'best_decision_tree.joblib')

Best params: {'max_depth': 5, 'min_samples_split': 2}
Best ROC-AUC: 0.7652902350548355
Model saved to ../outputs/models\best_decision_tree.joblib


## K-Nearest Neighbors (KNN) with Grid Search

In [3]:
from sklearn.neighbors import KNeighborsClassifier
import sys
sys.path.append('..')

from src.modeling import grid_search, train_model
from src.utils import save_model

param_grid_knn = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance']
}
knn = KNeighborsClassifier()
knn_grid = grid_search(knn, param_grid_knn, X_train, y_train)
best_model, best_params, best_score = knn_grid
print(f"Best params: {best_params}")
print(f"Best ROC-AUC: {best_score}")
best_knn = train_model(best_model, X_train, y_train)
save_model(best_knn, 'best_knn.joblib')

Best params: {'n_neighbors': 7, 'weights': 'distance'}
Best ROC-AUC: 0.6191835177858624
Model saved to ../outputs/models\best_knn.joblib
