# Loan Payback Prediction - Ensemble Model (XGBoost + Logistic Regression)

## Goal
Predict the probability that a borrower will pay back their loan using an ensemble of XGBoost and Logistic Regression models. Both models are tuned with a randomized hyperparameter search and then combined with a soft-voting `VotingClassifier`, which averages their predicted probabilities; performance is measured by ROC AUC.

## Steps
1.  Data Loading
2.  Feature Engineering
3.  Hyperparameter Tuning (RandomizedSearchCV)
4.  Ensemble Modeling (VotingClassifier)
5.  Model Evaluation (Stratified K-Fold CV)
6.  Submission Generation

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import warnings

warnings.filterwarnings('ignore')

## 1. Data Loading

In [2]:
# Read the train, test and sample-submission CSVs from the local data/ folder
# (same order as listed; each becomes its own DataFrame).
train_df, test_df, sample_submission = map(
    pd.read_csv,
    ('data/train.csv', 'data/test.csv', 'data/sample_submission.csv'),
)

# Report (rows, columns) of both splits as a quick sanity check.
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

Train shape: (593994, 13)
Test shape: (254569, 12)


## 2. Feature Engineering

In [3]:
# Combine train and test so every encoding below sees the same category levels.
# `.assign` returns new frames, so the raw `train_df` / `test_df` are NOT
# modified by this cell (the submission cell still reads `test_df['id']`), and
# the cell gives the same result no matter how often it is re-run.
full_df = pd.concat(
    [
        train_df.assign(is_train=1),
        # Test rows have no label; a NaN placeholder keeps the columns aligned.
        test_df.assign(is_train=0, loan_paid_back=np.nan),
    ],
    axis=0,
).reset_index(drop=True)

# 1. Log transformation (log(1 + x)) for skewed features
skewed_features = ['annual_income', 'loan_amount', 'debt_to_income_ratio']
full_df[skewed_features] = np.log1p(full_df[skewed_features])

# 2. Ordinal encoding for grade_subgrade: each distinct code gets its rank in
#    sorted (lexicographic) order.
#    NOTE(review): assumes lexicographic order matches the intended grade
#    ordering (e.g. 'A1' < 'A2' < ... ) -- TODO confirm against the data.
#    Missing values are excluded from the mapping (sorting a mix of str and NaN
#    would raise TypeError); such rows stay NaN in the encoded column.
grades = sorted(full_df['grade_subgrade'].dropna().unique())
grade_map = {grade: rank for rank, grade in enumerate(grades)}
full_df['grade_subgrade_encoded'] = full_df['grade_subgrade'].map(grade_map)

# 3. One-hot encoding for the remaining categoricals; drop_first removes one
#    level per feature so the dummy columns are not perfectly collinear.
categorical_cols = ['gender', 'marital_status', 'education_level', 'employment_status', 'loan_purpose']
full_df = pd.get_dummies(full_df, columns=categorical_cols, drop_first=True)

# Drop original grade_subgrade (now encoded) and the id column
full_df = full_df.drop(columns=['grade_subgrade', 'id'])

# Split back into the two original partitions using the helper flag.
train_processed = full_df[full_df['is_train'] == 1].drop(columns=['is_train'])
test_processed = full_df[full_df['is_train'] == 0].drop(columns=['is_train', 'loan_paid_back'])

X = train_processed.drop(columns='loan_paid_back')
y = train_processed['loan_paid_back']
X_test = test_processed

# Invariants: no rows lost in the round trip, identical feature layout.
assert len(X) == len(train_df) and len(X_test) == len(test_df), "row count changed during feature engineering"
assert list(X.columns) == list(X_test.columns), "train/test feature columns differ"

print(f"Processed Train shape: {X.shape}")
print(f"Processed Test shape: {X_test.shape}")

Processed Train shape: (593994, 26)
Processed Test shape: (254569, 26)


## 3. Hyperparameter Tuning

In [4]:
# Calculate scale_pos_weight for XGBoost as the negative/positive label ratio.
# Assumes y is a 0/1 target so that y.sum() counts the positive rows.
num_pos = y.sum()
num_neg = len(y) - num_pos
scale_pos_weight = num_neg / num_pos

# --- XGBoost Tuning ---
# Parallelism is applied at ONE level only: XGBoost uses every core for each
# fit (n_jobs=-1 here) while the search below runs its candidates sequentially
# (n_jobs=1). Setting both to -1 makes every search worker spawn a full set of
# XGBoost threads, and the resulting thread contention slows training down.
xgb_clf = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    n_jobs=-1,
    random_state=42,
    tree_method='hist' # Faster training
)

# Candidate values sampled by the randomized search. scale_pos_weight is tried
# both unweighted (1) and with the label ratio computed above.
xgb_params = {
    'n_estimators': [1000, 2000, 3000],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'scale_pos_weight': [1, scale_pos_weight]
}

# Using RandomizedSearchCV for speed
xgb_search = RandomizedSearchCV(
    xgb_clf,
    param_distributions=xgb_params,
    n_iter=5, # Limited iterations for demonstration speed
    scoring='roc_auc',
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=1  # sequential candidates; each XGBoost fit already uses all cores
)

print("Tuning XGBoost...")
xgb_search.fit(X, y)
print(f"Best XGBoost Params: {xgb_search.best_params_}")
print(f"Best XGBoost Score: {xgb_search.best_score_}")
# Best configuration, refit by the search on the data passed to fit().
best_xgb = xgb_search.best_estimator_

Tuning XGBoost...
Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best XGBoost Params: {'subsample': 0.8, 'scale_pos_weight': np.float64(0.25184723094496453), 'n_estimators': 3000, 'max_depth': 3, 'learning_rate': 0.1, 'colsample_bytree': 0.9}
Best XGBoost Score: 0.9215151916463705


In [5]:
# --- Logistic Regression Tuning ---
# Imputation and scaling are bundled with the classifier in one pipeline, so
# both are re-fitted on the training part of every CV split.
lr_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('lr', LogisticRegression(solver='saga', max_iter=1000, random_state=42)),
]
lr_pipeline = Pipeline(steps=lr_steps)

# Search space; the `lr__` prefix routes each parameter to the 'lr' step.
lr_params = {
    'lr__C': [0.01, 0.1, 1, 10],
    'lr__penalty': ['l1', 'l2']
}

# Same search budget and scoring as used for the XGBoost search.
lr_search_settings = dict(
    n_iter=5,
    scoring='roc_auc',
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1,
)
lr_search = RandomizedSearchCV(
    lr_pipeline,
    param_distributions=lr_params,
    **lr_search_settings,
)

print("\nTuning Logistic Regression...")
lr_search.fit(X, y)
print(f"Best LR Params: {lr_search.best_params_}")
print(f"Best LR Score: {lr_search.best_score_}")

# Best pipeline (imputer + scaler + classifier) found by the search.
best_lr = lr_search.best_estimator_


Tuning Logistic Regression...
Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best LR Params: {'lr__penalty': 'l1', 'lr__C': 0.01}
Best LR Score: 0.9102459889557615


## 4. Ensemble Modeling (VotingClassifier)

In [6]:
# Named members of the ensemble: the tuned XGBoost model and the tuned
# logistic-regression pipeline.
ensemble_members = [('xgb', best_xgb), ('lr', best_lr)]

# voting='soft' combines the members through their predicted class
# probabilities rather than through hard label votes.
ensemble_clf = VotingClassifier(
    estimators=ensemble_members,
    voting='soft',
    n_jobs=-1,
)

## 5. Model Evaluation (Stratified K-Fold CV)

In [7]:
# Number of stratified folds used to score the ensemble.
FOLDS = 5

# Shuffled, seeded splitter: folds are reproducible across runs.
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

# One ROC AUC value per fold.
# NOTE(review): the hyperparameters were tuned on this same X, y, so these
# scores are likely somewhat optimistic.
cv_scores = cross_val_score(
    estimator=ensemble_clf,
    X=X,
    y=y,
    scoring='roc_auc',
    cv=skf,
    n_jobs=-1,
)

print(f"Ensemble CV ROC AUC Scores: {cv_scores}")
print(f"Average Ensemble CV ROC AUC: {np.mean(cv_scores):.5f} +/- {np.std(cv_scores):.5f}")

Ensemble CV ROC AUC Scores: [0.92206032 0.92164786 0.9196514  0.92046015 0.92034931]
Average Ensemble CV ROC AUC: 0.92083 +/- 0.00089


## 6. Submission Generation

In [8]:
# Refit the ensemble on every training row before scoring the test set.
print("Retraining ensemble on full data...")
ensemble_clf.fit(X, y)

# predict_proba returns one column per class; column 1 is kept as the
# prediction (presumably the "paid back" class for a 0/1 target -- confirm).
test_proba = ensemble_clf.predict_proba(X_test)
test_preds = test_proba[:, 1]

# Pair each prediction with the id taken from the raw test frame.
submission_columns = {
    'id': test_df['id'],
    'loan_paid_back': test_preds,
}
submission = pd.DataFrame(submission_columns)

submission.to_csv('submission_ensemble.csv', index=False)
print("Submission saved to submission_ensemble.csv")

# Preview the first rows of the written file's contents.
submission.head()

Retraining ensemble on full data...
Submission saved to submission_ensemble.csv


Unnamed: 0,id,loan_paid_back
0,593994,0.861872
1,593995,0.935717
2,593996,0.165388
3,593997,0.817991
4,593998,0.91452
