# Loan Payback Prediction - XGBoost Model

## Goal
Predict the probability that a borrower will pay back their loan using an XGBoost model with Cross-Validation.

## Steps
1.  Data Loading
2.  Feature Engineering
3.  Model Training (Stratified K-Fold CV)
4.  Submission Generation

In [10]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OrdinalEncoder
import warnings

warnings.filterwarnings('ignore')

## 1. Data Loading

In [11]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')
sample_submission = pd.read_csv('data/sample_submission.csv')

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

Train shape: (593994, 13)
Test shape: (254569, 12)


## 2. Feature Engineering

In [12]:
# Combine for consistent encoding
train_df['is_train'] = 1
test_df['is_train'] = 0
test_df['loan_paid_back'] = np.nan

full_df = pd.concat([train_df, test_df], axis=0).reset_index(drop=True)

# 1. Log Transformation for skewed features
skewed_features = ['annual_income', 'loan_amount', 'debt_to_income_ratio']
for col in skewed_features:
    full_df[col] = np.log1p(full_df[col])

# 2. Ordinal Encoding for grade_subgrade
# Create a mapping from A1..G5 to 1..35
grades = sorted(full_df['grade_subgrade'].unique())
grade_map = {grade: i for i, grade in enumerate(grades)}
full_df['grade_subgrade_encoded'] = full_df['grade_subgrade'].map(grade_map)

# 3. One-Hot Encoding for other categoricals
categorical_cols = ['gender', 'marital_status', 'education_level', 'employment_status', 'loan_purpose']
full_df = pd.get_dummies(full_df, columns=categorical_cols, drop_first=True)

# Drop original grade_subgrade and id
full_df.drop(['grade_subgrade', 'id'], axis=1, inplace=True)

# Split back
train_processed = full_df[full_df['is_train'] == 1].drop(['is_train'], axis=1)
test_processed = full_df[full_df['is_train'] == 0].drop(['is_train', 'loan_paid_back'], axis=1)

X = train_processed.drop('loan_paid_back', axis=1)
y = train_processed['loan_paid_back']
X_test = test_processed

print(f"Processed Train shape: {X.shape}")
print(f"Processed Test shape: {X_test.shape}")

Processed Train shape: (593994, 26)
Processed Test shape: (254569, 26)


## 3. Model Training (Stratified K-Fold CV)

In [13]:
# Calculate scale_pos_weight
num_pos = y.sum()
num_neg = len(y) - num_pos
scale_pos_weight = num_neg / num_pos

print(f"Scale Pos Weight: {scale_pos_weight:.4f}")

FOLDS = 5
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

oof_preds = np.zeros(X.shape[0])
test_preds = np.zeros(X_test.shape[0])
cv_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    model = xgb.XGBClassifier(
        n_estimators=5000,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=scale_pos_weight,
        random_state=42,
        n_jobs=-1,
        early_stopping_rounds=4000,
        eval_metric='auc'
    )
    
    model.fit(
        X_train, y_train, 
        eval_set=[(X_val, y_val)], 
        verbose=100
    )
    
    val_preds = model.predict_proba(X_val)[:, 1]
    oof_preds[val_idx] = val_preds
    
    score = roc_auc_score(y_val, val_preds)
    cv_scores.append(score)
    print(f"Fold {fold+1} ROC AUC: {score:.5f}")
    
    # Predict on test set (average over folds)
    test_preds += model.predict_proba(X_test)[:, 1] / FOLDS

print(f"\nAverage CV ROC AUC: {np.mean(cv_scores):.5f}")
print(f"OOF ROC AUC: {roc_auc_score(y, oof_preds):.5f}")

Scale Pos Weight: 0.2518
[0]	validation_0-auc:0.90792
[100]	validation_0-auc:0.91802
[200]	validation_0-auc:0.91980
[300]	validation_0-auc:0.92081
[400]	validation_0-auc:0.92147
[500]	validation_0-auc:0.92214
[600]	validation_0-auc:0.92251
[700]	validation_0-auc:0.92269
[800]	validation_0-auc:0.92280
[900]	validation_0-auc:0.92274
[1000]	validation_0-auc:0.92270
[1100]	validation_0-auc:0.92264
[1200]	validation_0-auc:0.92259
[1300]	validation_0-auc:0.92248
[1400]	validation_0-auc:0.92240
[1500]	validation_0-auc:0.92222
[1600]	validation_0-auc:0.92207
[1700]	validation_0-auc:0.92197
[1800]	validation_0-auc:0.92187
[1900]	validation_0-auc:0.92176
[2000]	validation_0-auc:0.92161
[2100]	validation_0-auc:0.92149
[2200]	validation_0-auc:0.92136
[2300]	validation_0-auc:0.92114
[2400]	validation_0-auc:0.92104
[2500]	validation_0-auc:0.92090
[2600]	validation_0-auc:0.92073
[2700]	validation_0-auc:0.92055
[2800]	validation_0-auc:0.92043
[2900]	validation_0-auc:0.92022
[3000]	validation_0-auc:0.9

* Average CV ROC AUC: 0.92155
* OOF ROC AUC: 0.92155

## 4. Submission Generation

In [14]:
submission = pd.DataFrame({
    'id': test_df['id'],
    'loan_paid_back': test_preds
})

submission.to_csv('submission_xgboost_1.csv', index=False)
print("Submission saved to submission.csv")
submission.head()

Submission saved to submission.csv


Unnamed: 0,id,loan_paid_back
0,593994,0.787539
1,593995,0.920654
2,593996,0.206161
3,593997,0.71964
4,593998,0.90303
