# **Ensemble powered Loan Predictions**

In [1]:
!pip install nbformat



In [2]:
# Main libraries for data manipulation
import pandas as pd
import numpy as np

# Visualization libraries
import plotly.express as px
import plotly.graph_objects as go
from tqdm import tqdm

# Machine learning libraries
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score

from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold

# preprocessing and encoding
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

print("Imports complete")

Imports complete


In [3]:
lpdf_train = pd.read_csv('/kaggle/input/playground-series-s5e11/train.csv')
lpdf_test  = pd.read_csv('/kaggle/input/playground-series-s5e11/test.csv')
lpdf_sam = pd.read_csv('/kaggle/input/playground-series-s5e11/sample_submission.csv')

In [4]:
lpdf_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 593994 entries, 0 to 593993
Data columns (total 13 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    593994 non-null  int64  
 1   annual_income         593994 non-null  float64
 2   debt_to_income_ratio  593994 non-null  float64
 3   credit_score          593994 non-null  int64  
 4   loan_amount           593994 non-null  float64
 5   interest_rate         593994 non-null  float64
 6   gender                593994 non-null  object 
 7   marital_status        593994 non-null  object 
 8   education_level       593994 non-null  object 
 9   employment_status     593994 non-null  object 
 10  loan_purpose          593994 non-null  object 
 11  grade_subgrade        593994 non-null  object 
 12  loan_paid_back        593994 non-null  float64
dtypes: float64(5), int64(2), object(6)
memory usage: 58.9+ MB


In [5]:
lpdf_train.describe()

Unnamed: 0,id,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,loan_paid_back
count,593994.0,593994.0,593994.0,593994.0,593994.0,593994.0,593994.0
mean,296996.5,48212.202976,0.120696,680.916009,15020.297629,12.356345,0.79882
std,171471.442235,26711.942078,0.068573,55.424956,6926.530568,2.008959,0.400883
min,0.0,6002.43,0.011,395.0,500.09,3.2,0.0
25%,148498.25,27934.4,0.072,646.0,10279.62,10.99,1.0
50%,296996.5,46557.68,0.096,682.0,15000.22,12.37,1.0
75%,445494.75,60981.32,0.156,719.0,18858.58,13.68,1.0
max,593993.0,393381.74,0.627,849.0,48959.95,20.99,1.0


In [6]:
lpdf_train.drop(columns=['id'], inplace=True)
lpdf_test.drop(columns=['id'], inplace=True)

In [7]:
lpdf_train.head()

Unnamed: 0,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade,loan_paid_back
0,29367.99,0.084,736,2528.42,13.67,Female,Single,High School,Self-employed,Other,C3,1.0
1,22108.02,0.166,636,4593.1,12.92,Male,Married,Master's,Employed,Debt consolidation,D3,0.0
2,49566.2,0.097,694,17005.15,9.76,Male,Single,High School,Employed,Debt consolidation,C5,1.0
3,46858.25,0.065,533,4682.48,16.1,Female,Single,High School,Employed,Debt consolidation,F1,1.0
4,25496.7,0.053,665,12184.43,10.21,Male,Married,High School,Employed,Other,D1,1.0


In [8]:
lpdf_test.head()

Unnamed: 0,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade
0,28781.05,0.049,626,11461.42,14.73,Female,Single,High School,Employed,Other,D5
1,46626.39,0.093,732,15492.25,12.85,Female,Married,Master's,Employed,Other,C1
2,54954.89,0.367,611,3796.41,13.29,Male,Single,Bachelor's,Employed,Debt consolidation,D1
3,25644.63,0.11,671,6574.3,9.57,Female,Single,Bachelor's,Employed,Debt consolidation,C3
4,25169.64,0.081,688,17696.89,12.8,Female,Married,PhD,Employed,Business,C1


In [9]:
lpdf_train.isnull().sum()

annual_income           0
debt_to_income_ratio    0
credit_score            0
loan_amount             0
interest_rate           0
gender                  0
marital_status          0
education_level         0
employment_status       0
loan_purpose            0
grade_subgrade          0
loan_paid_back          0
dtype: int64

In [10]:
lpdf_test.isnull().sum()

annual_income           0
debt_to_income_ratio    0
credit_score            0
loan_amount             0
interest_rate           0
gender                  0
marital_status          0
education_level         0
employment_status       0
loan_purpose            0
grade_subgrade          0
dtype: int64

In [11]:
lpdf_train.education_level.unique()

array(['High School', "Master's", "Bachelor's", 'PhD', 'Other'],
      dtype=object)

## **PreProcessing** 

In [12]:
def prepare_data(train_df, test_df, target, ranked_mappings=None, onehot_cols=None):
    """
    Preprocess train and test DataFrames:
      - Rank-encode ordered columns using provided mappings
      - One-hot encode nominal columns
      - Leave numeric columns as-is
    Returns: X_train, y_train, X_test, y_test (or None), preprocessor
    """

    onehot_cols = onehot_cols or []
    ranked_mappings = ranked_mappings or {}

    # Split train features and target
    X_train = train_df.drop(columns=[target])
    y_train = train_df[target]

    # Test features (check if target exists)
    if target in test_df.columns:
        X_test = test_df.drop(columns=[target])
        y_test = test_df[target]
    else:
        X_test = test_df.copy()
        y_test = None

    # Identify ranked columns explicitly
    ranked_cols = list(ranked_mappings.keys())

    # Identify numeric columns (excluding ranked + onehot)
    numeric_cols = X_train.select_dtypes(include=np.number).columns.tolist()
    numeric_cols = [c for c in numeric_cols if c not in ranked_cols + onehot_cols]

    # Build the column transformer
    transformers = []

    for col, order in ranked_mappings.items():
        transformers.append(
            (f"ranked_{col}", OrdinalEncoder(categories=[order]), [col])
        )

    if onehot_cols:
        transformers.append(
            ("onehot", OneHotEncoder(drop="first", sparse_output=False), onehot_cols)
        )

    if numeric_cols:
        transformers.append(("numeric", "passthrough", numeric_cols))

    preprocessor = ColumnTransformer(transformers=transformers)

    # Fit on train, transform both
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    # Build feature names
    feature_names = []
    for col, _ in ranked_mappings.items():
        feature_names.append(col)

    if onehot_cols:
        onehot_names = preprocessor.named_transformers_["onehot"].get_feature_names_out(onehot_cols).tolist()
        feature_names += onehot_names

    if numeric_cols:
        feature_names += numeric_cols

    X_train_processed = pd.DataFrame(X_train_processed, columns=feature_names)
    X_test_processed = pd.DataFrame(X_test_processed, columns=feature_names)

    return X_train_processed, y_train, X_test_processed, y_test, preprocessor


ranked_mappings = {
    "education_level": ["Other", "High School", "Bachelor's", "Master's", "PhD"],
    "grade_subgrade": [
        "A1", "A2", "A3", "A4", "A5",
        "B1", "B2", "B3", "B4", "B5",
        "C1", "C2", "C3", "C4", "C5",
        "D1", "D2", "D3", "D4", "D5",
        "E1", "E2", "E3", "E4", "E5",
        "F1", "F2", "F3", "F4", "F5"
    ]
}

X_train, y_train, X_test, y_test, preprocessor = prepare_data(
    train_df=lpdf_train,
    test_df=lpdf_test,
    target="loan_paid_back",
    ranked_mappings=ranked_mappings,
    onehot_cols=["gender", "loan_purpose", "employment_status"]
)

print("Encoding Complete")

Encoding Complete


In [13]:
models = {
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42),
    "LightGBM": LGBMClassifier(random_state=42, device='gpu'),
    "CatBoost": CatBoostClassifier(verbose=0, random_state=42, task_type='GPU'),
}

# Scoring

In [14]:
model_scores = []

def add_model_report(model_name, y_true, y_pred, y_proba=None, storage=model_scores):
    """
    Takes a model's predictions and optionally probabilities, computes key metrics,
    and appends them to the storage list.
    """
    report = {
        "Model": model_name,
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred),
        "Recall": recall_score(y_true, y_pred),
        "F1 Score": f1_score(y_true, y_pred)
    }
    
    if y_proba is not None:
        report["ROC AUC"] = roc_auc_score(y_true, y_proba)
    else:
        report["ROC AUC"] = None

    storage.append(report)

print("Scores Created")

Scores Created


## **Model Testing** 

In [15]:
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Store predictions for the real test set
test_predictions = {name: np.zeros(len(X_test)) for name in models.keys()}

# Loop over models
for name, model_instance in tqdm(models.items(), desc="Training models"):
    scale = name in ["Logistic Regression", "SVM"]
    
    # Arrays to collect out-of-fold (OOF) predictions and true labels
    oof_preds = np.zeros(len(X_train))
    oof_true = np.zeros(len(X_train))
    
    # Stratified K-Fold CV
    for train_idx, val_idx in skf.split(X_train, y_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
        
        # Build pipeline
        pipe = Pipeline([
            ("scaler", StandardScaler()) if scale else ("noop", "passthrough"),
            ("model", model_instance)
        ])
        
        # Fit on training fold
        pipe.fit(X_tr, y_tr)
        
        # Predict on validation fold
        if hasattr(pipe, "predict_proba"):
            try:
                y_val_pred = pipe.predict_proba(X_val)[:, 1]
            except:
                y_val_pred = pipe.predict(X_val)
        else:
            y_val_pred = pipe.predict(X_val)
        
        # Save OOF predictions for metrics
        oof_preds[val_idx] = y_val_pred
        oof_true[val_idx] = y_val
        
        # Predict on real test set and accumulate
        if hasattr(pipe, "predict_proba"):
            try:
                y_test_pred = pipe.predict_proba(X_test)[:, 1]
            except:
                y_test_pred = pipe.predict(X_test)
        else:
            y_test_pred = pipe.predict(X_test)
        
        test_predictions[name] += y_test_pred / n_splits  # average across folds
    
    # Compute metrics using OOF predictions
    # If classifier outputs probabilities, threshold at 0.5
    if oof_preds.ndim == 1 or oof_preds.shape[1] == 1:
        y_oof_labels = (oof_preds >= 0.5).astype(int)
    else:
        y_oof_labels = oof_preds  # already labels
    
    add_model_report(name, oof_true, y_oof_labels, y_proba=oof_preds)


Training models:  33%|███▎      | 1/3 [00:12<00:24, 12.46s/it]

[LightGBM] [Info] Number of positive: 379595, number of negative: 95600
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1335
[LightGBM] [Info] Number of data points in the train set: 475195, number of used features: 20
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...




[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 9 dense feature groups (5.44 MB) transferred to GPU in 0.006909 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.798819 -> initscore=1.378932
[LightGBM] [Info] Start training from score 1.378932
[LightGBM] [Info] Number of positive: 379595, number of negative: 95600
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1335
[LightGBM] [Info] Number of data points in the train set: 475195, number of used features: 20
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 9 dense feature groups (5.44 MB) transferred to GPU in 0.006336 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.798819 -> initscore=1.3789

Training models: 100%|██████████| 3/3 [01:22<00:00, 27.36s/it]


In [16]:
from sklearn.ensemble import VotingClassifier

ensemble_models = {
    "CatBoost": CatBoostClassifier(verbose=0, random_state=42, task_type='GPU'),
    "LightGBM": LGBMClassifier(random_state=42, device='gpu')
}

ensemble = VotingClassifier(estimators=list(ensemble_models.items()), voting='soft')
ensemble.fit(X_train, y_train)

# Predict on test set
ensemble_pred_proba = ensemble.predict_proba(X_test)[:, 1]
ensemble_pred_class = ensemble.predict(X_test)

# Add to scores
add_model_report("Ensemble (CatBoost + LightGBM)", y_train, (ensemble.predict_proba(X_train)[:, 1] >= 0.5).astype(int), y_proba=ensemble.predict_proba(X_train)[:, 1])

print("Ensemble model trained and evaluated.")

[LightGBM] [Info] Number of positive: 474494, number of negative: 119500
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1335
[LightGBM] [Info] Number of data points in the train set: 593994, number of used features: 20
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 9 dense feature groups (6.80 MB) transferred to GPU in 0.013088 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.798820 -> initscore=1.378933
[LightGBM] [Info] Start training from score 1.378933
Ensemble model trained and evaluated.


In [17]:
df_scores = pd.DataFrame(model_scores).sort_values(by='F1 Score', ascending=False)
display(df_scores)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,ROC AUC
3,Ensemble (CatBoost + LightGBM),0.905412,0.906352,0.983176,0.943202,0.920645
1,LightGBM,0.904863,0.907177,0.981311,0.942789,0.919599
0,XGBoost,0.904576,0.908388,0.979308,0.942516,0.920577
2,CatBoost,0.903225,0.904976,0.98196,0.941897,0.917406


In [18]:
# Melt the DataFrame to long format for Plotly
df_long = df_scores.melt(id_vars="Model", var_name="Metric", value_name="Score")

# Create grouped bar chart
fig = px.bar(
    df_long,
    x="Model",
    y="Score",
    color="Metric",
    barmode="group",
    text="Score",
    title="Model Performance Comparison",
    height=650
)

fig.update_traces(texttemplate='%{text:.3f}', textposition='outside')
fig.update_layout(
    yaxis=dict(range=[0, 1]),
    font=dict(
        family="Arial, sans-serif",
        size=14,                   
        color="RebeccaPurple"      
    ),
    plot_bgcolor='lightgray',     
    paper_bgcolor='lightblue',     
    margin=dict(l=50, r=50, t=50, b=50),
    hovermode="x unified",   
    legend=dict(
        orientation="h",     
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    )
)
fig.show()

In [19]:
import joblib

# Save
joblib.dump(ensemble, "ensemble.joblib")

['ensemble.joblib']

# Create submission format

In [20]:
finalModel = joblib.load('ensemble.joblib')

In [21]:
y_class = finalModel.predict(X_test)
y_proba = finalModel.predict_proba(X_test)[:, 1]


# Should have copied the data before mutilating, so we reimport
lpdf_test = pd.read_csv('/kaggle/input/playground-series-s5e11/test.csv')

submission_df = pd.DataFrame({
    "id": lpdf_test["id"],
    "loan_paid_back": y_class
})
submission_df_proba = pd.DataFrame({
    "id": lpdf_test["id"],
    "loan_paid_back": y_proba
})


submission_df_proba.to_csv('export_test_predict.csv')