## Predicting Loan Payback: extended EDA and Optuna tuned XGBoost ML model

**Dataset Description**

The dataset for this competition (both train and test) was generated from a deep learning model trained on the Loan Prediction dataset. Feature distributions are close to, but not exactly the same, as the original.

**Files**

- `train.csv` - the training dataset; `loan_paid_back` is the binary target ground truth
  
- `test.csv` - the test dataset; your objective is to predict a probability for the `loan_paid_back` for each row

- `sample_submission.csv` - a sample submission file in the correct format

**Evaluation**

Submissions are evaluated on area under the ROC curve between the predicted probability and the observed target.

### Import libraries

In [None]:
import os
import math
import shap
import optuna
import scipy

import numpy as np
import pandas as pd

from itertools import combinations
from sklearn.linear_model import LinearRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.preprocessing import OrdinalEncoder
from sklearn.utils.class_weight import compute_class_weight
from scipy.stats import chi2_contingency

from xgboost import XGBClassifier

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

In [None]:
!pip install --upgrade seaborn

import seaborn as sns
sns.set(color_codes=True)

### Supplementary functions

#### visualizations

In [None]:
def num_var_distribution_float(df,
                               title: str,
                               x1: str,
                               y1: str,
                               x1_label: str,
                               y1_label: str,
                               x2_label: str,
                               y2_label: str):
    """Compare the 'train' and 'test' distributions of one numerical column.

    Draws a two-panel figure: on the left, one box plot per data type with the
    (rounded) median printed just above it; on the right, the kernel density
    estimate of each data type.

    Parameters
    ----------
    df : DataFrame holding both the grouping column and the numerical column.
    title : figure title.
    x1 : name of the grouping column; its values 'train' and 'test' are plotted.
    y1 : name of the numerical column.
    x1_label, y1_label : axis labels of the box-plot panel.
    x2_label, y2_label : axis labels of the density panel.
    """
    figure, axes = plt.subplots(nrows = 1, ncols = 2, figsize = (16, 6))
    figure.suptitle(title,
                    x = 0.5, y = 0.95, fontsize = 16, fontweight ='bold')

    dir_order = ['train', 'test']
    my_pal = {'train': 'orange', 'test': 'royalblue'}

    # Figure 1: box-plot
    box_plot = sns.boxplot(data = df,
                           x = x1, y = y1,
                           order = dir_order,
                           palette = my_pal,
                           ax = axes[0])
    axes[0].set_xlabel(x1_label, fontsize = 14, fontweight ='bold')
    axes[0].set_ylabel(y1_label, fontsize = 14, fontweight ='bold')
    axes[0].set_xticklabels(labels = dir_order, rotation = 0, ha = 'center', size = 12)

    # Annotate each box with its median. The x-position comes from the box
    # order itself, not from the (alphabetical) groupby order, so a label can
    # never end up above the wrong box.
    medians = df.groupby(x1)[y1].median().round(2)
    for xtick, group in enumerate(dir_order):
        if group not in medians.index or pd.isna(medians[group]):
            continue
        median = float(medians[group])
        box_plot.text(xtick, median + median * 0.025, median,
                      horizontalalignment = 'center',
                      size = 10,
                      color = 'black',
                      weight = 'semibold')

    # Figure 2: kernel density estimates (kdeplot replaces the deprecated distplot)
    for group in dir_order:
        sns.kdeplot(x = df.loc[df[x1] == group, y1],
                    color = my_pal[group],
                    linewidth = 2.0,
                    linestyle = '--',
                    label = group,
                    ax = axes[1])

    axes[1].set_xlabel(x2_label, fontsize = 14, fontweight ='bold')
    axes[1].set_ylabel(y2_label, fontsize = 14, fontweight ='bold')
    # Without a legend the two dashed curves cannot be told apart.
    axes[1].legend()

    plt.show()

In [None]:
def corr_plot(df_1, df_2, title):
    """Show two correlation matrices side by side as annotated heatmaps.

    df_1 is drawn on the left panel (titled "train") and df_2 on the right
    panel (titled "test"); both share a fixed [-1, 1] colour scale centred
    at 0. The right panel hides its y tick labels.
    """
    common_kws = dict(annot = True,
                      vmin = -1,
                      vmax = 1,
                      center = 0,
                      cmap = 'coolwarm',
                      linewidths = 3,
                      linecolor = 'black')

    figure, ax = plt.subplots(nrows = 1, ncols = 2, figsize = (16, 6))
    figure.suptitle(title,
                    x = 0.5, y = 0.95, fontsize = 18, fontweight ='bold')

    # (matrix, panel title, panel-specific heatmap options)
    panels = (
        (df_1, "train", {}),
        (df_2, "test", {'xticklabels': True, 'yticklabels': False}),
    )
    for panel_ax, (matrix, caption, extra_kws) in zip(ax, panels):
        sns.heatmap(matrix, ax = panel_ax, **common_kws, **extra_kws)
        panel_ax.set_title(caption, fontsize = 16)

    plt.show()

#### credit score additional features

In [None]:
def map_fico_tier(score):
    """Return the FICO tier label for a credit score.

    Tiers are defined by their inclusive lower bound; any score below the
    lowest bound (580) is labelled 'Poor'.
    """
    # (inclusive lower bound, label), checked from the highest tier down.
    tiers = (
        (800, 'Exceptional'),
        (740, 'Very Good'),
        (670, 'Good'),
        (580, 'Fair'),
    )
    for lower_bound, label in tiers:
        if score >= lower_bound:
            return label
    return 'Poor'

def map_vantage_tier(score):
    """Return the VantageScore tier label for a credit score.

    Tiers are defined by their inclusive lower bound; any score below the
    lowest bound (500) is labelled 'Very Poor'.
    """
    # (inclusive lower bound, label), checked from the highest tier down.
    tiers = (
        (781, 'Excellent'),
        (661, 'Good'),
        (601, 'Fair'),
        (500, 'Poor'),
    )
    for lower_bound, label in tiers:
        if score >= lower_bound:
            return label
    return 'Very Poor'

### Data loading

In [None]:
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Competition data: labelled training set and unlabelled test set.
df_train = pd.read_csv('/kaggle/input/playground-series-s5e11/train.csv')
df_test = pd.read_csv('/kaggle/input/playground-series-s5e11/test.csv')
# NOTE(review): df_orig is loaded here but not referenced by any later cell
# visible in this notebook - confirm whether it is still needed.
df_orig = pd.read_csv('/kaggle/input/loan-prediction-dataset-2025/loan_dataset_20000.csv')

### Data concatenating

In [None]:
# Stack the train predictors (target removed) and the test predictors into one
# frame, tagging every row with its origin in the 'data_type' column.
# `assign` returns a new frame, so df_train and df_test are left untouched.
df_1 = df_train.drop(columns = ['loan_paid_back']).assign(data_type = 'train')
df_2 = df_test.assign(data_type = 'test')

df = pd.concat([df_1, df_2], ignore_index = True)

### A first glance at the data

In [None]:
df_train.info()

In [None]:
df_train.head(3)

In [None]:
# Rows are compared without the 'id' column; only the first occurrence of
# each repeated row is kept.
df_train_no_id = (df_train
                  .drop(columns = ['id'])
                  .drop_duplicates(keep = 'first', ignore_index = True))

print('Number of duplicates in the df_train: ', len(df_train) - len(df_train_no_id))

In [None]:
df_test.info()

In [None]:
df_test.head(3)

**Conclusions**:

- The data set includes 11 predictors: 5 numerical and 6 categorical types;
  - *numerical*: `annual_income`, `debt_to_income_ratio`, `credit_score`, `loan_amount`, `interest_rate`;
  - *categorical*: `gender`, `marital_status`, `education_level`, `employment_status`, `loan_purpose`, `grade_subgrade`.
- The training data includes almost 600 000 samples which is enough for further modelling purposes.
- There are no missing values in either the training or the testing data set.
- There are no duplicates in the training data set.

### Target variable: `loan_paid_back`

In [None]:
# Bar chart of the target classes in the training data, as percentages.
figure, axes = plt.subplots(nrows = 1, ncols = 1, figsize = (6, 4))

count_plot = sns.countplot(data = df_train, x = "loan_paid_back", stat = "percent", ax = axes)
# Write the percentage on top of every bar.
axes.bar_label(axes.containers[0], fontsize = 10)

count_plot.set_title('Loan Paid Back Distribution', fontsize = 12, fontweight = 'bold')
count_plot.set_xlabel('Loan Paid Back', fontsize = 10, fontweight ='bold')
count_plot.set_ylabel('Percent', fontsize = 10, fontweight ='bold')

plt.show()

In [None]:
df_train['loan_paid_back'].value_counts()

**Conclusions:**

- The training data set is weakly imbalanced: 80% of the samples belong to class 1 and 20% to class 0.
- Positive samples dominate over the negative ones. 

### Numerical independent variables: distributions and statistics

In [None]:
num_features = ['annual_income', 'debt_to_income_ratio', 'credit_score', 'loan_amount', 'interest_rate']

In [None]:
df_train[num_features].describe()

In [None]:
df_test[num_features].describe()

In [None]:
num_var_distribution_float(df = df,
                           title = 'Annual Income Distributions by type of data',
                           x1 = 'data_type',
                           y1 = 'annual_income',
                           x1_label = 'Data Type',
                           y1_label = 'Annual Income',
                           x2_label = 'Annual Income',
                           y2_label = 'Density')

In [None]:
num_var_distribution_float(df = df,
                           title = 'Debt to Income Ratio Distributions by type of data',
                           x1 = 'data_type',
                           y1 = 'debt_to_income_ratio',
                           x1_label = 'Data Type',
                           y1_label = 'Debt to Income ratio',
                           x2_label = 'Debt to Income Ratio',
                           y2_label = 'Density')

In [None]:
num_var_distribution_float(df = df,
                           title = 'Credit Score Distributions by type of data',
                           x1 = 'data_type',
                           y1 = 'credit_score',
                           x1_label = 'Data Type',
                           y1_label = 'Credit Score',
                           x2_label = 'Credit Score',
                           y2_label = 'Density')

In [None]:
num_var_distribution_float(df = df,
                           title = 'Loan Amount Distributions by type of data',
                           x1 = 'data_type',
                           y1 = 'loan_amount',
                           x1_label = 'Data Type',
                           y1_label = 'Loan Amount',
                           x2_label = 'Loan Amount',
                           y2_label = 'Density')

In [None]:
num_var_distribution_float(df = df,
                           title = 'Interest Rate Distributions by type of data',
                           x1 = 'data_type',
                           y1 = 'interest_rate',
                           x1_label = 'Data Type',
                           y1_label = 'Interest Rate',
                           x2_label = 'Interest Rate',
                           y2_label = 'Density')

In [None]:
corr_plot(df_1 = df_train[num_features].corr(method = 'spearman'), 
          df_2 = df_test[num_features].corr(method = 'spearman'), 
          title = "Spearman's rank correlation")

**Conclusions**:

- Distributions of numerical variables in both training and testing data sets are almost identical.
- There are no significant correlations between the numerical variables, except for a moderate inverse correlation between `credit_score` and `interest_rate`.

### Categorical variables: distributions and statistics

In [None]:
categorical_features = ['gender', 'marital_status', 'education_level', 'employment_status', 'loan_purpose', 'grade_subgrade']

In [None]:
df_train[categorical_features].describe()

In [None]:
df_test[categorical_features].describe()

In [None]:
def _cramers_v_matrix(data, features):
    """Return the symmetric matrix of pairwise Cramér's V between `features`.

    V = sqrt(chi2 / (n * (min(rows, cols) - 1))), computed from the contingency
    table of each pair of columns in `data`. The statistic is symmetric and
    equals 1 for a variable against itself, so only the upper triangle is
    computed and then mirrored. The Yates continuity correction is switched off
    because Cramér's V is defined on the uncorrected chi-squared statistic
    (the correction would only affect 2x2 tables).
    """
    matrix = pd.DataFrame(np.eye(len(features), dtype = np.float32),
                          index = features,
                          columns = features)
    for var_i, var_j in combinations(features, 2):
        contingency = pd.crosstab(data[var_i], data[var_j])
        chi2 = chi2_contingency(contingency, correction = False)[0]
        n_obs = contingency.values.sum()
        min_dim = min(contingency.shape) - 1
        cramers_v = math.sqrt(chi2 / (n_obs * min_dim))
        matrix.loc[var_i, var_j] = cramers_v
        matrix.loc[var_j, var_i] = cramers_v
    return matrix


df_cramers_train = _cramers_v_matrix(df_train, categorical_features)
df_cramers_test = _cramers_v_matrix(df_test, categorical_features)

In [None]:
corr_plot(df_1 = df_cramers_train,
          df_2 = df_cramers_test,
          title = "Cramers' V correlation coefficients")

**Conclusions**:

- All categorical features (except `grade_subgrade`) have low cardinality: from 3 to 8.
- Categorical features are similarly distributed in training and testing data sets.
- There are no notable associations between different categorical features.

### Data Preprocessing and Feature Engineering

#### new feature: `loan_amount` / `annual_income` 

In [None]:
# Add the loan size relative to the (raw, not yet log-transformed) annual
# income to both frames.
for frame in (df_train, df_test):
    frame['loan_to_income_ratio'] = frame['loan_amount'].div(frame['annual_income'])

#### new features: `grade_subgrade` splitting 

In [None]:
# Split 'grade_subgrade' into its first character ('grade') and its second
# character ('subgrade') using the vectorized string accessor instead of a
# per-row Python lambda.
# NOTE(review): only a single character is kept for the subgrade - confirm
# that every value has exactly two characters.
for data in [df_train, df_test]:
    data['grade'] = data['grade_subgrade'].str[0]
    data['subgrade'] = data['grade_subgrade'].str[1]

#### new features: based on FICO and VantageScore ranges

In [None]:
# Derive two categorical views of the credit score: its FICO tier and its
# VantageScore tier.
tier_mappers = {'credit_score_FICO_tier': map_fico_tier,
                'credit_score_Vantage_tier': map_vantage_tier}
for frame in (df_train, df_test):
    for tier_column, tier_mapper in tier_mappers.items():
        frame[tier_column] = frame['credit_score'].map(tier_mapper)

#### ordinal encoding of the categorical features

In [None]:
ord_encoded_features = categorical_features + ['grade', 'subgrade'] + ['credit_score_FICO_tier', 'credit_score_Vantage_tier']

# The encoder is fitted on the training data only. A category that appears
# only in the test data is encoded as -1 instead of raising, which matches the
# later target/frequency encoding where unseen keys are filled with defaults.
# Codes follow the sorted order of the category values (scikit-learn default),
# so for the tier columns they do not follow the credit-quality ranking.
enc = OrdinalEncoder(handle_unknown = 'use_encoded_value', unknown_value = -1)
enc.fit(df_train[ord_encoded_features])

df_train[ord_encoded_features] = enc.transform(df_train[ord_encoded_features])
df_test[ord_encoded_features] = enc.transform(df_test[ord_encoded_features])

In [None]:
# Store the encoded category codes as 32-bit integers in both frames.
for frame in (df_train, df_test):
    for col in ord_encoded_features:
        frame[col] = frame[col].astype('int32')

#### log-transformation: `annual_income`

In [None]:
# Replace 'annual_income' by its natural logarithm in both frames, in place.
# 'loan_to_income_ratio' was derived from the raw income in an earlier cell, so
# this cell must run after it. Re-running this cell applies the logarithm again.
for data in [df_train, df_test]:
    data['annual_income'] = np.log(data['annual_income'].values)

#### target

In [None]:
df_train['loan_paid_back'] = df_train['loan_paid_back'].astype('int')

#### set of predictors

In [None]:
# Base predictor set: the original numerical and categorical columns.
predictors = num_features + categorical_features
# Extended set: base predictors plus every engineered column.
predictors_ext = predictors + ['grade', 'subgrade'] + ['credit_score_FICO_tier', 'credit_score_Vantage_tier'] + ['loan_to_income_ratio']

# Name of the binary target column.
target = 'loan_paid_back'

### Vanilla XGBoost model

In [None]:
# 90/10 hold-out split on the base predictors with a fixed seed.
# NOTE(review): unlike the splits in the tuning and prediction cells, this one
# is not stratified on the target - confirm this is intentional.
X_train, X_val, y_train, y_val = train_test_split(df_train[predictors],
                                                  df_train[target],
                                                  train_size = 0.9,
                                                  random_state = 42)

In [None]:
# Baseline model with default hyper-parameters. Early stopping is configured
# on the estimator itself: passing `early_stopping_rounds` to `fit()` has been
# deprecated since XGBoost 1.6 and is rejected by recent releases.
alg = XGBClassifier(n_estimators = 1000,
                    objective = 'binary:logistic',
                    eval_metric = 'auc',
                    early_stopping_rounds = 20)

# Training stops once the validation AUC has not improved for 20 rounds;
# progress is printed every 25 rounds.
alg.fit(X_train[predictors], y_train,
        eval_set = [(X_val[predictors], y_val)],
        verbose = 25)

In [None]:
print('Best iteration:', alg.best_iteration)
print('----------')
print('Best AUROC:', alg.best_score)

#### Feature Importance

In [None]:
shap.initjs()

In [None]:
explainer = shap.TreeExplainer(alg)
shap_values = explainer.shap_values(X_val, y_val)

In [None]:
shap.summary_plot(shap_values, X_val, plot_type = "bar")

Thus, `employment_status`, `debt_to_income_ratio`, `credit_score`, and `grade_subgrade` are the most important variables. Among the least important are `gender` and `marital_status`.

### Optuna: hyper-parameter optimisation

In [None]:
def objective(trial):
    """Optuna objective: out-of-fold AUROC of an XGBoost classifier.

    For the hyper-parameters suggested by `trial`, runs an 8-fold stratified
    cross-validation over `df_train`. Within every fold:

    1. 5% of the fold's training part is held out (stratified) as the
       early-stopping validation set;
    2. smoothed target encodings (`<col>_mean`) and frequency encodings
       (`<col>_freq`) are computed on the remaining training rows only and
       joined onto the training, validation and out-of-fold rows; keys unseen
       in training get the global target mean and a frequency of 0;
    3. the model is fitted with early stopping on the validation AUC and
       predicts the out-of-fold rows.

    Returns the AUROC of the pooled out-of-fold predictions.
    """
    params = {
        "grow_policy": trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-3, 100.0, log = True),
        "max_depth": trial.suggest_int("max_depth", 3, 6),
        "subsample": trial.suggest_float("subsample", 0.25, 1.0, step = 0.01),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.25, 1.0, step = 0.01),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 100),
        "max_leaves": trial.suggest_int("max_leaves", 10, 50)}

    # Fold-independent definitions, built once.
    te_columns = ord_encoded_features  + ['debt_to_income_ratio', 'credit_score']
    predictors_new = num_features + ['loan_to_income_ratio'] + ord_encoded_features +\
                     [pred + '_mean' for pred in te_columns] + [pred + '_freq' for pred in te_columns]

    kf = StratifiedKFold(n_splits = 8, shuffle = True, random_state = 42)

    oof_pred = np.zeros(len(df_train))
    for train_index, test_index in kf.split(X = df_train[predictors_ext], y = df_train[target]):
        X_train = df_train.loc[train_index, :].copy()
        X_test = df_train.loc[test_index, :].copy()

        ## train / validation split
        X_train, X_val, y_train, y_val = train_test_split(X_train[predictors_ext],
                                                          X_train[target],
                                                          stratify = X_train[target],
                                                          train_size = 0.95,
                                                          random_state = 42)
        X_train[target] = y_train

        # The merges below add columns but never add, drop or reorder rows,
        # so these statistics are the same for every encoded column.
        global_mean = X_train[target].mean()
        global_length = len(X_train)

        # target and frequency encodings of the categorical variables
        for predictor in te_columns:
            group_stats = X_train.groupby(by = predictor)[target].agg(['count', 'mean'])
            df_pred = pd.DataFrame({
                # share of training rows carrying this key
                predictor + '_freq': group_stats['count'] / global_length,
                # group mean shrunk towards the global mean (prior weight 10)
                predictor + '_mean': (group_stats['count'] * group_stats['mean'] + 10 * global_mean)
                                     / (10 + group_stats['count']),
            }).reset_index()

            # Left merges keep the row order, so the frames stay positionally
            # aligned with y_train, y_val and test_index.
            X_train = X_train.merge(df_pred, on = predictor, how = 'left')
            X_val = X_val.merge(df_pred, on = predictor, how = 'left')
            X_test = X_test.merge(df_pred, on = predictor, how = 'left')

            X_val[predictor + '_mean'] = X_val[predictor + '_mean'].fillna(global_mean)
            X_val[predictor + '_freq'] = X_val[predictor + '_freq'].fillna(0)

            X_test[predictor + '_mean'] = X_test[predictor + '_mean'].fillna(global_mean)
            X_test[predictor + '_freq'] = X_test[predictor + '_freq'].fillna(0)

        eval_set = (X_val[predictors_new], y_val)

        ## XGBoost
        # n_estimators is only an upper bound: early stopping ends training.
        # `early_stopping_rounds` is set on the estimator because passing it to
        # `fit()` is deprecated since XGBoost 1.6 and rejected by recent releases.
        alg = XGBClassifier(**params,
                            learning_rate = 0.1,
                            n_estimators = 100000,
                            objective = 'binary:logistic',
                            eval_metric = 'auc',
                            early_stopping_rounds = 50)

        alg.fit(X_train[predictors_new], y_train,
                eval_set = [eval_set],
                verbose = 0)
        oof_pred[test_index] = alg.predict_proba(X_test[predictors_new])[:, 1]

    return roc_auc_score(df_train[target], oof_pred)

In [None]:
# Silence Optuna's per-trial log messages; only warnings and errors are shown.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Maximise the out-of-fold AUROC returned by `objective` over 100 trials,
# running up to 4 trials concurrently.
# NOTE(review): the XGBoost thread count is not set inside `objective`, so the
# 4 concurrent trials may compete for the same CPU cores - confirm.
study = optuna.create_study(direction = 'maximize', study_name = 'xgboost')
study.optimize(func = objective, 
               n_trials = 100,
               n_jobs = 4,
               gc_after_trial = False,
               show_progress_bar = True)

In [None]:
print('Best set of hyper-parameters:', study.best_params)
print('---------')
print('Best AUROC:', study.best_value)

### Loan Payback Prediction

In [None]:
kf = StratifiedKFold(n_splits = 8, shuffle = True, random_state = 42)
a = kf.split(X = df_train[predictors_ext], y = df_train[target])

In [None]:
# Final model: the same 8-fold procedure as in the Optuna objective, using the
# best hyper-parameters found. Every fold model also predicts the test set;
# `test_pred` accumulates the SUM of the fold predictions.
oof_pred = np.zeros(len(df_train))
test_pred = np.zeros(len(df_test))

# Fold-independent definitions, built once.
te_columns = ord_encoded_features  + ['debt_to_income_ratio', 'credit_score']
predictors_new = num_features + ['loan_to_income_ratio'] + ord_encoded_features +\
                 [pred + '_mean' for pred in te_columns] + [pred + '_freq' for pred in te_columns]

# The folds are generated here from `kf` rather than read from a previously
# created generator: a generator is exhausted after one pass, so re-running
# this cell would otherwise skip the loop and leave all predictions at zero.
for train_index, test_index in kf.split(X = df_train[predictors_ext], y = df_train[target]):
    X_train = df_train.loc[train_index, :].copy()
    X_test = df_train.loc[test_index, :].copy()
    df_test_copy = df_test.copy()

    ## train / validation split
    X_train, X_val, y_train, y_val = train_test_split(X_train[predictors_ext],
                                                      X_train[target],
                                                      stratify = X_train[target],
                                                      train_size = 0.95,
                                                      random_state = 42)
    X_train[target] = y_train

    # The merges below add columns but never add, drop or reorder rows,
    # so these statistics are the same for every encoded column.
    global_mean = X_train[target].mean()
    global_length = len(X_train)

    # target and frequency encodings of the categorical variables
    for predictor in te_columns:
        group_stats = X_train.groupby(by = predictor)[target].agg(['count', 'mean'])
        df_pred = pd.DataFrame({
            # share of training rows carrying this key
            predictor + '_freq': group_stats['count'] / global_length,
            # group mean shrunk towards the global mean (prior weight 10)
            predictor + '_mean': (group_stats['count'] * group_stats['mean'] + 10 * global_mean)
                                 / (10 + group_stats['count']),
        }).reset_index()

        # Left merges keep the row order, so the frames stay positionally
        # aligned with y_train, y_val, test_index and the test ids.
        X_train = X_train.merge(df_pred, on = predictor, how = 'left')
        X_val = X_val.merge(df_pred, on = predictor, how = 'left')
        X_test = X_test.merge(df_pred, on = predictor, how = 'left')
        df_test_copy = df_test_copy.merge(df_pred, on = predictor, how = 'left')

        # Keys never seen in the training part: global mean / zero frequency.
        X_val[predictor + '_mean'] = X_val[predictor + '_mean'].fillna(global_mean)
        X_val[predictor + '_freq'] = X_val[predictor + '_freq'].fillna(0)

        X_test[predictor + '_mean'] = X_test[predictor + '_mean'].fillna(global_mean)
        X_test[predictor + '_freq'] = X_test[predictor + '_freq'].fillna(0)

        df_test_copy[predictor + '_mean'] = df_test_copy[predictor + '_mean'].fillna(global_mean)
        df_test_copy[predictor + '_freq'] = df_test_copy[predictor + '_freq'].fillna(0)

    eval_set = (X_val[predictors_new], y_val)

    ## XGBoost
    # n_estimators is only an upper bound: early stopping ends training.
    # `early_stopping_rounds` is set on the estimator because passing it to
    # `fit()` is deprecated since XGBoost 1.6 and rejected by recent releases.
    alg = XGBClassifier(**study.best_params,
                        learning_rate = 0.1,
                        n_estimators = 100000,
                        objective = 'binary:logistic',
                        eval_metric = 'auc',
                        early_stopping_rounds = 50)

    alg.fit(X_train[predictors_new], y_train,
            eval_set = [eval_set],
            verbose = 0)
    oof_pred[test_index] = alg.predict_proba(X_test[predictors_new])[:, 1]
    test_pred += alg.predict_proba(df_test_copy[predictors_new])[:, 1]

In [None]:
auc = roc_auc_score(df_train[target], oof_pred)
print("8-Fold CV AUROC: ", auc)

### Submission

In [None]:
# Turn the accumulated sum of fold predictions into their mean. The divisor is
# taken from the splitter so it cannot drift from the number of folds used.
# Run this cell once per prediction run: re-running it divides again.
test_pred = test_pred / kf.get_n_splits()

In [None]:
submission = pd.DataFrame({'id': df_test['id'], 'loan_paid_back': test_pred})
submission.to_csv('/kaggle/working/submission.csv', index = False)