In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found below the Kaggle input
# directory, then print one path per line.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Credit loan payback 

**Goal**: Predict the probability that a borrower will pay back their loan.

Author: [Ejtolf](https://www.kaggle.com/arraylist1402)

In [None]:
import warnings
import missingno as msno
import seaborn as sns
import matplotlib.pyplot as plt
# Default env settings
# Silence every warning for the rest of the session to keep cell output compact.
# NOTE(review): this also hides convergence/deprecation warnings raised by the
# models trained below — consider narrowing the filter.
warnings.filterwarnings('ignore')
# Default figure size applied to every matplotlib/seaborn figure in this notebook.
plt.rcParams['figure.figsize'] = (6, 5)
# Render figures inline in the cell output.
%matplotlib inline

In [None]:
# Load the competition's train and test splits.
train_df = pd.read_csv('/kaggle/input/playground-series-s5e11/train.csv')
test_df = pd.read_csv('/kaggle/input/playground-series-s5e11/test.csv')

# Preview the first rows of each split under its own banner.
for banner, frame in (('--- TRAIN DATA ---', train_df), ('--- TEST DATA ---', test_df)):
    print(banner)
    display(frame.head())

In [None]:
# Work on a copy so the raw training frame stays available unchanged.
df = train_df.copy()

# General info
print('-' * 15)
# DataFrame.info() prints its report itself and returns None; wrapping it in
# display() only added a stray "None" to the output.
df.info()
print('-' * 15)
print(f'Size: {df.shape[0]} rows, {df.shape[1]} features.')
print('-' * 15)

# Compute both quality checks once and reuse them for the test and the report.
missing_per_column = df.isna().sum()
n_duplicates = df.duplicated().sum()

if not missing_per_column.any() and not n_duplicates:  # No missing values & no duplicates
    print('No missing values')
    print('No duplicates.')
else:
    print('Missing values:')
    display(missing_per_column)
    # Draw the missingness matrix and show the figure explicitly instead of
    # displaying the repr of the returned axes object.
    msno.matrix(df)
    plt.show()
    print(f'Duplicates: {n_duplicates}')

In [None]:
# One client info: show the first row only, to see every column with a sample value.
df.head(1)

In [None]:
# List all column names (described in the "Features overview" section below).
df.columns

## Features overview

There are 593994 clients in train dataset with 13 features:

- **id** – Unique identifier for each record (loan or borrower).  
- **annual_income** – Borrower’s annual income, indicator of repayment ability.  
- **debt_to_income_ratio** – Ratio of total debt to annual income; higher values indicate higher credit risk.  
- **credit_score** – Creditworthiness score, typically from 300 to 850; higher is better.  
- **loan_amount** – The total amount of money borrowed.  
- **interest_rate** – Interest rate applied to the loan; often higher for riskier borrowers.  
- **gender** – Borrower’s gender (e.g., Male, Female, Other).  
- **marital_status** – Marital status of the borrower (Single, Married, Divorced, etc.).  
- **education_level** – Highest education level attained by the borrower (High School, Bachelor, Master, etc.).  
- **employment_status** – Employment condition (Employed, Unemployed, Self-Employed, Retired, etc.).  
- **loan_purpose** – Purpose of the loan (Debt Consolidation, Car, Education, Home Improvement, etc.).  
- **grade_subgrade** – Credit grade assigned by the lender (e.g., A1, B2), reflecting internal risk evaluation.  
- **loan_paid_back** – Target variable: 1 if the loan was repaid, 0 otherwise.  


# 1. Exploratory data analysis.

## 1.1. Preliminary data visualisation

In [None]:
# Name of the column to predict; defined first so every statement below can use it.
target = 'loan_paid_back'

# `id` is only a row identifier. errors='ignore' keeps this cell re-runnable:
# a second execution finds `id` already removed instead of raising KeyError.
df = df.drop(columns=['id'], errors='ignore')

# Split the columns by dtype.
# NOTE: the target is selected by the numeric dtypes, so it is part of
# `num_features`, and it is additionally appended as the LAST column of
# `cat_features`. Loops over either frame therefore see the target as well.
num_features = df.select_dtypes(include=['int64', 'float64'])
cat_features = pd.concat([df.select_dtypes(exclude=['int64', 'float64']), df[target]], axis=1)

print(f'Numeric features: {num_features.columns}\n')
print(f'Categorial features: {cat_features.columns}')

In [None]:
# Human-readable hue values, so the legend entries come from the data itself
# rather than from a positional `labels=[...]` override (which depends on the
# order in which seaborn happens to draw the hue levels).
paid_back_label = df[target].map({0: 'No', 1: 'Yes'})

# Iterate over column NAMES and skip the target. The previous
# `num_features[:-1]` sliced off the last ROW of the frame, not the last
# column, so the target was still plotted against itself.
for feature in [col for col in num_features.columns if col != target]:
    ax = sns.histplot(
        data=df,
        x=feature,
        kde=True,
        hue=paid_back_label,
        hue_order=['No', 'Yes'],
        bins=30
    )
    ax.set_title(f'Distribution: {feature.replace("_", " ")}')
    ax.tick_params(axis='x', rotation=45)
    ax.grid(axis='y')
    ax.get_legend().set_title('Is loan paid back?')
    plt.show()

## Short Conclusion
- The visible majority earns up to 100,000–120,000 per year. There is a positive correlation between income level and loan repayment.  
- The higher the debt-to-income ratio, the greater the chance of default (starting from 0.2, almost no borrowers repaid their loans). There is likely a strong dependence between the debt-to-income ratio and default probability.  
- Credit scores start from around 500 points. The higher the credit score, the greater the likelihood of repayment.  
- The most common loan amount is around $30,000. Loan size affects repayment probability but is not a decisive factor.  
- The highest repayment probability is observed at an interest rate of 12.5%.  

In [None]:
# Iterate over column NAMES and skip the target. The previous
# `cat_features[:-1]` sliced off the last ROW of the frame, not the last
# column, so the target itself was also drawn (split by itself).
for feature in [col for col in cat_features.columns if col != target]:
    sns.countplot(
        data=df,
        x=feature,
        hue=target,
        palette='viridis'
    )
    plt.title(f'Distribution: {feature.replace("_", " ")}')
    plt.xticks(rotation=45)
    plt.grid(axis='y')
    plt.legend(title='Is loan paid back?', labels=['No', 'Yes'])
    plt.show()

In [None]:
# One boxplot per numeric predictor to eyeball outliers. The target is part of
# `num_features` but is skipped: a boxplot of a 0/1 label carries no information.
for feature in [col for col in num_features.columns if col != target]:
    ax = sns.boxplot(data=df, x=feature)
    ax.set_title(f'Boxplot (outliers): {feature.replace("_", " ")}')
    plt.show()

## 1.2. Statistical tests

- For numerical features: Welch's t-test (Student's t-test without the equal-variance assumption)
- For categorical features: Chi-squared test of independence

In [None]:
from scipy.stats import ttest_ind, chi2_contingency

In [None]:
def ttest(feature, data=None, target_col='loan_paid_back', alpha=.05):
    """Compare a numeric feature between repaid and defaulted loans (Welch's t-test).

    Parameters
    ----------
    feature : str
        Name of the numeric column to test.
    data : pandas.DataFrame, optional
        Frame holding `feature` and `target_col`. Defaults to the notebook-level
        `df`, which keeps existing `ttest(feature)` calls working.
    target_col : str, default 'loan_paid_back'
        Binary column: 1 marks the "paid" group, 0 the "default" group.
    alpha : float, default 0.05
        Significance level used for the printed verdict.

    Returns
    -------
    tuple
        `(t_statistic, p_value)`, so the result can be collected as well as read.
    """
    frame = df if data is None else data

    # Missing values are dropped per group; otherwise a single NaN would turn
    # the statistic and the p-value into NaN and silently print "NOT SIGNIFICANT".
    paid = frame.loc[frame[target_col] == 1, feature].dropna()
    defaulted = frame.loc[frame[target_col] == 0, feature].dropna()

    # equal_var=False -> Welch's variant: group variances are not assumed equal.
    t_stat, p_value = ttest_ind(paid, defaulted, equal_var=False)

    verdict = 'SIGNIFICANT' if p_value < alpha else 'NOT SIGNIFICANT'
    print(f'{feature.replace("_", " ").upper()} is {verdict}.')
    print(f'T-statistics: {t_stat:.2f} | P-value: {p_value:.5f}')
    return t_stat, p_value

In [None]:
# Test every numeric predictor. `num_features` also holds the target column;
# it is skipped because splitting the target by itself gives two constant
# groups, which is a degenerate t-test.
for feature in [col for col in num_features.columns if col != target]:
    print('-' * 15)
    ttest(feature)

In [None]:
def chi2_test(feature, data=None, target_col='loan_paid_back', alpha=.05):
    """Chi-squared test of independence between a categorical feature and the target.

    Parameters
    ----------
    feature : str
        Name of the categorical column to test.
    data : pandas.DataFrame, optional
        Frame holding `feature` and `target_col`. Defaults to the notebook-level
        `df`, which keeps existing `chi2_test(feature)` calls working.
    target_col : str, default 'loan_paid_back'
        Column cross-tabulated against `feature`.
    alpha : float, default 0.05
        Significance level used for the printed verdict.

    Returns
    -------
    tuple
        `(chi2_statistic, p_value, degrees_of_freedom)`.
    """
    frame = df if data is None else data

    # Contingency table: one row per target value, one column per category.
    observed = pd.crosstab(index=frame[target_col], columns=frame[feature])

    # The table of expected frequencies is returned as well but is not needed here.
    chi2, p_value, dof, _expected = chi2_contingency(observed)

    verdict = 'SIGNIFICANT' if p_value < alpha else 'NOT SIGNIFICANT'
    print(f'{feature.replace("_", " ").upper()} is {verdict}.')
    print(f'P-value: {p_value:.5f}')
    return chi2, p_value, dof

In [None]:
# Test every categorical predictor. `cat_features` carries the target as its
# last column; it is skipped because the target is trivially dependent on itself.
for feature in [col for col in cat_features.columns if col != target]:
    print('-' * 15)
    chi2_test(feature)

## 1.3. Statistically significant features
- Significant: annual income, debt-to-income ratio, credit score, loan amount, interest rate, gender, education level, employment status, loan purpose, grade subgrade.
- Not significant: marital status

---

# 2. Feature engineering & modeling (basic logistic regression)

## 2.1 Train data

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

In [None]:
def add_features(df):
    """Return a copy of `df` extended with four engineered columns.

    Added columns, in this order:
    - loan_to_income_ratio: loan_amount / annual_income
    - grade_num: grade letter rank (A=1, B=2, ...) plus the first subgrade digit / 10
    - interest_to_credit_score_ratio: interest_rate / credit_score
    - payment_to_income_ratio: (loan_amount * interest_rate / 100) / (annual_income / 12)

    The input frame is not modified.
    """
    def grade_to_number(code):
        # e.g. 'B3' -> 2 + 0.3; only the first character after the letter is read.
        return ord(code[0]) - 64 + int(code[1]) / 10

    interest_amount = df["loan_amount"] * (df["interest_rate"] / 100)
    monthly_income = df["annual_income"] / 12

    return df.assign(
        loan_to_income_ratio=df["loan_amount"] / df["annual_income"],
        grade_num=df["grade_subgrade"].apply(grade_to_number),
        interest_to_credit_score_ratio=df["interest_rate"] / df["credit_score"],
        payment_to_income_ratio=interest_amount / monthly_income,
    )

# Rebind `df` to the frame extended with the engineered columns, then preview it.
df = add_features(df)
df.head()

In [None]:
# Label vector first, then the predictor matrix: every column except the target
# and `marital_status` (reported as not significant in section 1).
y = df[target]
X = df.drop(columns=['marital_status', target])

In [None]:
# Hold out 20% of the rows for validation; stratifying on `y` keeps the class
# proportions the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Column names grouped by dtype; these drive the ColumnTransformer below.
numeric_dtypes = ['int64', 'float64']
num_cols = X_train.select_dtypes(include=numeric_dtypes).columns
cat_cols = X_train.select_dtypes(exclude=numeric_dtypes).columns

In [None]:
# Numeric columns are standardised.
numeric_transformer = StandardScaler()
# Categorical columns are one-hot encoded; handle_unknown="ignore" means a
# category not seen during fit does not raise at transform time.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

In [None]:
# Route each column group to its transformer. Output order is numeric columns
# first, then the one-hot columns (the coefficient table later relies on this).
# NOTE(review): this single instance is reused as a step in the logistic
# regression, LightGBM and XGBoost pipelines, so each fit refits the same object.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols)
    ]
)

In [None]:
# Baseline model: preprocessing followed by an L2-regularised logistic
# regression. class_weight='balanced' reweights the classes by their frequency.
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(C=1, penalty='l2', solver='saga', max_iter=10000, class_weight='balanced'))
])

In [None]:
# Fit preprocessing and classifier on the training part only; X_val stays held out.
model.fit(X_train, y_train)

In [None]:
# 5-fold stratified cross-validation of the whole pipeline on all labelled
# data, scored by ROC-AUC.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=cv, scoring="roc_auc")

auc_mean, auc_std = scores.mean(), scores.std()
print("Mean AUC:", auc_mean, "+-", auc_std)

In [None]:
# Hard labels feed the threshold-based metrics; the positive-class probability
# feeds ROC-AUC.
y_pred = model.predict(X_val)
y_pred_proba = model.predict_proba(X_val)[:, 1]

print("Accuracy:", accuracy_score(y_val, y_pred))
print("\nClassification Report:\n", classification_report(y_val, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_val, y_pred))
# ROC-AUC is a ranking metric and must be computed from scores. Passing the
# 0/1 predictions collapses the curve to a single threshold and understates
# the AUC, which also made it incomparable with the ensemble cells below.
print("--> ROC-AUC Score:", roc_auc_score(y_val, y_pred_proba))

In [None]:
# Only the regularisation strength C is tuned. `max_iter` is a convergence cap,
# not a model hyperparameter: it stays at the value already set on the
# pipeline's classifier instead of tripling the search (12 -> 4 candidates,
# each fitted on 5 folds).
param_grid = {
    'classifier__C': [0.1, 1, 10, 100],
    'classifier__penalty': ['l2'],
    "classifier__solver": ["saga"]
}

# Same fold layout as the plain cross-validation above, so scores are comparable.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_search_logreg = GridSearchCV(
    model,
    param_grid=param_grid,
    cv=cv,
    scoring="roc_auc",
    n_jobs=-1,
    verbose=2
)

grid_search_logreg.fit(X, y)

print('-' * 45)
print("Best parameters:", grid_search_logreg.best_params_)
print("Best ROC-AUC:", grid_search_logreg.best_score_)
print('-' * 45)

## 2.2 Test data

In [None]:
# add_features already returns a new frame, so test_df itself is left untouched.
X_test_processed = add_features(test_df)

# Class probabilities and hard labels from the logistic-regression pipeline.
test_probabilities = model.predict_proba(X_test_processed)
test_predictions = model.predict(X_test_processed)

# Probability pair of the first test row, as a quick sanity check.
test_probabilities[0]

In [None]:
# Number of rows to show; the printed headline and the table now share it
# (the headline used to say 10 while 20 rows were displayed).
top_n = 20

# Names of the one-hot columns produced by the fitted categorical encoder.
encoder = model.named_steps['preprocessor'].named_transformers_['cat']
ohe_features = encoder.get_feature_names_out(cat_cols)

# Features: numeric columns first, then the one-hot columns, i.e. the order in
# which the transformers are listed in the preprocessor.
features = np.concatenate([num_cols, ohe_features])
# Coefficients: one per transformed feature of the fitted classifier.
coefs = model.named_steps['classifier'].coef_.flatten()

# Rank features by coefficient magnitude.
feature_importance = pd.DataFrame({
    'Feature': features,
    'Coefficient': coefs,
    'Abs_importance': np.abs(coefs)
}).sort_values('Abs_importance', ascending=False)

print(f'{top_n} most significant features')

feature_importance.head(top_n)

# 3. Ensembles 

In [None]:
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

In [None]:
# Gradient-boosting pipelines built on the same preprocessing step as the
# logistic regression.
pipe_lgb = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", lgb.LGBMClassifier(n_estimators=2000, learning_rate=0.05, random_state=42))
])

pipe_xgb = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", XGBClassifier(
        n_estimators=500,
        # Was misspelled `learning_rage`, so the intended 0.05 was never
        # applied as the learning rate.
        learning_rate=.05,
        max_depth=6,
        tree_method='hist',
        random_state=42
    ))
])

In [None]:
# Train the LightGBM pipeline on the training part and show the fitted pipeline.
lgb_model = pipe_lgb.fit(X_train, y_train)
display(lgb_model)

# Positive-class probabilities (for ROC-AUC) and hard labels (for the rest).
y_pred_proba = lgb_model.predict_proba(X_val)[:, 1]
y_pred = lgb_model.predict(X_val)

validation_report = (
    ("Accuracy:", accuracy_score(y_val, y_pred)),
    ("\nClassification Report:\n", classification_report(y_val, y_pred)),
    ("\nConfusion Matrix:\n", confusion_matrix(y_val, y_pred)),
    ("--> ROC-AUC Score:", roc_auc_score(y_val, y_pred_proba)),
)
for heading, value in validation_report:
    print(heading, value)

In [None]:
# Train the XGBoost pipeline on the training part and show the fitted pipeline.
xgb_model = pipe_xgb.fit(X_train, y_train)
display(xgb_model)

# Positive-class probabilities (for ROC-AUC) and hard labels (for the rest).
y_pred_proba = xgb_model.predict_proba(X_val)[:, 1]
y_pred = xgb_model.predict(X_val)

validation_report = (
    ("Accuracy:", accuracy_score(y_val, y_pred)),
    ("\nClassification Report:\n", classification_report(y_val, y_pred)),
    ("\nConfusion Matrix:\n", confusion_matrix(y_val, y_pred)),
    ("--> ROC-AUC Score:", roc_auc_score(y_val, y_pred_proba)),
)
for heading, value in validation_report:
    print(heading, value)

In [None]:
# Rebind test_df to the frame extended with the same engineered columns that
# were added to the training data.
test_df = add_features(test_df)

test_df.head()

In [None]:
# Score the test set with the fitted LightGBM pipeline: hard labels and the
# probability of the positive class.
test_preds = lgb_model.predict(test_df)
test_probs = lgb_model.predict_proba(test_df)[:, 1]

# Store both next to the test rows for inspection.
test_df["loan_paid_back_prob"] = test_probs
test_df["loan_paid_back_pred"] = test_preds

test_df.head(10)

In [None]:
# Submission file: one row per test id with the LightGBM repayment probability.
submission = pd.DataFrame({
    'id': test_df['id'],
    'loan_paid_back': test_probs,
})

submission.to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows of the submission that was just written.
submission.head()