Loan Payback with XGBoost and Feature Engineering

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')

# import the necessary libraries

In [None]:
train = pd.read_csv('/kaggle/input/playground-series-s5e11/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s5e11/test.csv')
sample = pd.read_csv('/kaggle/input/playground-series-s5e11/sample_submission.csv')

# load the train data, the test data and the sample submission
# from the Kaggle input directory of the competition

In [None]:
test_ids = test['id']

# save the id column of the test data
# it is dropped from test before prediction and needed again for the submission file

In [None]:
train.head()

# preview the first rows of the train data

In [None]:
test.head()

# preview the first rows of the test data

# Feature Engineering

Train Data

In [None]:
train['loan_to_income_ratio'] = train['loan_amount'].div(train['annual_income'])

# create a new column to show loan burden relative to income
# (loan_amount divided by annual_income, row by row)

In [None]:
train_high_dti = train['debt_to_income_ratio'] > 0.4
train_low_grade = train['grade_subgrade'].str.startswith(('E', 'F', 'G'))
train['is_high_risk'] = (train_high_dti | train_low_grade).astype(int)

# create a new column to flag high risk borrowers
# 1 when debt_to_income_ratio > 0.4 or grade_subgrade starts with E/F/G
# 0 otherwise

In [None]:
def classify_employment(status):
    """Return the stability tier for one employment status value.

    'Employed' and 'Self-employed' map to 'Stable', 'Part-time' and
    'Retired' to 'Moderate', 'Unemployed' to 'Unstable'. Every other
    value maps to 'Unknown'.
    """
    tiers = (
        ('Stable', ('Employed', 'Self-employed')),
        ('Moderate', ('Part-time', 'Retired')),
        ('Unstable', ('Unemployed',)),
    )
    for tier, statuses in tiers:
        if status in statuses:
            return tier
    return 'Unknown'

train['employment_stability'] = train['employment_status'].map(classify_employment)

# create a new column to classify employment stability
# Employed or Self-employed = Stable
# Part-time or Retired = Moderate
# Unemployed = Unstable
# any other value = Unknown

Test Data

In [None]:
test['loan_to_income_ratio'] = test['loan_amount'].div(test['annual_income'])

# create a new column to show loan burden relative to income
# (loan_amount divided by annual_income, row by row)

In [None]:
test_high_dti = test['debt_to_income_ratio'] > 0.4
test_low_grade = test['grade_subgrade'].str.startswith(('E', 'F', 'G'))
test['is_high_risk'] = (test_high_dti | test_low_grade).astype(int)

# create a new column to flag high risk borrowers
# 1 when debt_to_income_ratio > 0.4 or grade_subgrade starts with E/F/G
# 0 otherwise

In [None]:
def classify_employment(status):
    """Map one employment status value to a stability tier.

    Returns 'Stable' for 'Employed'/'Self-employed', 'Moderate' for
    'Part-time'/'Retired', 'Unstable' for 'Unemployed', and 'Unknown'
    for anything else.
    """
    if status in ('Employed', 'Self-employed'):
        return 'Stable'
    if status in ('Part-time', 'Retired'):
        return 'Moderate'
    if status in ('Unemployed',):
        return 'Unstable'
    return 'Unknown'

test['employment_stability'] = test['employment_status'].map(classify_employment)

# create a new column to classify employment stability
# Employed or Self-employed = Stable
# Part-time or Retired = Moderate
# Unemployed = Unstable
# any other value = Unknown

# Clean The Train Data


In [None]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

columnns = ['gender','marital_status','education_level','employment_status','loan_purpose','grade_subgrade','employment_stability']

for i in columnns :
  train[i] = encoder.fit_transform(train[i])

# preprocess the train data

In [None]:
train.drop(columns=['id'],inplace=True)

# drop the unnecessary column

In [None]:
train.isnull().sum().sum()

# check for missing values

In [None]:
train.duplicated().sum()

# check for duplicates values

# Clean The Test Data

In [None]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

columnns = ['gender','marital_status','education_level','employment_status','loan_purpose','grade_subgrade','employment_stability']

for i in columnns :
  test[i] = encoder.fit_transform(test[i])

# preprocess the test data

In [None]:
test.drop(columns=['id'],inplace=True)

# drop the id column from the test data (it is not a model feature)

In [None]:
test.isna().sum().sum()

# count the missing values over all columns of the test data

In [None]:
test.duplicated().sum()

# count the duplicated rows in the test data

# Exploratory Data Analysis ( EDA )

In [None]:
fig, ax = plt.subplots(figsize=(8, 8))
sns.histplot(data=train, x='annual_income', ax=ax)
ax.set_title("Annual Income Distribution")
plt.show()

# histogram of annual_income in the train data

In [None]:
fig, ax = plt.subplots(figsize=(8, 8))
sns.histplot(data=train, x='debt_to_income_ratio', ax=ax)
ax.set_title("Debt to Income Ratio Distribution")
plt.show()

# histogram of debt_to_income_ratio in the train data

In [None]:
fig, ax = plt.subplots(figsize=(8, 8))
sns.histplot(data=train, x='credit_score', ax=ax)
ax.set_title("Credit Score Distribution")
plt.show()

# histogram of credit_score in the train data

In [None]:
fig, ax = plt.subplots(figsize=(8, 8))
sns.histplot(data=train, x='loan_amount', ax=ax)
ax.set_title("Loan Amount Distribution")
plt.show()

# histogram of loan_amount in the train data

In [None]:
fig, ax = plt.subplots(figsize=(8, 8))
sns.histplot(data=train, x='interest_rate', ax=ax)
ax.set_title("Interest Rate Distribution")
plt.show()

# histogram of interest_rate in the train data

In [None]:
fig, ax = plt.subplots(figsize=(8, 8))
sns.countplot(data=train, x='gender', ax=ax)
ax.set_xticks([0, 1, 2])
ax.set_xticklabels(['Female', 'Male', 'Other'])
ax.set_title('Gender Distribution')
plt.show()

# count plot of the encoded gender column
# the tick labels name the integer codes 0, 1, 2

In [None]:
fig, ax = plt.subplots(figsize=(8, 8))
sns.countplot(data=train, x='marital_status', ax=ax)
ax.set_xticks([0, 1, 2, 3])
ax.set_xticklabels(['Divorced', 'Married', 'Single', 'Widowed'])
ax.set_title('Marital Status Distribution')
plt.show()

# count plot of the encoded marital_status column
# the tick labels name the integer codes 0, 1, 2, 3

In [None]:
education = train['education_level'].value_counts()

# LabelEncoder numbers the categories in sorted order, so position k of this
# list is the name behind integer code k.
education_names = ['Bachelor','High School','Master','Other','PhD']

# value_counts() orders the rows by frequency, not by code. Look each wedge's
# label up from its code so a label can never land on the wrong wedge.
Labesl = [education_names[code] for code in education.index]

plt.figure(figsize=(8,8))
plt.pie(education,autopct='%1.1f%%',labels=Labesl,wedgeprops={'edgecolor':"black"})
plt.title('Education Level Distribution')
plt.show()

# explore the data with a pie chart of the education levels

In [None]:
employment = train['employment_status'].value_counts()

# LabelEncoder numbers the categories in sorted order, so position k of this
# list is the name behind integer code k.
employment_names = ['Employed','Retired','Self-employed','Student','Unemployed']

# value_counts() orders the rows by frequency, which depends on the data. Look
# each wedge's label up from its code instead of hard-coding a frequency order.
Labesl = [employment_names[code] for code in employment.index]

plt.figure(figsize=(8,8))
plt.pie(employment,autopct='%1.1f%%',labels=Labesl,wedgeprops={'edgecolor':"black"})
plt.title('Employment Status Distribution')
plt.show()

# explore the data with a pie chart of the employment statuses

In [None]:
loan = train['loan_purpose'].value_counts()

# LabelEncoder numbers the categories in sorted order, so position k of this
# list is the name behind integer code k.
purpose_names = ['Business','Car','Debt consolidation','Education','Home','Medical','Other','Vacation']

# value_counts() orders the rows by frequency, which depends on the data. Look
# each wedge's label up from its code instead of hard-coding a frequency order.
Labesl = [purpose_names[code] for code in loan.index]

plt.figure(figsize=(8,8))
plt.pie(loan,autopct='%1.1f%%',labels=Labesl,wedgeprops={'edgecolor':"black"})
plt.title('Loan Purpose Distribution')
plt.show()

# explore the data with a pie chart of the loan purposes

In [None]:
fig, ax = plt.subplots(figsize=(8, 8))
sns.countplot(data=train, x='employment_stability', ax=ax)
ax.set_xticks([0, 1, 2, 3])
ax.set_xticklabels(['Moderate', 'Stable', 'Unknown', 'UnStable'])
ax.set_title('Employment Stability Distribution')
plt.show()

# count plot of the encoded employment_stability column
# the tick labels name the integer codes 0, 1, 2, 3

In [None]:
corr_matrix = train.corr()

fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(corr_matrix, annot=True, ax=ax)
ax.set_title('Heat Map Correlation')
plt.show()

# annotated heatmap of the pairwise correlations between the train columns

# Build The Model

In [None]:
y = train['loan_paid_back']
x = train.drop(columns='loan_paid_back')

# y is the target column, x holds every other column of the train data

In [None]:
from sklearn.model_selection import train_test_split

split_parts = train_test_split(x, y, test_size=0.3, random_state=42)
x_train, x_valid, y_train, y_valid = split_parts

# split the data: 70% for training, 30% for validation, with a fixed seed

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier

xgb_model = XGBClassifier(random_state=42, verbosity=0)

xgb_params = {
    "n_estimators": [100, 200],
    "max_depth": [3, 5, 7],
    "learning_rate": [0.01, 0.1, 0.2],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "gamma": [0, 1, 5],
    "reg_alpha": [0, 0.1, 1],
    "reg_lambda": [1, 5, 10],
    "min_child_weight": [1, 3, 5]
}

xgb_random = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=xgb_params,
    n_iter=50,
    cv=5,
    scoring="roc_auc",
    n_jobs=-1,
    verbose=1,
    random_state=42
)

xgb_random.fit(x_train, y_train)

print("Best XGB params:", xgb_random.best_params_)
print("Best XGB score:", xgb_random.best_score_)

# use RandomizedSearchCV to find the best hyperparameters
# 50 random combinations are tried with 5-fold cross-validation
# candidates are ranked by ROC AUC, the metric the model is evaluated with below;
# "r2" is a regression score and would rank the hard class predictions instead

In [None]:
from xgboost import XGBClassifier


tuned_params = dict(
    n_estimators=200,
    max_depth=7,
    learning_rate=0.1,
    subsample=1.0,
    colsample_bytree=0.6,
    gamma=0,
    reg_alpha=1,
    reg_lambda=5,
    min_child_weight=1,
    random_state=42,
    verbosity=0,
)

Model = XGBClassifier(**tuned_params)
Model.fit(x_train, y_train)

# train the model on the training split with a fixed set of hyperparameters

# Model Evaluation

In [None]:
from sklearn.metrics import roc_auc_score


valid_proba = Model.predict_proba(x_valid)
y_proba = valid_proba[:, 1]

auc_score = roc_auc_score(y_valid, y_proba)
print(f"AUC Score: {auc_score:.4f}")

# evaluate the model with AUC score
# column 1 of predict_proba is the predicted probability of class 1

In [None]:
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

fpr, tpr, thresholds = roc_curve(y_valid, y_proba)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label=f'XGBoost (AUC = {auc_score:.3f})')
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend(loc='lower right')
ax.grid(True)
plt.show()

# evaluate the model with ROC curve
# the dashed diagonal is the reference line of a random classifier

In [None]:
from xgboost import plot_importance

fig, ax = plt.subplots(figsize=(12, 8))
plot_importance(Model, ax=ax, max_num_features=20, importance_type='gain', height=0.6)
ax.set_title("Top 20 Feature Importances")
plt.show()

# visualize feature importance (top 20 features ranked by gain)
# the axes are created first and passed to plot_importance: without ax= it
# opens its own default-sized figure and the 12x8 figure stays empty

# Submission

In [None]:
test_proba = Model.predict_proba(test)
predictions = test_proba[:, 1]

# column 1 of predict_proba is the predicted probability of class 1 for each test row

In [None]:
submission = pd.DataFrame({
    "id": test_ids,
    "loan_paid_back": predictions
})

submission.to_csv("submission.csv", index=False)

# build the submission file: one row per test id with the predicted probability
# the prediction column carries the name of the target, loan_paid_back

In [None]:
submission

# display the submission frame