# Predicting Loan Payback - Kaggle Competition

## Importing Libraries

In [53]:
# Importing Libraries
import pandas as pd
import numpy as np
from zipfile import ZipFile
import joblib

# Sklearn Imports 
from sklearn.model_selection import train_test_split,cross_val_score,RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report,roc_auc_score
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler, power_transform
from sklearn.ensemble import RandomForestClassifier

# Ignore Warnings
import warnings 
warnings.filterwarnings('ignore')

## Loading the dataset

In [54]:
# loading the dataset
path= r"C:\Users\OLASQUARE\Downloads\Compressed\playground-series-s5e11_2.zip"
with ZipFile(path, 'r') as zip_ref:
    # loading the train data
    with zip_ref.open("train.csv") as train:
        train_raw= pd.read_csv(train)

    # loading the test data
    with zip_ref.open("test.csv") as test:
        test_raw= pd.read_csv(test)

In [55]:
#   making the copy of the data
train_data=train_raw.copy()
test_data=test_raw.copy()

## Preprocesssing

From the EDA, there are right skewed and left skew observed.. Power transformation is recommender

In [56]:
# Let split the data to X and y
X= train_data.drop(columns=['loan_paid_back','id'])
y= train_data['loan_paid_back']

# let remove the id column from test data
test_data= test_data.drop(columns=['id'])

In [57]:
# let power transform the numerical columns to reduce skewness
def handle_skewness(df, columns:dict):
    df=df.copy()
    df[columns]= power_transform(df[columns], method='yeo-johnson')
    return df

# let treat the skewness of numerical columns in X
num_col= X.select_dtypes(include='number').columns
X= handle_skewness(X, num_col)

# let treat the skewness of numerical columns in test data too
num_col_test= test_data.select_dtypes(include='number').columns
test_data= handle_skewness(test_data, num_col_test)


### Encoding categorical data

In [58]:
cat_col= train_data.select_dtypes(include='object').columns
cat_col

Index(['gender', 'marital_status', 'education_level', 'employment_status',
       'loan_purpose', 'grade_subgrade'],
      dtype='object')

In [59]:
# one hot encoding categorical columns
def one_hot_encoder(df, columns:dict):
    df=df.copy()
    df= pd.get_dummies(df, columns=columns, drop_first=True)
    return df

# based on EDA column to be encoded [gender,marital_status,employment_status]
cat=['gender','marital_status','employment_status','loan_purpose']
X= one_hot_encoder(X, cat)

# let one hot encode the categorical columns in test data too
test_data= one_hot_encoder(test_data, cat)


In [60]:
# manual encoding target column [education_level,grade_subgrade]
def manual_encoder(df, column:str, mapping:dict):
    df=df.copy()
    df[column]= df[column].map(mapping)
    return df


In [61]:
# encoding education_level
edu_order = {
    "High School": 1,
    "Bachelor's": 2,
    "Master's": 3,
    "PhD": 4,
    "Other": 0
}
X= manual_encoder(X, 'education_level', edu_order)

# let manual encode education_level in test data too
test_data= manual_encoder(test_data, 'education_level', edu_order)

In [62]:
# ecoding grade_subgrade
grade_order = [
    'A1','A2','A3','A4','A5',
    'B1','B2','B3','B4','B5',
    'C1','C2','C3','C4','C5',
    'D1','D2','D3','D4','D5',
    'E1','E2','E3','E4','E5',
    'F1','F2','F3','F4','F5']

grade_mapping= {grade: idx+1 for idx, grade in enumerate(grade_order)}
X= manual_encoder(X, 'grade_subgrade', grade_mapping)

# let manual encode grade_subgrade in test data too
test_data= manual_encoder(test_data, 'grade_subgrade', grade_mapping)

### Scaling

In [63]:
# let scale the numerical columns using minmax scaler 
scaler= MinMaxScaler()
X[num_col]= scaler.fit_transform(X[num_col])

# let scale the numerical columns in test data too
test_data[num_col]= scaler.transform(test_data[num_col])

### Feature selection

In [64]:
# let select features based on correlation
corr= X.corrwith(y).abs()
corr_selected= corr[corr>0.1].index.tolist()

# showing the correlation heatmap
# plt.figure(figsize=(10,6))
# sns.heatmap(X[corr_selected].corr(), annot=True, cmap='coolwarm')
# plt.title("Correlation Heatmap of Selected Features")
# plt.show()

# checking multicoli
# selected feature correlate with each other
print(f"selected features before checking multicollinearity:\n{corr_selected}")
print("......................\n")
print("Checking multicollinearity among selected features...")
if np.any(X[corr_selected].corr().abs() > 0.8):
    print("Multicollinearity detected among selected features.")
    # dropping one of the correlated features
    upper_tri= X[corr_selected].corr().abs().where(np.triu(np.ones(X[corr_selected].corr().shape), k=1).astype(bool))
    to_drop= [column for column in upper_tri.columns if any(upper_tri[column]>0.8)]
    selected_features= [feature for feature in corr_selected if feature not in to_drop]
    print(f"Final selected features after removing multicollinearity:\n{selected_features}")
else:
    print("No multicollinearity detected among selected features.")
    selected_features= corr_selected
    print(f"Final selected features:\n{selected_features}")


# let save our resources for future use
# joblib.dump(scaler, 'minmax_scaler.pkl')
# joblib.dump(selected_features, 'selected_features.pkl')
# joblib.dump(grade_mapping, 'grade_mapping.pkl')
# joblib.dump(edu_order, 'edu_order.pkl')

selected features before checking multicollinearity:
['debt_to_income_ratio', 'credit_score', 'interest_rate', 'grade_subgrade', 'employment_status_Student', 'employment_status_Unemployed']
......................

Checking multicollinearity among selected features...
Multicollinearity detected among selected features.
Final selected features after removing multicollinearity:
['debt_to_income_ratio', 'credit_score', 'interest_rate', 'employment_status_Student', 'employment_status_Unemployed']


## Modelling

### Split Data

In [65]:
# let split our data
X_final= X[selected_features]
X_train, X_val, y_train, y_val= train_test_split(X_final, y, test_size=0.2, random_state=42, stratify=y)

### Initialize Models

In [66]:
# # initialize models
# models = {
#     "Logistic Regression": LogisticRegression(max_iter=1000, random_state=234),
#     "Random Forest": RandomForestClassifier(random_state=234),
#     # "Decision Tree": DecisionTreeClassifier(random_state=234),
#     # "KNN": KNeighborsClassifier()
# }

### Train and Evaluate model

In [67]:
# let train those models


# for name, model in models.items():
#     model.fit(X_train, y_train)
#     # save the model for future use
#     joblib.dump(model, f'{name}.pkl')

#     # model.dump(f"{name}.pkl")
#     # y_val_pred= model.predict(X_val)

#     # roc_auc= roc_auc_score(y_val, model.predict_proba(X_val)[:,1])
#     # print(f"{name} Validation ROC-AUC Score: {roc_auc:.4f}")
#     # print(classification_report(y_val, y_val_pred))
#     # print("=="*50)
#     # print("")

### Tunning the best trained Model

Since Logistic Regression and Random Forest are very close. So, I will tune Random Forest for stability and interpretability of feature importance.

In [68]:
# let tune the best trained Model using Random search
rf = RandomForestClassifier(random_state=234, n_jobs=-1)

# parameter random for tuning
# new changes
param_dist = {
    'n_estimators': [100,200,400],
    'max_depth': [8, 12, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt']
}

# new Randomized Search
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=8,
    scoring='roc_auc',
    cv=3,
    random_state=234,
    n_jobs=-1
)
# Fit on your training data
random_search.fit(X_train, y_train)

print("Best parameters found:")
print(random_search.best_params_)
print(f"Best CV AUC: {random_search.best_score_:.4f}")

# save the best model
best_model = random_search.best_estimator_


Best parameters found:
{'n_estimators': 400, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 12}
Best CV AUC: 0.9137


### Evaluating the best model on test data

In [69]:
# # let do cross validation for the best estimator
# cv_scores = cross_val_score(best_model, X_train, y_train, cv=5)
# print(f"Cross-validation scores for Random Forest: {cv_scores}")
# print(f"Mean Cross-validation score: {np.mean(cv_scores):.2f}")

### Predict with Best tuned model

In [71]:
# Predict with Best tuned model
test_df=test_data[selected_features] # selecting only the features used in training

# predicting with the best model
y_test_pred= best_model.predict_proba(test_df)[:, 1]

# preparing submission file
submission= pd.DataFrame({
    'id': test_raw['id'],
    'loan_paid_back': y_test_pred
})
print(submission.head(10))
submission.to_csv('loan_payback_predictions3.csv', index=False)

       id  loan_paid_back
0  593994        0.918884
1  593995        0.982649
2  593996        0.475085
3  593997        0.942308
4  593998        0.951999
5  593999        0.973974
6  594000        0.983233
7  594001        0.969953
8  594002        0.933707
9  594003        0.008908
