In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split,GridSearchCV

In [None]:
# Read the raw loan dataset into a DataFrame.
# NOTE(review): the path looks like a Colab-mounted Drive folder — adjust when running elsewhere.
data = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/loan predictor/loan_data.csv',
)

In [None]:
# Peek at the first five rows to sanity-check the load.
data.head(n=5)

In [None]:
# `loan_id` is dropped before any modelling; only the remaining columns are kept.
data = data.drop('loan_id', axis='columns')

In [None]:
# (rows, columns) of the frame at this point in the notebook.
data.shape

In [None]:
# Number of rows that exactly repeat an earlier row (first occurrences are not counted).
data.duplicated(keep='first').sum()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the frame's columns.
data.describe()

In [None]:
# Print the raw column labels as a list so stray whitespace in names is visible.
print(list(data.columns))

In [None]:
# Remove leading/trailing whitespace from every column label so later
# lookups by name (e.g. data['loan_status']) use the clean names.
data.columns = data.columns.str.strip()

# Preview only the first rows instead of rendering the entire DataFrame.
data.head()

In [None]:
# How many rows report a negative residential asset value?
data['residential_assets_value'].lt(0).sum()

In [None]:
# Report the number of negative residential asset values.
negative_residential_count = data['residential_assets_value'].lt(0).sum()
print(f"Negative residential assets: {negative_residential_count}")

In [None]:
# Floor residential asset values at zero: negatives become 0, everything else is untouched.
data['residential_assets_value'] = data['residential_assets_value'].clip(lower=0)

In [None]:
# Number of rows whose bank asset value is exactly zero.
zero_bank_rows = data.loc[data['bank_asset_value'].eq(0)]
print(len(zero_bank_rows))

In [None]:
# Missing-value count per column.
data.isna().sum()

In [None]:
# Encode the target as 1 (Approved) / 0 (Rejected).
#
# The numeric guard keeps this cell safe to re-run: mapping an already-encoded
# column a second time would turn every label into NaN (and `.str` would fail
# on a numeric column).
if not pd.api.types.is_numeric_dtype(data['loan_status']):
    status_codes = {'Approved': 1, 'Rejected': 0}
    status_labels = data['loan_status'].str.strip()

    # Fail loudly on labels outside the mapping instead of silently producing NaN.
    unexpected = set(status_labels.dropna().unique()) - set(status_codes)
    if unexpected:
        raise ValueError(f"Unexpected loan_status values: {sorted(unexpected)}")

    data['loan_status'] = status_labels.map(status_codes)

In [None]:
# Share of each target class (fractions sum to 1).
class_share = data['loan_status'].value_counts(normalize=True)
class_share

As we can see, there's a class imbalance.

I performed feature engineering to address the multicollinearity I found in the heatmap.


1. Interaction Feature: I created a `loan_to_income_ratio` feature (`loan_amount / income_annum`). This captures the applicant's repayment burden, which is a much stronger predictor of risk than raw income alone.


2. Dimensionality Reduction: Since the asset classes were highly correlated, I aggregated them into total_assets.

In [None]:
# Repayment burden: requested loan amount relative to annual income.
# NOTE(review): a zero `income_annum` would yield inf here — confirm the data has none.
data['loan_to_income_ratio'] = data['loan_amount'].div(data['income_annum'])

# Collapse the four asset columns into one total.
data['total_assets'] = (
    data['residential_assets_value']
    .add(data['commercial_assets_value'])
    .add(data['luxury_assets_value'])
    .add(data['bank_asset_value'])
)

Dropping Unnecessary Columns

In [None]:
# The raw inputs are now represented by the engineered features
# (`loan_to_income_ratio`, `total_assets`), so remove them.
ratio_inputs = ['loan_amount', 'income_annum']
asset_inputs = [
    'residential_assets_value',
    'commercial_assets_value',
    'luxury_assets_value',
    'bank_asset_value',
]
data = data.drop(columns=ratio_inputs + asset_inputs)

In [None]:
# Separate the target column from the feature matrix.
target_col = 'loan_status'
y = data[target_col]
X = data.drop(columns=[target_col])

In [None]:
# 80/20 split, stratified on the target so both splits keep the same class mix.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Partition the training features by dtype: object columns are treated as
# categorical, numeric columns as numerical.
cat_cols = list(X_train.select_dtypes(include=['object']).columns)
num_cols = list(X_train.select_dtypes(include=[np.number]).columns)

for label, cols in (("Categorical", cat_cols), ("Numerical", num_cols)):
    print(f"{label} columns: {cols}")

In [None]:
# Distribution plots: one histogram per numeric column of the training split.
# `plt` is already imported in the notebook's import cell, so it is not re-imported here.
X_train.hist(figsize=(15, 10))
plt.tight_layout()  # keep subplot titles/ticks from overlapping
plt.show()          # render the figure and suppress the array-of-Axes repr

In [None]:
# One box plot per numerical feature to eyeball outliers.
for col in num_cols:
    fig, ax = plt.subplots()
    sns.boxplot(x=X_train[col], ax=ax)

In [None]:
# Correlation heatmap of the numerical training features
corr_matrix = X_train[num_cols].corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Numerical columns go through unchanged; categorical columns are one-hot
# encoded, with categories unseen during fit ignored at transform time.
numeric_step = ('num', 'passthrough', num_cols)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

# XGBoost uses 'scale_pos_weight' instead of 'class_weight' for imbalance.
# Its recommended value is (#negative samples / #positive samples), so derive
# it from the training labels rather than hard-coding a constant that may
# push the weighting in the wrong direction.
n_negative = int((y_train == 0).sum())
n_positive = int((y_train == 1).sum())
xgb_scale_pos_weight = n_negative / max(n_positive, 1)  # guard against an empty positive class

models = {
    "Decision Tree": DecisionTreeClassifier(random_state=42, class_weight='balanced'),
    "Random Forest": RandomForestClassifier(random_state=42, class_weight='balanced'),
    # random_state added so XGBoost is seeded like the other two models.
    "XGBoost": XGBClassifier(
        eval_metric='logloss',
        scale_pos_weight=xgb_scale_pos_weight,
        random_state=42,
    ),
}

In [None]:
# Train every candidate inside its own preprocess -> classify pipeline and
# print the classification report on the held-out test split.
for name, model in models.items():
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', model),
    ])

    # fit() returns the pipeline itself, so predictions can be chained.
    y_pred = pipeline.fit(X_train, y_train).predict(X_test)

    print(f"--- {name} Report ---")
    print(classification_report(y_test, y_pred))
    print("\n")

In [None]:
# Overfitting check: compare train vs. test accuracy for every model.
from sklearn.metrics import accuracy_score

for name, model in models.items():
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', model),
    ]).fit(X_train, y_train)

    # Predictions on both splits
    y_train_pred = pipeline.predict(X_train)
    y_test_pred = pipeline.predict(X_test)

    # A large gap between the two accuracies signals overfitting
    train_score = accuracy_score(y_train, y_train_pred)
    test_score = accuracy_score(y_test, y_test_pred)

    print(f"\n--- {name} ---")
    print(f"Train Accuracy: {train_score:.4f}")
    print(f"Test Accuracy: {test_score:.4f}")
    print(f"Difference: {abs(train_score - test_score):.4f}")
    print(classification_report(y_test, y_test_pred))

In [None]:
import shap

# Fit the XGBoost pipeline that is explained below.
model = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', models['XGBoost'])])
model.fit(X_train, y_train)

# Handles to the fitted steps, plus the post-encoding feature names.
fitted_preprocessor = model.named_steps['preprocessor']
fitted_classifier = model.named_steps['classifier']
feature_names = fitted_preprocessor.get_feature_names_out()

# 1. Tree explainer built on the fitted classifier step
explainer = shap.TreeExplainer(fitted_classifier)

# 2. Training data in the classifier's input space; densify when the
#    transformer returns an object exposing `toarray` (e.g. a sparse matrix)
X_train_transformed = fitted_preprocessor.transform(X_train)
if hasattr(X_train_transformed, 'toarray'):
    X_train_transformed = X_train_transformed.toarray()

# 3. SHAP values for every training row
shap_values = explainer.shap_values(X_train_transformed)

# 4. Global view: summary plot over all features
shap.summary_plot(shap_values, X_train_transformed, feature_names=feature_names)

# 5. Local view: force plot for the first training row
shap.initjs()
shap.force_plot(
    explainer.expected_value,
    shap_values[0, :],
    X_train_transformed[0, :],
    feature_names=feature_names,
)

In [None]:
# Persist the pipeline held in `model` to disk with joblib.
# NOTE(review): hard-coded Drive path — adjust when running outside this environment.
import joblib

model_path = '/content/drive/MyDrive/loan predictor/loan_model.joblib'
joblib.dump(value=model, filename=model_path)