# Notebook cell In [4]:
# ============================================================
# LOAN DEFAULT PREDICTION – COMPLETE END-TO-END CODE (GROUP 1)
# ============================================================
# Requirements:
# pip install pandas numpy scikit-learn matplotlib seaborn
#
# Dataset:
# Download a loan default dataset from Kaggle
# Rename it to: loan_data.csv
# Target column must be named: "default"
# (1 = Defaulter, 0 = Non-defaulter)
# ============================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    roc_auc_score,
    roc_curve
)

# ============================================================
# 1. LOAD DATA
# ============================================================

# Read the raw loan records from the CSV in the working directory.
data = pd.read_csv("loan_data.csv")

# ============================================================
# 2. REMOVE UNNECESSARY / LEAKAGE COLUMNS
# ============================================================

# Identifier-like columns; any of them that is absent is simply skipped.
drop_cols = ["customer_id", "name", "application_id"]
data = data.drop(columns=drop_cols, errors="ignore")

# ============================================================
# 3. HANDLE MISSING VALUES
# ============================================================

# Rows without a label cannot be used for supervised training. Imputing the
# 0/1 target with a median could also produce a fabricated 0.5 label, so
# such rows are dropped instead of filled.
if "default" in data.columns:
    data.dropna(subset=["default"], inplace=True)

# Numeric columns: fill gaps with the column median.
# Assign the result back rather than calling fillna(inplace=True) on
# data[col]: that form operates on a temporary Series, which pandas warns
# about and which leaves `data` unchanged under Copy-on-Write.
for col in data.select_dtypes(include=["int64", "float64"]).columns:
    data[col] = data[col].fillna(data[col].median())

# Text columns: fill gaps with the most frequent value.
for col in data.select_dtypes(include="object").columns:
    modes = data[col].mode()
    # mode() is empty when the column holds no non-missing value at all;
    # there is nothing sensible to fill with, so leave the column as is.
    if not modes.empty:
        data[col] = data[col].fillna(modes.iloc[0])

# ============================================================
# 4. REMOVE DUPLICATE ROWS
# ============================================================

data.drop_duplicates(inplace=True)

# ============================================================
# 5. OUTLIER REMOVAL USING IQR
# ============================================================

def remove_outliers_iqr(df, column, factor=1.5):
    """Return the rows of *df* whose *column* value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both
    inclusive), where Q1 and Q3 are the 25th and 75th percentiles of
    ``df[column]`` and ``IQR = Q3 - Q1``.

    Args:
        df: DataFrame to filter. It is not modified.
        column: Name of a numeric column of *df*.
        factor: Multiplier applied to the IQR to place the fences.
            Defaults to 1.5.

    Returns:
        A DataFrame holding only the rows within the fences, with their
        original index labels. Rows whose value is NaN are dropped because
        NaN never falls inside a range.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    margin = factor * (q3 - q1)
    # between() is inclusive on both ends by default.
    return df[values.between(q1 - margin, q3 + margin)]

# Screen numeric feature columns only. The target is excluded so rows are
# never discarded because of their label; errors="ignore" keeps this step
# working when the target is stored with a dtype that was not selected.
numeric_cols = data.select_dtypes(include=["int64", "float64"]).columns
numeric_cols = numeric_cols.drop("default", errors="ignore")

for col in numeric_cols:
    # With a zero IQR (binary flags, mostly-constant columns) both fences
    # collapse onto a single value, so filtering would delete every row that
    # differs from it. Such columns are left untouched.
    if data[col].quantile(0.75) == data[col].quantile(0.25):
        continue
    # Quantiles are recomputed on the already-filtered frame for each column.
    data = remove_outliers_iqr(data, col)

# ============================================================
# 6. ENCODE CATEGORICAL VARIABLES
# ============================================================

# One fitted encoder is kept per column in `encoders`. Refitting a single
# shared encoder would overwrite its classes_ on every column, leaving no way
# to decode a column or to encode a new record consistently.
encoders = {}
encoder = LabelEncoder()
for col in data.select_dtypes(include="object").columns:
    encoder = LabelEncoder()
    data[col] = encoder.fit_transform(data[col])
    encoders[col] = encoder

# ============================================================
# 7. FEATURE – TARGET SPLIT
# ============================================================

# The label column is the target; every remaining column is a feature.
y = data["default"]
X = data.drop(columns="default")

# ============================================================
# 8. TRAIN – TEST SPLIT (STRATIFIED)
# ============================================================

# 80/20 split with a fixed seed, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# ============================================================
# 9. FEATURE SCALING
# ============================================================

# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# ============================================================
# 10. BASELINE MODEL – LOGISTIC REGRESSION
# ============================================================

# Baseline classifier with class weighting and a raised iteration cap.
lr = LogisticRegression(max_iter=1000, class_weight="balanced")
lr.fit(X_train, y_train)

# Hard labels, plus the probability of the second class in lr.classes_
# (class 1 for a 0/1 target).
y_pred_lr = lr.predict(X_test)
y_prob_lr = lr.predict_proba(X_test)[:, 1]

print("\n===== LOGISTIC REGRESSION =====")
# Each metric is computed and printed in turn; ROC-AUC is scored on
# probabilities, the others on hard labels.
for heading, metric, scores in (
    ("Accuracy:", accuracy_score, y_pred_lr),
    ("ROC-AUC:", roc_auc_score, y_prob_lr),
    ("Confusion Matrix:\n", confusion_matrix, y_pred_lr),
    ("Classification Report:\n", classification_report, y_pred_lr),
):
    print(heading, metric(y_test, scores))

# ============================================================
# 11. ADVANCED MODEL – RANDOM FOREST
# ============================================================

# Class-weighted forest of 200 depth-limited trees with a fixed seed.
rf = RandomForestClassifier(
    n_estimators=200, max_depth=10, random_state=42, class_weight="balanced"
)
rf.fit(X_train, y_train)

# Hard labels, plus the probability of the second class in rf.classes_
# (class 1 for a 0/1 target).
y_pred_rf = rf.predict(X_test)
y_prob_rf = rf.predict_proba(X_test)[:, 1]

print("\n===== RANDOM FOREST =====")
# Each metric is computed and printed in turn; ROC-AUC is scored on
# probabilities, the others on hard labels.
for heading, metric, scores in (
    ("Accuracy:", accuracy_score, y_pred_rf),
    ("ROC-AUC:", roc_auc_score, y_prob_rf),
    ("Confusion Matrix:\n", confusion_matrix, y_pred_rf),
    ("Classification Report:\n", classification_report, y_pred_rf),
):
    print(heading, metric(y_test, scores))

# ============================================================
# 12. ROC CURVE COMPARISON
# ============================================================

# ROC points for each model, computed from its positive-class probabilities.
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_prob_lr)
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_prob_rf)

plt.figure(figsize=(8, 6))
for fpr, tpr, model_name in (
    (fpr_lr, tpr_lr, "Logistic Regression"),
    (fpr_rf, tpr_rf, "Random Forest"),
):
    plt.plot(fpr, tpr, label=model_name)
# Dashed diagonal as the chance-level reference line.
plt.plot([0, 1], [0, 1], "k--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve Comparison")
plt.legend()
plt.show()

# ============================================================
# 13. FEATURE IMPORTANCE (RANDOM FOREST)
# ============================================================

# Pair each feature name with the forest's importance score, largest first.
feature_importance = pd.DataFrame(
    {"Feature": X.columns, "Importance": rf.feature_importances_}
)
feature_importance = feature_importance.sort_values(
    by="Importance", ascending=False
)

print("\nTop Important Features:")
print(feature_importance.head(10))

# ============================================================
# 14. PREDICTION ON NEW APPLICANT
# ============================================================

# Example input: one raw (unscaled) value per feature, in the same order as
# X.columns. Categorical features must be given as the integer codes used
# during training. Adjust the values to match the dataset's columns.
new_customer_values = [35, 60000, 250000, 710, 6, 36, 11.2]

# Fail early with a readable message when the example does not line up with
# the features the models were trained on.
if len(new_customer_values) != X.shape[1]:
    raise ValueError(
        f"new applicant has {len(new_customer_values)} values but the model "
        f"was trained on {X.shape[1]} features: {list(X.columns)}"
    )

# Wrap the values in a labelled one-row frame: the scaler was fitted on a
# DataFrame, and passing a bare array would trigger a feature-name warning.
new_customer = pd.DataFrame([new_customer_values], columns=X.columns)
new_customer = scaler.transform(new_customer)

prediction = rf.predict(new_customer)

print("\nPrediction for New Applicant:")
print("Defaulter" if prediction[0] == 1 else "Non-Defaulter")

# Notebook output: ModuleNotFoundError: No module named 'seaborn'