In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install --upgrade scikit-learn imbalanced-learn --quiet


# Loan Approval Prediction Project

## Importing Libraries

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, confusion_matrix, roc_auc_score, 
                             classification_report, precision_score, recall_score, f1_score, RocCurveDisplay)
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

## Load the dataset

In [None]:
# Read the loan-approval CSV from the Kaggle input directory into a DataFrame.
dataset_csv = '/kaggle/input/loan-approval-prediction-dataset/loan_approval_dataset.csv'
data = pd.read_csv(filepath_or_buffer=dataset_csv)

In [None]:
# Remove leading/trailing whitespace from every column name so later cells can
# refer to columns by their clean names.
data = data.rename(columns=str.strip)

In [None]:
# Report the table dimensions as a (rows, columns) tuple.
dataset_shape = data.shape
print("Dataset Shape:", dataset_shape)


In [None]:
# Render the first rows of the dataset as a rich table for a quick sanity check.
display(data.head())

## 1. Data Preprocessing

In [None]:
# Print the DataFrame summary (column names, non-null counts, dtypes).
data.info()

In [None]:
# Show descriptive statistics of the dataset's columns as the cell output.
data.describe()

In [None]:
# Count the missing (NaN) entries in each column and print the per-column totals.
missing_counts = data.isna().sum()
print("\nMissing Values:\n", missing_counts)

In [None]:
# --- Encode categorical variables ---
# Fit a separate LabelEncoder for every object-typed column and keep each one in
# `label_encoders`. Re-fitting a single shared encoder would leave it fitted on
# the last column only, so the label <-> integer mapping of all other columns
# would be unrecoverable. With the dict, a code can be mapped back through
# label_encoders[col].classes_ or label_encoders[col].inverse_transform(...).
# NOTE: `data` is encoded in place; re-running this cell on already-encoded data
# finds no object columns and resets `label_encoders` to an empty dict, so
# re-load the dataset before re-running.
label_encoders = {}
for col in data.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

In [None]:
# --- Target variable ---
# The label to predict is `loan_status`; every other column except the
# `loan_id` column is used as a feature.
target = 'loan_status'
y = data[target]
X = data.drop(columns=['loan_id', target])

In [None]:
# --- Train-test split ---
# Hold out 30% of the rows for testing; stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# --- Standardize numeric features ---
# The scaler learns its statistics from the training partition only and is then
# applied unchanged to both partitions, so no information from the test set
# reaches the fitted scaler.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

## 2. Exploratory Data Analysis

In [None]:
# --- Class Distribution ---
# Bar chart of how many rows fall into each value of the target.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x=y, ax=ax)
ax.set_title("Loan Status Distribution")
plt.show()

In [None]:
# --- Correlation Heatmap ---
# Pairwise correlations between all columns of `data`, annotated to two decimals.
correlations = data.corr()
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(correlations, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()

In [None]:
# --- Pairplot of selected features ---
# Scatter/density grid for three features, coloured by the target column.
pairplot_columns = ['income_annum', 'loan_amount', 'cibil_score', 'loan_status']
pairplot_frame = data[pairplot_columns]
sns.pairplot(pairplot_frame, hue="loan_status")
plt.show()

## 3. Model Training Function

In [None]:
def train_evaluate(model, X_train, y_train, X_test, y_test, model_name):
    """Fit ``model``, print its hold-out metrics and plot its diagnostics.

    The model is fitted on the training data, then evaluated on the test data.
    Accuracy, precision, recall, F1 and ROC AUC are printed as percentages,
    followed by the classification report, a confusion-matrix heatmap and the
    ROC curve.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict``. If it also exposes
        ``predict_proba`` or ``decision_function``, that output is used as the
        continuous score for ROC AUC.
    X_train, y_train
        Training features and labels passed to ``model.fit``.
    X_test, y_test
        Held-out features and labels used for every reported metric.
    model_name : str
        Label used in the printed header and in the plot titles.

    Returns
    -------
    dict
        Metric name -> value in percent, using the same names that are printed
        ("Accuracy", "Precision", "Recall", "F1 Score", "ROC AUC Score").

    Notes
    -----
    Precision, recall and F1 use scikit-learn's default positive label (1).
    The confusion-matrix tick labels are hard-coded as ['Approved', 'Rejected'],
    i.e. they assume class 0 is "Approved" and class 1 is "Rejected" --
    TODO confirm against the target encoding.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # ROC AUC is a ranking metric and needs a continuous score. Feeding it the
    # hard 0/1 predictions collapses the curve to a single operating point and
    # reports a different (lower-information) number than the plotted ROC curve.
    # The lookup order mirrors RocCurveDisplay.from_estimator's "auto" mode.
    if hasattr(model, "predict_proba"):
        # Column 1 holds the probability of the class with the greater label.
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        y_score = model.decision_function(X_test)
    else:
        # No continuous output available: fall back to the hard predictions.
        y_score = y_pred

    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred) * 100,
        "Precision": precision_score(y_test, y_pred) * 100,
        "Recall": recall_score(y_test, y_pred) * 100,
        "F1 Score": f1_score(y_test, y_pred) * 100,
        "ROC AUC Score": roc_auc_score(y_test, y_score) * 100,
    }

    print(f"\n==================== {model_name} ====================")
    for metric_name, metric_value in metrics.items():
        print(f"{metric_name}:", metric_value)
    print("\nClassification Report:\n", classification_report(y_test, y_pred))

    # --- Confusion Matrix ---
    # Rows of confusion_matrix are true classes, columns are predicted classes.
    cm = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots(figsize=(5, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Approved', 'Rejected'],
                yticklabels=['Approved', 'Rejected'],
                ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title(f'Confusion Matrix - {model_name}')
    plt.show()

    # --- ROC Curve ---
    roc_display = RocCurveDisplay.from_estimator(model, X_test, y_test)
    roc_display.ax_.set_title(f'ROC Curve - {model_name}')
    plt.show()

    return metrics


## 4. Models to Compare

In [None]:
# Candidate classifiers as (display name, unfitted estimator) pairs, in the
# order they are trained and reported. Every estimator uses the same seed.
models = list({
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'SVM': SVC(probability=True, random_state=42),
}.items())


In [None]:

# --- Train & Evaluate ---
# Fit each candidate on the training partition and report its hold-out results.
for display_name, estimator in models:
    train_evaluate(
        estimator,
        X_train,
        y_train,
        X_test,
        y_test,
        model_name=display_name,
    )

## 5. Cross-validation (Bonus)

In [None]:
# 5-fold cross-validated F1 for every candidate model.
# `X` holds the raw (unscaled) features, whereas the hold-out evaluation used
# standardized ones. Wrapping each model in a pipeline with StandardScaler
# makes the scaler fit on the training folds only and applies it to the
# validation fold, so scale-sensitive models (logistic regression, SVM) are
# evaluated the same way as in the hold-out run and without leakage.
# cross_val_score clones the pipeline for each fold, so the already-fitted
# estimators in `models` are left untouched.
for name, model in models:
    cv_pipeline = make_pipeline(StandardScaler(), model)
    scores = cross_val_score(cv_pipeline, X, y, cv=5, scoring='f1')
    print(f"{name} - Average F1 Score (CV=5): {scores.mean():.4f}")