<a href="https://colab.research.google.com/github/S-Shivaprasad/Loan_Approval_Prediction/blob/main/Loan_Approval_Prediction_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 📚 Importing Required Libraries


In [46]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingClassifier

## 📂 Loading the Dataset


In [47]:
df=pd.read_csv("/content/train_u6lujuX_CVtuZ9i.csv")

## 🔍 Exploratory Data Analysis (EDA)
### - Checking dataset shape, head, info, and missing values


In [48]:
df.shape

(614, 13)

In [49]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [51]:
df.isnull().sum()

Unnamed: 0,0
Loan_ID,0
Gender,13
Married,3
Dependents,15
Education,0
Self_Employed,32
ApplicantIncome,0
CoapplicantIncome,0
LoanAmount,22
Loan_Amount_Term,14


In [52]:
# 🔧 Create Total_Income = ApplicantIncome + CoapplicantIncome
df['Total_Income'] = df['ApplicantIncome'] + df['CoapplicantIncome']
df.drop(['Loan_ID', 'ApplicantIncome', 'CoapplicantIncome'], axis=1, inplace=True)

In [53]:
# 🔍 Handle missing values
df['Gender'] = df['Gender'].fillna(df['Gender'].mode()[0])
df['Married'] = df['Married'].fillna(df['Married'].mode()[0])
df['Dependents'] = df['Dependents'].fillna(df['Dependents'].mode()[0])
df['Self_Employed'] = df['Self_Employed'].fillna(df['Self_Employed'].mode()[0])
df['LoanAmount'] = df['LoanAmount'].fillna(df['LoanAmount'].median())
df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].mode()[0])
df['Credit_History'] = df['Credit_History'].fillna(df['Credit_History'].mode()[0])

In [54]:
df.isnull().sum()

Unnamed: 0,0
Gender,0
Married,0
Dependents,0
Education,0
Self_Employed,0
LoanAmount,0
Loan_Amount_Term,0
Credit_History,0
Property_Area,0
Loan_Status,0


In [55]:
# 🎯 Encode categorical variables
categorical_cols = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status']
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le


In [56]:
# 🔄 Split features and label
X = df.drop('Loan_Status', axis=1)
y = df['Loan_Status']

In [57]:
df['Loan_Status'].value_counts()

Unnamed: 0_level_0,count
Loan_Status,Unnamed: 1_level_1
1,422
0,192


In [58]:
# ⚖️ Handle imbalance using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [59]:
# 🔧 Scale features
scaler = StandardScaler()
X_resampled = scaler.fit_transform(X_resampled)

## 🧪 Train-Test Split


In [60]:
# 🚀 Split into train/test
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)


## 📊 Model Evaluation
### - Accuracy, Precision, Recall, F1-score
### - Confusion Matrix


In [61]:
# 🧠 Define models
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "SVM": SVC(kernel='rbf', probability=True)
}

# 📊 Train & Evaluate
for name, model in models.items():
    print(f"\n🔹 Training {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

    print(f"📈 Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")
    print(f"📃 Classification Report:\n{classification_report(y_test, y_pred)}")

    if y_proba is not None:
        print(f"🎯 ROC-AUC Score: {roc_auc_score(y_test, y_proba):.4f}")


🔹 Training Logistic Regression...
📈 Confusion Matrix:
[[60 34]
 [10 65]]
📃 Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.64      0.73        94
           1       0.66      0.87      0.75        75

    accuracy                           0.74       169
   macro avg       0.76      0.75      0.74       169
weighted avg       0.77      0.74      0.74       169

🎯 ROC-AUC Score: 0.7878

🔹 Training Random Forest...
📈 Confusion Matrix:
[[68 26]
 [10 65]]
📃 Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.72      0.79        94
           1       0.71      0.87      0.78        75

    accuracy                           0.79       169
   macro avg       0.79      0.80      0.79       169
weighted avg       0.80      0.79      0.79       169

🎯 ROC-AUC Score: 0.8228

🔹 Training XGBoost...
📈 Confusion Matrix:
[[69 25]
 [15 60]]
📃 Classification Report:
              preci

Parameters: { "use_label_encoder" } are not used.



📃 Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.59      0.71        94
           1       0.64      0.91      0.75        75

    accuracy                           0.73       169
   macro avg       0.76      0.75      0.73       169
weighted avg       0.78      0.73      0.72       169

🎯 ROC-AUC Score: 0.7753


## 🧠 Model Training and Hyperparameter Tuning
### - Logistic Regression
### - Random Forest
### - XGBoost


In [62]:
# ----------------------
# Logistic Regression
# ----------------------
logreg = LogisticRegression(solver='liblinear', random_state=42)
logreg_params = {
    'C': [0.01, 0.1, 1],             # Inverse of regularization strength
    'penalty': ['l1', 'l2']          # L1 = Lasso, L2 = Ridge
}

logreg_grid = GridSearchCV(logreg, logreg_params, cv=3, scoring='f1', n_jobs=-1)
logreg_grid.fit(X_train, y_train)

# ----------------------
# Random Forest
# ----------------------
rf = RandomForestClassifier(random_state=42)
rf_params = {
    'n_estimators': [100, 150],
    'max_depth': [5, 10],
    'min_samples_split': [2, 5]
}

rf_grid = GridSearchCV(rf, rf_params, cv=3, scoring='f1', n_jobs=-1)
rf_grid.fit(X_train, y_train)

# ----------------------
# XGBoost
# ----------------------
xgb = XGBClassifier(eval_metric="logloss")
xgb_params = {
    'n_estimators': [50, 100],
    'max_depth': [3, 5],
    'learning_rate': [0.05, 0.1],
    'reg_alpha': [0, 0.1, 1],       # L1 regularization
    'reg_lambda': [1, 5, 10]        # L2 regularization
}

xgb_grid = GridSearchCV(xgb, xgb_params, cv=3, scoring='f1', n_jobs=-1)
xgb_grid.fit(X_train, y_train)



In [63]:
# ----------------------
# 📢 Print Best Results
# ----------------------
print("✅ Best Logistic Regression:", logreg_grid.best_params_)
print("✅ Best Random Forest:", rf_grid.best_params_)
print("✅ Best XGBoost:", xgb_grid.best_params_)

# Store Best Models
best_logreg = logreg_grid.best_estimator_
best_rf = rf_grid.best_estimator_
best_xgb = xgb_grid.best_estimator_

✅ Best Logistic Regression: {'C': 0.1, 'penalty': 'l1'}
✅ Best Random Forest: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 150}
✅ Best XGBoost: {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 100, 'reg_alpha': 1, 'reg_lambda': 5}


## 📊 Model Evaluation
### - Accuracy, Precision, Recall, F1-score
### - Confusion Matrix


In [64]:
# Helper function to evaluate a model
def evaluate_model(model, X_test, y_test, name="Model"):
    print(f"\n🔹 Evaluating {name}...")

    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

    print("📈 Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    print("\n📃 Classification Report:")
    print(classification_report(y_test, y_pred))

    acc = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_prob) if y_prob is not None else "N/A"

    print(f"✅ Accuracy: {acc:.4f}")
    print(f"🎯 ROC-AUC Score: {roc_auc if roc_auc != 'N/A' else 'Not Available'}")

# Run evaluation for all tuned models
evaluate_model(best_logreg, X_test, y_test, "Logistic Regression")
evaluate_model(best_rf, X_test, y_test, "Random Forest")
evaluate_model(best_xgb, X_test, y_test, "XGBoost")



🔹 Evaluating Logistic Regression...
📈 Confusion Matrix:
[[54 40]
 [ 4 71]]

📃 Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.57      0.71        94
           1       0.64      0.95      0.76        75

    accuracy                           0.74       169
   macro avg       0.79      0.76      0.74       169
weighted avg       0.80      0.74      0.73       169

✅ Accuracy: 0.7396
🎯 ROC-AUC Score: 0.7904964539007093

🔹 Evaluating Random Forest...
📈 Confusion Matrix:
[[63 31]
 [ 8 67]]

📃 Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.67      0.76        94
           1       0.68      0.89      0.77        75

    accuracy                           0.77       169
   macro avg       0.79      0.78      0.77       169
weighted avg       0.80      0.77      0.77       169

✅ Accuracy: 0.7692
🎯 ROC-AUC Score: 0.8272340425531916

🔹 Evaluating XGBoost...
📈 Confusion M

## 🤖 Ensemble Learning: Soft Voting Classifier
### - Combining best Random Forest and XGBoost


In [65]:
# Soft Voting Ensemble with best Random Forest & XGBoost models
voting_clf = VotingClassifier(
    estimators=[
        ('RandomForest', best_rf),
        ('XGBoost', best_xgb)
    ],
    voting='soft'  # Use predicted probabilities
)

# Train the ensemble on your training data
voting_clf.fit(X_train, y_train)


## ✅ Final Evaluation on Test Set


In [66]:
# Predict and evaluate
y_pred = voting_clf.predict(X_test)
y_prob = voting_clf.predict_proba(X_test)[:, 1]  # For ROC-AUC

print("📈 Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\n📃 Classification Report:")
print(classification_report(y_test, y_pred))

acc = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob)

print(f"✅ Accuracy: {acc:.4f}")
print(f"🎯 ROC-AUC Score: {roc_auc:.4f}")


📈 Confusion Matrix:
[[63 31]
 [ 7 68]]

📃 Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.67      0.77        94
           1       0.69      0.91      0.78        75

    accuracy                           0.78       169
   macro avg       0.79      0.79      0.77       169
weighted avg       0.81      0.78      0.77       169

✅ Accuracy: 0.7751
🎯 ROC-AUC Score: 0.8335
