#Loan Status Prediction using Machine Learning with Python

 ┌─────────────────────┐
 │   Loan Dataset CSV  │
 └─────────┬───────────┘
           ↓
 ┌─────────────────────┐
 │ Data Exploration    │
 │ (info, nulls, stats)│
 └─────────┬───────────┘
           ↓
 ┌──────────────────────────────┐
 │ Data Cleaning                │
 │ - Fix "3+" Dependents        │
 │ - Drop Loan_ID               │
 │ - Encode Loan_Status         │
 └─────────┬────────────────────┘
           ↓
 ┌──────────────────────────────┐
 │ Feature Separation           │
 │ X = Features                 │
 │ y = Loan_Status              │
 └─────────┬────────────────────┘
           ↓
 ┌──────────────────────────────┐
 │ Preprocessing Pipeline       │
 │                              │
 │  Numerical Features          │
 │  - Median Imputation         │
 │  - Standard Scaling          │
 │                              │
 │  Categorical Features        │
 │  - Mode Imputation           │
 │  - One-Hot Encoding          │
 └─────────┬────────────────────┘
           ↓
 ┌──────────────────────────────┐
 │ Train / Test Split           │
 │ (Stratified)                 │
 └─────────┬────────────────────┘
           ↓
 ┌──────────────────────────────┐
 │ SVM Model                    │
 │ + GridSearchCV               │
 │ (Hyperparameter Tuning)      │
 └─────────┬────────────────────┘
           ↓
 ┌──────────────────────────────┐
 │ Model Evaluation             │
 │ Accuracy, F1, CV Score       │
 └─────────┬────────────────────┘
           ↓
 ┌──────────────────────────────┐
 │ Loan Approval Prediction     │
 └──────────────────────────────┘


In [70]:
# Core libraries
import numpy as np
import pandas as pd

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# ML utilities
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [71]:
# Dataset path (update if needed)
path = "/content/drive/MyDrive/Colab Notebooks/MachineLearningCourseWithPython/data/train_u6lujuX_CVtuZ9i (1).csv"

loan_dataset = pd.read_csv(path)

# Preview dataset
loan_dataset.head()


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [72]:
loan_dataset.shape


(614, 13)

In [73]:
loan_dataset.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [74]:
loan_dataset.describe()


Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [75]:
# Check missing values
loan_dataset.isnull().sum()


Unnamed: 0,0
Loan_ID,0
Gender,13
Married,3
Dependents,15
Education,0
Self_Employed,32
ApplicantIncome,0
CoapplicantIncome,0
LoanAmount,22
Loan_Amount_Term,14


In [76]:
# Encode target variable
loan_dataset['Loan_Status'] = loan_dataset['Loan_Status'].map({'N': 0, 'Y': 1})

loan_dataset['Loan_Status'].value_counts()


Unnamed: 0_level_0,count
Loan_Status,Unnamed: 1_level_1
1,422
0,192


In [77]:
# Convert '3+' → 3 and cast to integer
loan_dataset['Dependents'] = loan_dataset['Dependents'].replace('3+', 3)

loan_dataset['Dependents'].value_counts()


Unnamed: 0_level_0,count
Dependents,Unnamed: 1_level_1
0,345
1,102
2,101
3,51


In [78]:
# Loan_ID has no predictive value
loan_dataset.drop(columns=['Loan_ID'], inplace=True)


In [79]:
X = loan_dataset.drop(columns=['Loan_Status'])
y = loan_dataset['Loan_Status']

print(X.shape, y.shape)


(614, 11) (614,)


In [80]:
numerical_features = [
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
    'Credit_History',
    'Dependents'
]

categorical_features = [
    'Gender',
    'Married',
    'Education',
    'Self_Employed',
    'Property_Area'
]


In [81]:
# Numerical pipeline
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical pipeline
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first'))
])


In [82]:
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipeline, numerical_features),
    ('cat', cat_pipeline, categorical_features)
])


In [83]:
model = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('classifier', SVC())
])


In [84]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y,
    random_state=42
)

X_train.shape, X_test.shape


((552, 11), (62, 11))

In [85]:
param_grid = {
    'classifier__kernel': ['rbf', 'linear'],
    'classifier__C': [0.1, 1, 10],
    'classifier__gamma': ['scale', 'auto']
}

grid_search = GridSearchCV(
    model,
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

grid_search.fit(X_train, y_train)


In [86]:
print("Best Parameters:", grid_search.best_params_)
print("Best CV Accuracy:", grid_search.best_score_)


Best Parameters: {'classifier__C': 0.1, 'classifier__gamma': 'scale', 'classifier__kernel': 'linear'}
Best CV Accuracy: 0.807960687960688


In [87]:
best_model = grid_search.best_estimator_

train_predictions = best_model.predict(X_train)
train_accuracy = accuracy_score(y_train, train_predictions)

print("Training Accuracy:", train_accuracy)


Training Accuracy: 0.8079710144927537


In [88]:
test_predictions = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions)

print("Test Accuracy:", test_accuracy)


Test Accuracy: 0.8225806451612904


In [89]:
print(classification_report(y_test, test_predictions))


              precision    recall  f1-score   support

           0       0.90      0.47      0.62        19
           1       0.81      0.98      0.88        43

    accuracy                           0.82        62
   macro avg       0.85      0.73      0.75        62
weighted avg       0.84      0.82      0.80        62



In [90]:
confusion_matrix(y_test, test_predictions)


array([[ 9, 10],
       [ 1, 42]])

In [91]:
cv_scores = cross_val_score(
    best_model,
    X,
    y,
    cv=5,
    scoring='accuracy'
)

print("Cross-Validation Accuracy:", cv_scores.mean())


Cross-Validation Accuracy: 0.809462881514061


In [92]:
# Example input data (same order as X columns)
input_data = {
    'Gender': 'Male',
    'Married': 'Yes',
    'Dependents': 1,
    'Education': 'Graduate',
    'Self_Employed': 'No',
    'ApplicantIncome': 5000,
    'CoapplicantIncome': 0,
    'LoanAmount': 128,
    'Loan_Amount_Term': 360,
    'Credit_History': 1,
    'Property_Area': 'Urban'
}

input_df = pd.DataFrame([input_data])

prediction = best_model.predict(input_df)

if prediction[0] == 1:
    print("Loan Approved")
else:
    print("Loan Not Approved")


Loan Approved
