In [79]:
from kaggle.api.kaggle_api_extended import KaggleApi
import os

# Authenticate Kaggle API
api = KaggleApi()
api.authenticate()

# Correct dataset name
dataset_name = 'bhavikjikadara/loan-status-prediction'  # Replace with actual dataset name

# Define the download path (use raw string for Windows path)
download_path = r"D:\ML_Projects\Loan_Status_Prediction"

# Create the folder if it doesn’t exist
os.makedirs(download_path, exist_ok=True)

# Download and unzip the dataset
api.dataset_download_files(dataset_name, path=download_path, unzip=True)

print(f"Dataset downloaded successfully to: {download_path}")

Dataset URL: https://www.kaggle.com/datasets/bhavikjikadara/loan-status-prediction
Dataset downloaded successfully to: D:\ML_Projects\Loan_Status_Prediction


In [80]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix
import joblib

# Load the dataset
df = pd.read_csv('D:\\ML_Projects\\Loan_Status_Prediction\\loan_data.csv')
df = df.drop(['Loan_ID'], axis=1)

df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
1,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
2,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
3,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
4,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y


In [81]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381 entries, 0 to 380
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             376 non-null    object 
 1   Married            381 non-null    object 
 2   Dependents         373 non-null    object 
 3   Education          381 non-null    object 
 4   Self_Employed      360 non-null    object 
 5   ApplicantIncome    381 non-null    int64  
 6   CoapplicantIncome  381 non-null    float64
 7   LoanAmount         381 non-null    float64
 8   Loan_Amount_Term   370 non-null    float64
 9   Credit_History     351 non-null    float64
 10  Property_Area      381 non-null    object 
 11  Loan_Status        381 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 35.8+ KB


In [82]:
df.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,381.0,381.0,381.0,370.0,351.0
mean,3579.845144,1277.275381,104.986877,340.864865,0.837607
std,1419.813818,2340.818114,28.358464,68.549257,0.369338
min,150.0,0.0,9.0,12.0,0.0
25%,2600.0,0.0,90.0,360.0,1.0
50%,3333.0,983.0,110.0,360.0,1.0
75%,4288.0,2016.0,127.0,360.0,1.0
max,9703.0,33837.0,150.0,480.0,1.0


In [83]:
# Separate features and target
X = df.drop(['Loan_Status'], axis=1)
y = df['Loan_Status'].map({'N': 0, 'Y': 1})  # Convert target to binary numeric

In [84]:
df.isnull().sum()


Gender                5
Married               0
Dependents            8
Education             0
Self_Employed        21
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term     11
Credit_History       30
Property_Area         0
Loan_Status           0
dtype: int64

In [85]:
missing_values = df.isnull().sum()
missing_percentage = (missing_values / len(df)) * 100
print(missing_percentage)

Gender               1.312336
Married              0.000000
Dependents           2.099738
Education            0.000000
Self_Employed        5.511811
ApplicantIncome      0.000000
CoapplicantIncome    0.000000
LoanAmount           0.000000
Loan_Amount_Term     2.887139
Credit_History       7.874016
Property_Area        0.000000
Loan_Status          0.000000
dtype: float64


In [86]:
# Define categorical and numerical columns
categorical_cols = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']
numerical_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']
# Define preprocessing steps
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))  # OneHotEncoder avoids unseen category issues
])

In [87]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from xgboost import XGBClassifier
# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)


In [88]:


# Define classifiers
classifiers = [
    (GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42), 'Gradient Boosting'),
    (DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=42), 'Decision Tree'),
    (RandomForestClassifier(n_estimators=100, max_depth=3, random_state=42), 'Random Forest'),
    (XGBClassifier(n_estimators=100, max_depth=3, random_state=42), 'XGBoost')
]

In [89]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Evaluate each classifier
for clf, name in classifiers:
    clf_pipeline = Pipeline([('preprocessor', preprocessor), ('classifier', clf)])
    clf_pipeline.fit(X_train, y_train)
    x_pred = clf_pipeline.predict(X_train)
    accuracy = accuracy_score(y_train, x_pred)
    print(f"{name} Training Accuracy: {accuracy:.4f}")

    y_pred = clf_pipeline.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    roc_score = roc_auc_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    
    print(f"{name} Accuracy: {accuracy:.4f}")
    print(f"{name} ROC AUC Score: {roc_score:.4f}")
    print(f"{name} Confusion Matrix:\n{cm}\n")

joblib.dump(clf_pipeline, "D:/ML_Projects/Loan_Status_Prediction/loan_pipeline.pkl")
print("Pipeline saved successfully!")

Gradient Boosting Training Accuracy: 0.9638
Gradient Boosting Accuracy: 0.8701
Gradient Boosting ROC AUC Score: 0.7864
Gradient Boosting Confusion Matrix:
[[13  9]
 [ 1 54]]

Decision Tree Training Accuracy: 0.8586
Decision Tree Accuracy: 0.8442
Decision Tree ROC AUC Score: 0.7409
Decision Tree Confusion Matrix:
[[11 11]
 [ 1 54]]

Random Forest Training Accuracy: 0.8487
Random Forest Accuracy: 0.8442
Random Forest ROC AUC Score: 0.7273
Random Forest Confusion Matrix:
[[10 12]
 [ 0 55]]

XGBoost Training Accuracy: 0.9967
XGBoost Accuracy: 0.8701
XGBoost ROC AUC Score: 0.7727
XGBoost Confusion Matrix:
[[12 10]
 [ 0 55]]

Pipeline saved successfully!


In [90]:
import joblib
import pandas as pd

# Load the saved pipeline
pipeline = joblib.load("D:/ML_Projects/Loan_Status_Prediction/loan_pipeline.pkl")

# Sample data for 17 rows
sample_data = {
    'Gender': ['Male', 'Female', 'Male', 'Male', 'Male', 'Male', 'Male', 'Male', 'Male', 'Male',
               'Male', 'Male', 'Female', 'Male', 'Male', 'Male', 'Male'],
    'Married': ['Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes',
                'Yes', 'Yes', 'No', 'No', 'No', 'Yes', 'No'],
    'Dependents': ['1', '1', '1', '2', '1', '2', '0', '0', '2', '1',
                   '1', '0', '2', '0', '0', '3+', '0'],
    'Education': ['Graduate', 'Graduate', 'Graduate', 'Graduate', 'Not Graduate', 'Graduate', 'Not Graduate',
                  'Graduate', 'Graduate', 'Not Graduate', 'Graduate', 'Graduate', 'Graduate', 'Not Graduate',
                  'Not Graduate', 'Graduate', 'Graduate'],
    'Self_Employed': ['No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'Yes', 'No', 'No',
                      'No', 'No', 'No', 'Yes', 'No', 'No', 'No'],
    'ApplicantIncome': [3155, 3812, 3315, 5819, 2510, 6250, 3406, 6050, 9703, 6608,
                        2882, 1809, 3427, 2583, 6045, 5250, 2060],
    'CoapplicantIncome': [1779.0, 0.0, 0.0, 5000.0, 1983.0, 1300.0, 4417.0, 4333.0, 0.0, 0.0,
                          1843.0, 1868.0, 0.0, 2167.0, 0.0, 0.0, 2209.0],
    'LoanAmount': [140.0, 112.0, 96.0, 120.0, 140.0, 108.0, 123.0, 120.0, 112.0, 137.0,
                   123.0, 90.0, 138.0, 104.0, 115.0, 94.0, 134.0],
    'Loan_Amount_Term': [360.0, 360.0, 360.0, 360.0, 180.0, 360.0, 360.0, 180.0, 360.0, 180.0,
                         480.0, 360.0, 360.0, 360.0, 360.0, 360.0, 360.0],
    'Credit_History': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
                       1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0],
    'Property_Area': ['Semiurban', 'Rural', 'Semiurban', 'Rural', 'Urban', 'Rural', 'Semiurban', 'Urban',
                      'Urban', 'Urban', 'Semiurban', 'Urban', 'Urban', 'Rural', 'Rural', 'Urban', 'Semiurban']
}

# Convert to DataFrame
sample_df = pd.DataFrame(sample_data)
print(sample_df['ApplicantIncome'])
# Make predictions
predictions = pipeline.predict(sample_df)

# Output predictions
for i, prediction in enumerate(predictions):
    print(f"Row {i+1} Predicted Loan Status: {'Approved' if prediction == 1 else 'Not Approved'}")


0     3155
1     3812
2     3315
3     5819
4     2510
5     6250
6     3406
7     6050
8     9703
9     6608
10    2882
11    1809
12    3427
13    2583
14    6045
15    5250
16    2060
Name: ApplicantIncome, dtype: int64
Row 1 Predicted Loan Status: Approved
Row 2 Predicted Loan Status: Approved
Row 3 Predicted Loan Status: Approved
Row 4 Predicted Loan Status: Approved
Row 5 Predicted Loan Status: Not Approved
Row 6 Predicted Loan Status: Approved
Row 7 Predicted Loan Status: Approved
Row 8 Predicted Loan Status: Not Approved
Row 9 Predicted Loan Status: Approved
Row 10 Predicted Loan Status: Approved
Row 11 Predicted Loan Status: Approved
Row 12 Predicted Loan Status: Approved
Row 13 Predicted Loan Status: Not Approved
Row 14 Predicted Loan Status: Approved
Row 15 Predicted Loan Status: Not Approved
Row 16 Predicted Loan Status: Not Approved
Row 17 Predicted Loan Status: Approved


In [97]:
import joblib
import pandas as pd

# Load the saved pipeline
pipeline = joblib.load("D:/ML_Projects/Loan_Status_Prediction/loan_pipeline.pkl")

# Sample data
sample_data = {
    'Gender': ['Male', 'Female', 'Male', 'Male', 'Male', 'Male', 'Male', 'Male', 'Male', 'Male',
               'Male', 'Male', 'Female', 'Male', 'Male', 'Male', 'Male'],
    'Married': ['Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes',
                'Yes', 'Yes', 'No', 'No', 'No', 'Yes', 'No'],
    'Dependents': ['1', '1', '1', '2', '1', '2', '0', '0', '2', '1',
                   '1', '0', '2', '0', '0', '3+', '0'],
    'Education': ['Graduate', 'Graduate', 'Graduate', 'Graduate', 'Not Graduate', 'Graduate', 'Not Graduate',
                  'Graduate', 'Graduate', 'Not Graduate', 'Graduate', 'Graduate', 'Graduate', 'Not Graduate',
                  'Not Graduate', 'Graduate', 'Graduate'],
    'Self_Employed': ['No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'Yes', 'No', 'No',
                      'No', 'No', 'No', 'Yes', 'No', 'No', 'No'],
    'ApplicantIncome': [3155, 3812, 3315, 5819, 2510, 6250, 3406, 6050, 9703, 6608,
                        2882, 1809, 3427, 2583, 6045, 5250, 2060],
    'CoapplicantIncome': [1779.0, 0.0, 0.0, 5000.0, 1983.0, 1300.0, 4417.0, 4333.0, 0.0, 0.0,
                          1843.0, 1868.0, 0.0, 2167.0, 0.0, 0.0, 2209.0],
    'LoanAmount': [140.0, 112.0, 96.0, 120.0, 140.0, 108.0, 123.0, 120.0, 112.0, 137.0,
                   123.0, 90.0, 138.0, 104.0, 115.0, 94.0, 134.0],
    'Loan_Amount_Term': [360.0, 360.0, 360.0, 360.0, 180.0, 360.0, 360.0, 180.0, 360.0, 180.0,
                         480.0, 360.0, 360.0, 360.0, 360.0, 360.0, 360.0],
    'Credit_History': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
                       1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0],
    'Property_Area': ['Semiurban', 'Rural', 'Semiurban', 'Rural', 'Urban', 'Rural', 'Semiurban', 'Urban',
                      'Urban', 'Urban', 'Semiurban', 'Urban', 'Urban', 'Rural', 'Rural', 'Urban', 'Semiurban']
}

# Convert to DataFrame
sample_df = pd.DataFrame(sample_data)

# Make predictions
predictions = pipeline.predict(sample_df)

# Map predictions to labels
sample_df["Predicted_Loan_Status"] = ["Approved" if pred == 1 else "Not Approved" for pred in predictions]

print(sample_df[["ApplicantIncome", "Predicted_Loan_Status"]])



    ApplicantIncome Predicted_Loan_Status
0              3155              Approved
1              3812              Approved
2              3315              Approved
3              5819              Approved
4              2510          Not Approved
5              6250              Approved
6              3406              Approved
7              6050          Not Approved
8              9703              Approved
9              6608              Approved
10             2882              Approved
11             1809              Approved
12             3427          Not Approved
13             2583              Approved
14             6045          Not Approved
15             5250          Not Approved
16             2060              Approved
