In [177]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

In [113]:
df = pd.read_csv("Loan_prediction_dataset.csv")

In [123]:
df.head(10)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y
6,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y
7,LP001014,Male,Yes,3,Graduate,No,3036,2504.0,158.0,360.0,0.0,Semiurban,N
8,LP001018,Male,Yes,2,Graduate,No,4006,1526.0,168.0,360.0,1.0,Urban,Y
9,LP001020,Male,Yes,1,Graduate,No,12841,10968.0,349.0,360.0,1.0,Semiurban,N


In [140]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         614 non-null    int64  
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    int64  
 8   LoanAmount         614 non-null    int64  
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     614 non-null    int64  
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(1), int64(5), object(7)
memory usage: 62.5+ KB


In [146]:
df.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents            0
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term      0
Credit_History        0
Property_Area         0
Loan_Status           0
dtype: int64

In [117]:
df['Dependents'] = df['Dependents'].str.replace('+', '', regex=False)

In [120]:
df['Dependents'] = pd.to_numeric(df['Dependents'], errors='coerce')  
df['Dependents'] = df['Dependents'].fillna(df['Dependents'].median())  

In [122]:
df['Dependents'] = df['Dependents'].astype(int)

In [57]:
df['LoanAmount'] = pd.to_numeric(df['LoanAmount'], errors='coerce')

array([0, 1, 2, 3])

In [127]:
df['LoanAmount'] = df['LoanAmount'].fillna(df['LoanAmount'].median())

In [128]:
df['LoanAmount'] = df['LoanAmount'].astype(int)

In [131]:
df['CoapplicantIncome'] = df['CoapplicantIncome'].astype(int)

In [133]:
df['Credit_History'] = pd.to_numeric(df['Credit_History'], errors='coerce')

In [138]:
df['Credit_History'] = df['Credit_History'].fillna(df['Credit_History'].median())

In [139]:
df['Loan_Amount_Term'] = pd.to_numeric(df['Loan_Amount_Term'], errors='coerce')

In [147]:
df = df.drop(columns='Loan_ID', axis=1)

In [148]:
df = df.dropna()

In [None]:
df['Credit_History'] = df['Credit_History'].astype(int)

In [None]:
df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].median())

In [None]:
df['Loan_Amount_Term'] = df['Loan_Amount_Term'].astype(int)

In [149]:
le = LabelEncoder()

In [150]:
for col in ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status']:
    if col in df.columns:
        df[col] = le.fit_transform(df[col])

In [185]:
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,0,0,0,0,5849,0,128,360,1,2,1
1,1,1,1,0,0,4583,1508,128,360,1,0,0
2,1,1,0,0,1,3000,0,66,360,1,2,1
3,1,1,0,1,0,2583,2358,120,360,1,2,1
4,1,0,0,0,0,6000,0,141,360,1,2,1


In [152]:
X = df.drop(columns='Loan_Status', axis=1)
y = df['Loan_Status']

In [153]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

In [154]:
smote = SMOTE()
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [156]:
pipeline = Pipeline([
    ('ss', StandardScaler()),
    ('rfc', RandomForestClassifier())
])

In [157]:
params = {
    'rfc__n_estimators': range(25, 100, 10),
    'rfc__max_depth': range(10, 50, 10)
}

In [158]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=30)

In [159]:
model = GridSearchCV(
    pipeline,
    param_grid = params,
    cv = cv,
    n_jobs = 5, 
    verbose = 1
)

In [160]:
model.fit(X_train_smote, y_train_smote)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


In [161]:
print(classification_report(y_test, model.predict(X_test)))

              precision    recall  f1-score   support

           0       0.74      0.50      0.60        34
           1       0.81      0.93      0.87        80

    accuracy                           0.80       114
   macro avg       0.78      0.71      0.73       114
weighted avg       0.79      0.80      0.79       114



In [162]:
pipeline1 = Pipeline([
    ('ss', StandardScaler()),
    ('dtc', DecisionTreeClassifier())
])

In [163]:
params1 = {
    'dtc__max_depth': range(10, 50, 10),
    'dtc__min_samples_split': range(25, 100, 10),
    'dtc__min_samples_leaf': range(25, 100, 10)
}

In [164]:
model1 = GridSearchCV(
    pipeline1,
    param_grid = params1,
    cv = cv,
    n_jobs = 5, 
    verbose = 1
)

In [165]:
model1.fit(X_train_smote, y_train_smote)

Fitting 5 folds for each of 256 candidates, totalling 1280 fits


In [174]:
model1.score(X_train_smote, y_train_smote)

0.8198051948051948

In [166]:
print(classification_report(y_test, model1.predict(X_test)))

              precision    recall  f1-score   support

           0       0.83      0.44      0.58        34
           1       0.80      0.96      0.88        80

    accuracy                           0.81       114
   macro avg       0.82      0.70      0.73       114
weighted avg       0.81      0.81      0.79       114



In [170]:
pipeline2 = Pipeline([
    ('ss', StandardScaler()),
    ('svc', SVC())
])

In [171]:
params2 = {
    'svc__C': range(10, 50, 10),
    'svc__kernel': ['rbf']
}

In [172]:
model2 = GridSearchCV(
    pipeline2,
    param_grid = params2,
    cv = cv,
    n_jobs = 5, 
    verbose = 1
)

In [173]:
model2.fit(X_train_smote, y_train_smote)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


In [175]:
model2.score(X_train_smote, y_train_smote)

0.900974025974026

In [176]:
print(classification_report(y_test, model2.predict(X_test)))

              precision    recall  f1-score   support

           0       0.65      0.44      0.53        34
           1       0.79      0.90      0.84        80

    accuracy                           0.76       114
   macro avg       0.72      0.67      0.68       114
weighted avg       0.75      0.76      0.75       114



In [178]:
pipeline3 = Pipeline([
    ('ss', StandardScaler()),
    ('xgb', XGBClassifier())
])

In [180]:
params3 = {
    'xgb__n_estimators': range(25, 100, 10),
    'xgb__max_depth': range(10, 50, 10)
}

In [181]:
model3 = GridSearchCV(
    pipeline3,
    param_grid = params3,
    cv = cv,
    n_jobs = 5, 
    verbose = 1
)

In [182]:
model3.fit(X_train_smote, y_train_smote)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


In [183]:
model3.score(X_train_smote, y_train_smote)

1.0

In [184]:
print(classification_report(y_test, model3.predict(X_test)))

              precision    recall  f1-score   support

           0       0.68      0.56      0.61        34
           1       0.83      0.89      0.86        80

    accuracy                           0.79       114
   macro avg       0.75      0.72      0.73       114
weighted avg       0.78      0.79      0.78       114

