# READ THE DATASET

In [119]:
import pandas as pd

data = pd.read_csv('dataset/train_u6lujuX_CVtuZ9i.csv')
print(data.head())

    Loan_ID Gender Married Dependents     Education Self_Employed  \
0  LP001002   Male      No          0      Graduate            No   
1  LP001003   Male     Yes          1      Graduate            No   
2  LP001005   Male     Yes          0      Graduate           Yes   
3  LP001006   Male     Yes          0  Not Graduate            No   
4  LP001008   Male      No          0      Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5849                0.0         NaN             360.0   
1             4583             1508.0       128.0             360.0   
2             3000                0.0        66.0             360.0   
3             2583             2358.0       120.0             360.0   
4             6000                0.0       141.0             360.0   

   Credit_History Property_Area Loan_Status  
0             1.0         Urban           Y  
1             1.0         Rural           N  
2             1.0   

Features

# Loan_ID : Unique Loan ID

# Gender : Male/ Female

# Married : Applicant married (Y/N)

# Dependents : Number of dependents

# Education : Applicant Education (Graduate/ Under Graduate)

# Self_Employed : Self employed (Y/N)

# ApplicantIncome : Applicant income

# CoapplicantIncome : Coapplicant income

# LoanAmount : Loan amount in thousands of dollars

# Loan_Amount_Term : Term of loan in months

# Credit_History : Credit history meets guidelines yes or no

# Property_Area : Urban/ Semi Urban/ Rural

# Loan_Status : Loan approved (Y/N) this is the target variable

In [120]:
data.tail()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y
613,LP002990,Female,No,0,Graduate,Yes,4583,0.0,133.0,360.0,0.0,Semiurban,N


# data.shape --> (Num of Row, Num of Col)

In [121]:
data.shape

(614, 13)

In [122]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


# NULL VALUE COUNT IN COLUMNWISE 

In [123]:
data.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

# NULL VALUE PARCENTAGE IN COLUMNWISE

In [124]:
data.isnull().sum()*100/len(data)

Loan_ID              0.000000
Gender               2.117264
Married              0.488599
Dependents           2.442997
Education            0.000000
Self_Employed        5.211726
ApplicantIncome      0.000000
CoapplicantIncome    0.000000
LoanAmount           3.583062
Loan_Amount_Term     2.280130
Credit_History       8.143322
Property_Area        0.000000
Loan_Status          0.000000
dtype: float64

# DROP UNNECESSARY COLUMN

In [125]:
data=data.drop('Loan_ID',axis=1)
data.head(1)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y


# DROP ALL THE NULL VALUE CONTAINS ROW 

In [126]:
columns = ["Gender", "Dependents", "LoanAmount", "Loan_Amount_Term"]
data = data.dropna(subset=columns, axis=0)
data.isnull().sum()


Gender                0
Married               0
Dependents            0
Education             0
Self_Employed        30
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term      0
Credit_History       48
Property_Area         0
Loan_Status           0
dtype: int64

In [127]:
data.isnull().sum()*100/len(data)

Gender               0.000000
Married              0.000000
Dependents           0.000000
Education            0.000000
Self_Employed        5.424955
ApplicantIncome      0.000000
CoapplicantIncome    0.000000
LoanAmount           0.000000
Loan_Amount_Term     0.000000
Credit_History       8.679928
Property_Area        0.000000
Loan_Status          0.000000
dtype: float64

In [128]:
data.shape

(553, 12)

# REASSING VALUE IN THE NULL VALUE

In [129]:
data['Self_Employed'].mode()
# most frequent value in Self_Employed column 

0    No
Name: Self_Employed, dtype: object

In [130]:
data['Self_Employed']=data['Self_Employed'].fillna(data['Self_Employed'].mode()[0])

In [131]:
data.isnull().sum()


Gender                0
Married               0
Dependents            0
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term      0
Credit_History       48
Property_Area         0
Loan_Status           0
dtype: int64

In [132]:
data['Credit_History'].unique()

array([ 1.,  0., nan])

In [133]:
data['Credit_History'].mode()

0    1.0
Name: Credit_History, dtype: float64

In [134]:
data['Credit_History']=data['Credit_History'].fillna(data['Credit_History'].mode()[0])

In [135]:
data.isnull().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

# RANDOM 5 SAMPLE

In [136]:
data.sample(5)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
31,Male,No,0,Graduate,No,3167,0.0,74.0,360.0,1.0,Urban,N
205,Female,No,0,Not Graduate,No,4408,0.0,120.0,360.0,1.0,Semiurban,Y
77,Male,Yes,1,Graduate,Yes,1000,3022.0,110.0,360.0,1.0,Urban,N
409,Male,Yes,3+,Graduate,No,81000,0.0,360.0,360.0,0.0,Rural,N
180,Male,Yes,1,Graduate,No,6400,7250.0,180.0,360.0,0.0,Urban,N


In [137]:
data["Dependents"].unique()

array(['1', '0', '2', '3+'], dtype=object)

# DATA REPLACE

In [138]:
data["Dependents"]=data["Dependents"].replace(to_replace="3+", value="4")

In [139]:
data["Dependents"].unique()

array(['1', '0', '2', '4'], dtype=object)

# DATA UPDATE IN NUMBER

In [140]:
data["Gender"].unique()

array(['Male', 'Female'], dtype=object)

In [141]:
data["Gender"]=data["Gender"].map({'Male': 1, 'Female': 0}).astype(int)

In [142]:
data["Gender"].unique()

array([1, 0])

In [143]:
data["Married"]=data["Married"].map({'Yes': 1, 'No': 0}).astype(int)
data["Education"]=data["Education"].map({'Graduate': 1, 'Not Graduate': 0}).astype(int)
data["Self_Employed"]=data["Self_Employed"].map({'Yes': 1, 'No': 0}).astype(int)
data["Property_Area"]=data["Property_Area"].map({'Urban': 2, 'Rural': 0, 'Semiurban': 1}).astype(int)
data["Loan_Status"]=data["Loan_Status"].map({'Y': 1, 'N': 0}).astype(int)

In [144]:
data.tail()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
609,0,0,0,1,0,2900,0.0,71.0,360.0,1.0,0,1
610,1,1,4,1,0,4106,0.0,40.0,180.0,1.0,0,1
611,1,1,1,1,0,8072,240.0,253.0,360.0,1.0,2,1
612,1,1,2,1,0,7583,0.0,187.0,360.0,1.0,2,1
613,0,0,0,1,1,4583,0.0,133.0,360.0,0.0,1,0


# STORE FEATURE MATRIX

In [145]:
X=data.drop('Loan_Status', axis=1)
X

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
1,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,0
2,1,1,0,1,1,3000,0.0,66.0,360.0,1.0,2
3,1,1,0,0,0,2583,2358.0,120.0,360.0,1.0,2
4,1,0,0,1,0,6000,0.0,141.0,360.0,1.0,2
5,1,1,2,1,1,5417,4196.0,267.0,360.0,1.0,2
...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,0,1,0,2900,0.0,71.0,360.0,1.0,0
610,1,1,4,1,0,4106,0.0,40.0,180.0,1.0,0
611,1,1,1,1,0,8072,240.0,253.0,360.0,1.0,2
612,1,1,2,1,0,7583,0.0,187.0,360.0,1.0,2


In [146]:
y=data['Loan_Status']
y

1      0
2      1
3      1
4      1
5      1
      ..
609    1
610    1
611    1
612    1
613    0
Name: Loan_Status, Length: 553, dtype: int64

# FEATURE SCALLING

In [147]:
cols = ["ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term"]


In [148]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X[cols]= scaler.fit_transform(X[cols])
X

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
1,1,1,1,1,0,-0.128694,-0.049699,-0.214368,0.279961,1.0,0
2,1,1,0,1,1,-0.394296,-0.545638,-0.952675,0.279961,1.0,2
3,1,1,0,0,0,-0.464262,0.229842,-0.309634,0.279961,1.0,2
4,1,0,0,1,0,0.109057,-0.545638,-0.059562,0.279961,1.0,2
5,1,1,2,1,1,0.011239,0.834309,1.440866,0.279961,1.0,2
...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,0,1,0,-0.411075,-0.545638,-0.893134,0.279961,1.0,0
610,1,1,4,1,0,-0.208727,-0.545638,-1.262287,-2.468292,1.0,0
611,1,1,1,1,0,0.456706,-0.466709,1.274152,0.279961,1.0,2
612,1,1,2,1,0,0.374659,-0.545638,0.488213,0.279961,1.0,2


# SPLIT DATASET AND K-FOLD CROSS VALIDATION

In [149]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
import numpy as np

model_df={}
def model_val(model, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"{model} Accuracy: {accuracy_score(y_true=y_test, y_pred=y_pred)}")
    score = cross_val_score(model, X, y, cv=5)
    print(f"{model} Cross-Validation Score: {np.mean(score)}")
    model_df[model]=round(np.mean(score)*100, 2)
    

# Logistic Regession

In [None]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model_val(model, X, y)

model_df

LogisticRegression() Accuracy: 0.7747747747747747
LogisticRegression() Cross-Validation Score: 0.802964782964783


{LogisticRegression(): np.float64(80.3)}

# SUPPORT VECTOR CLASSIFIER

In [151]:
from sklearn import svm
model = svm.SVC()
model_val(model, X, y)

model_df

SVC() Accuracy: 0.7477477477477478
SVC() Cross-Validation Score: 0.7938902538902539


{LogisticRegression(): np.float64(80.3), SVC(): np.float64(79.39)}

# DECISION TREE CLASSIFIER

In [152]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model_val(model, X, y)

model_df

DecisionTreeClassifier() Accuracy: 0.6666666666666666
DecisionTreeClassifier() Cross-Validation Score: 0.6996887796887796


{LogisticRegression(): np.float64(80.3),
 SVC(): np.float64(79.39),
 DecisionTreeClassifier(): np.float64(69.97)}

# RANDOM FOREST CLASSIFIER

In [153]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model_val(model, X, y)

model_df

RandomForestClassifier() Accuracy: 0.7747747747747747
RandomForestClassifier() Cross-Validation Score: 0.7848648648648648


{LogisticRegression(): np.float64(80.3),
 SVC(): np.float64(79.39),
 DecisionTreeClassifier(): np.float64(69.97),
 RandomForestClassifier(): np.float64(78.49)}

# GRADIENT BOOSTING CLASSIFIER  

In [154]:
from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier()
model_val(model, X, y)

model_df

GradientBoostingClassifier() Accuracy: 0.7477477477477478
GradientBoostingClassifier() Cross-Validation Score: 0.7739885339885341


{LogisticRegression(): np.float64(80.3),
 SVC(): np.float64(79.39),
 DecisionTreeClassifier(): np.float64(69.97),
 RandomForestClassifier(): np.float64(78.49),
 GradientBoostingClassifier(): np.float64(77.4)}

# HYPERPARAMETER TUNING

In [155]:
from sklearn.model_selection import RandomizedSearchCV

# LOGISTIC REGRESSION

In [156]:
log_reg_grid = {
    "C": np.logspace(-4, 4, 20),
    "solver": ["liblinear"],
}

In [157]:
rs_log_reg = RandomizedSearchCV(LogisticRegression(), param_distributions=log_reg_grid, n_iter=10, cv=5, verbose=True)
rs_log_reg.fit(X, y)


Fitting 5 folds for each of 10 candidates, totalling 50 fits


0,1,2
,estimator,LogisticRegression()
,param_distributions,"{'C': array([1.0000...00000000e+04]), 'solver': ['liblinear']}"
,n_iter,10
,scoring,
,n_jobs,
,refit,True
,cv,5
,verbose,True
,pre_dispatch,'2*n_jobs'
,random_state,

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,np.float64(206.913808111479)
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'liblinear'
,max_iter,100


In [158]:
rs_log_reg.best_score_

np.float64(0.802964782964783)

In [159]:
rs_log_reg.best_params_

{'solver': 'liblinear', 'C': np.float64(206.913808111479)}

# SVC

In [161]:
svc_grid = {"C": [0.25, 0.5, 0.75, 1]}
rs_svc = RandomizedSearchCV(svm.SVC(), param_distributions=svc_grid, n_iter=10, cv=5, verbose=True)
rs_svc.fit(X, y)

Fitting 5 folds for each of 4 candidates, totalling 20 fits




0,1,2
,estimator,SVC()
,param_distributions,"{'C': [0.25, 0.5, ...]}"
,n_iter,10
,scoring,
,n_jobs,
,refit,True
,cv,5
,verbose,True
,pre_dispatch,'2*n_jobs'
,random_state,

0,1,2
,C,0.75
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [162]:
rs_svc.best_score_

np.float64(0.7975266175266176)

In [163]:
rs_svc.best_params_

{'C': 0.75}