# Loan Prediction Task Implementation

Importing required modules in task

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder


Loading dataset

In [2]:
data1=pd.read_csv("C:/Users/SHIVAM/LoanPredictData/trainloan.csv")
test=pd.read_csv("C:/Users/SHIVAM/LoanPredictData/testloan.csv")
data1.head(40)


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y
6,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y
7,LP001014,Male,Yes,3+,Graduate,No,3036,2504.0,158.0,360.0,0.0,Semiurban,N
8,LP001018,Male,Yes,2,Graduate,No,4006,1526.0,168.0,360.0,1.0,Urban,Y
9,LP001020,Male,Yes,1,Graduate,No,12841,10968.0,349.0,360.0,1.0,Semiurban,N


Lets Check if there exist null values in our dataset.

In [3]:
data1.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

Therefore, Here we need to know number of uniques in categorical data, so that we could have a glance over predictive analysis.

In [4]:
print(data1["Gender"].value_counts())
print(data1["Married"].value_counts())
print(data1["Dependents"].value_counts())
print(data1["Education"].value_counts())
print(data1["Self_Employed"].value_counts())
print(data1["Credit_History"].value_counts())
print(data1.Loan_Status.value_counts())

Male      489
Female    112
Name: Gender, dtype: int64
Yes    398
No     213
Name: Married, dtype: int64
0     345
1     102
2     101
3+     51
Name: Dependents, dtype: int64
Graduate        480
Not Graduate    134
Name: Education, dtype: int64
No     500
Yes     82
Name: Self_Employed, dtype: int64
1.0    475
0.0     89
Name: Credit_History, dtype: int64
Y    422
N    192
Name: Loan_Status, dtype: int64


Now, I am going to analyse numerical data 

In [5]:
data1.describe()


Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


After analysis, I came to know that categorical data have many null values and so do numerical data have. So, I am going to handle nulls in below code: 

In [6]:
print(data1.Dependents.value_counts())
print(data1.Dependents.isna().sum())

0     345
1     102
2     101
3+     51
Name: Dependents, dtype: int64
15


In [7]:
data1.Gender.fillna('Not Given', inplace=True)
data1.Married.fillna('Not Given', inplace=True)
data1.Self_Employed.fillna('Not Given', inplace=True)
data1.LoanAmount.fillna(data1.LoanAmount.mean(), inplace=True)
data1.Loan_Amount_Term.fillna(360.0, inplace=True)
data1.Credit_History.fillna(1.0, inplace=True)
data1.Dependents.fillna(0, inplace=True)

data1.isna().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

Now we also noticed, that there exist non numeric values in "Dependents" column. I am going to handle them like that:

In [8]:
data1.Dependents.replace(to_replace='3+', value=4)

0      0
1      1
2      0
3      0
4      0
5      2
6      0
7      4
8      2
9      1
10     2
11     2
12     2
13     0
14     2
15     0
16     1
17     0
18     0
19     0
20     0
21     1
22     0
23     2
24     1
25     0
26     0
27     2
28     0
29     2
      ..
584    1
585    1
586    0
587    0
588    0
589    2
590    0
591    2
592    4
593    0
594    0
595    0
596    2
597    0
598    0
599    2
600    4
601    0
602    4
603    0
604    1
605    0
606    1
607    2
608    0
609    0
610    4
611    1
612    2
613    0
Name: Dependents, Length: 614, dtype: object

Now, checking again for nan values.

In [9]:
data1.isna().sum()


Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

Now, Preparing features and labels (training data) for ML model implementation

In [10]:
feature=pd.get_dummies(data1.iloc[:,1:12])
labels=data1["Loan_Status"]
feature.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Female,Gender_Male,Gender_Not Given,Married_No,Married_Not Given,...,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Not Given,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
0,5849,0.0,146.412162,360.0,1.0,0,1,0,1,0,...,0,0,1,0,1,0,0,0,0,1
1,4583,1508.0,128.0,360.0,1.0,0,1,0,0,0,...,0,0,1,0,1,0,0,1,0,0
2,3000,0.0,66.0,360.0,1.0,0,1,0,0,0,...,0,0,1,0,0,0,1,0,0,1
3,2583,2358.0,120.0,360.0,1.0,0,1,0,0,0,...,0,0,0,1,1,0,0,0,0,1
4,6000,0.0,141.0,360.0,1.0,0,1,0,1,0,...,0,0,1,0,1,0,0,0,0,1


In [11]:
feature.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Female,Gender_Male,Gender_Not Given,Married_No,Married_Not Given,...,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Not Given,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
count,614.0,614.0,614.0,614.0,614.0,614.0,614.0,614.0,614.0,614.0,...,614.0,614.0,614.0,614.0,614.0,614.0,614.0,614.0,614.0,614.0
mean,5403.459283,1621.245798,146.412162,342.410423,0.855049,0.18241,0.796417,0.021173,0.346906,0.004886,...,0.164495,0.083062,0.781759,0.218241,0.814332,0.052117,0.13355,0.291531,0.379479,0.32899
std,6109.041673,2926.248369,84.037468,64.428629,0.352339,0.386497,0.402991,0.144077,0.476373,0.069786,...,0.371027,0.276201,0.413389,0.413389,0.389155,0.222445,0.340446,0.454838,0.485653,0.470229
min,150.0,0.0,9.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2877.5,0.0,100.25,360.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
50%,3812.5,1188.5,129.0,360.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,5795.0,2297.25,164.75,360.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [12]:
len(feature)

614

In [13]:
len(labels)

614

In [14]:
len(test)

367

In [15]:
labels=data1.Loan_Status.map(dict(Y=1,N=0))
labels.shape

(614,)

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
%matplotlib inline

In [17]:
x_train, x_test, y_train, y_test = train_test_split(feature, labels, test_size=.3, random_state =3)

In [18]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(429, 24)
(429,)
(185, 24)
(185,)


In [19]:
from sklearn.utils import resample

# LOGISTIC REGRESSION

In [20]:
from sklearn.linear_model import LogisticRegression

In [21]:
model=LogisticRegression()

In [26]:
model.fit(x_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [27]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

In [28]:
pred=model.predict(x_test)
pred

array([1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 1], dtype=int64)

In [29]:
print(accuracy_score(pred,y_test))
print(f1_score(y_test, pred))
print(confusion_matrix(y_test, pred))


0.8540540540540541
0.9090909090909091
[[ 23  22]
 [  5 135]]
