In [31]:
#Importing pandas

import pandas as pd

In [32]:
#read the data

loandata = pd.read_csv("loan.csv")

print(loandata)

     gender married   ch  income  loanamt status
0      Male      No  1.0    5849      NaN      Y
1      Male     Yes  1.0    4583    128.0      N
2      Male     Yes  1.0    3000     66.0      Y
3      Male     Yes  1.0    2583    120.0      Y
4      Male      No  1.0    6000    141.0      Y
..      ...     ...  ...     ...      ...    ...
609  Female      No  1.0    2900     71.0      Y
610    Male     Yes  1.0    4106     40.0      Y
611    Male     Yes  1.0    8072    253.0      Y
612    Male     Yes  1.0    7583    187.0      Y
613  Female      No  0.0    4583    133.0      N

[614 rows x 6 columns]


Creating a copy of the original data

In [33]:
loanprep = loandata.copy()

Cheaking missing values in the data


In [34]:
loanprep.isnull().sum()

gender     13
married     3
ch         50
income      0
loanamt    22
status      0
dtype: int64

From the above observation in total of 614 records there are max of 50 missing values. Lets clean the data buy droping the missing records

In [35]:
loanprep = loanprep.dropna()

In [36]:
loanprep.isnull().sum()

gender     0
married    0
ch         0
income     0
loanamt    0
status     0
dtype: int64

Now there are no missing value present

Since this data is about granting loan to the customers. Mostly loan will be given irrespective of the gender so droping Gender column for further processing

In [37]:
loanprep = loanprep.drop(["gender"], axis= 1)

In [38]:
loanprep.dtypes

married     object
ch         float64
income       int64
loanamt    float64
status      object
dtype: object

since there are some categorical variable, hot-encoding the variable by creating dummies using get dummies

In [39]:
loanprep = pd.get_dummies(loanprep, drop_first=True)
print(loanprep)


      ch  income  loanamt  married_Yes  status_Y
1    1.0    4583    128.0            1         0
2    1.0    3000     66.0            1         1
3    1.0    2583    120.0            1         1
4    1.0    6000    141.0            0         1
5    1.0    5417    267.0            1         1
..   ...     ...      ...          ...       ...
609  1.0    2900     71.0            0         1
610  1.0    4106     40.0            1         1
611  1.0    8072    253.0            1         1
612  1.0    7583    187.0            1         1
613  0.0    4583    133.0            0         0

[529 rows x 5 columns]


Next Normalising the Income and Loanamt features by importing standardscaler from sklearn

In [40]:
from sklearn.preprocessing import StandardScaler
# creating the object for the class
scaler_ = StandardScaler()

In [41]:
loanprep[["income" ,"loanamt"]]= scaler_.fit_transform(loanprep[["income","loanamt"]])

In [42]:
loanprep

Unnamed: 0,ch,income,loanamt,married_Yes,status_Y
1,1.0,-0.128073,-0.194250,1,0
2,1.0,-0.392077,-0.971015,1,1
3,1.0,-0.461621,-0.294478,1,1
4,1.0,0.108246,-0.031380,0,1
5,1.0,0.011017,1.547205,1,1
...,...,...,...,...,...
609,1.0,-0.408754,-0.908372,0,1
610,1.0,-0.207624,-1.296754,1,1
611,1.0,0.453802,1.371807,1,1
612,1.0,0.372249,0.544929,1,1


Now the values of Income and loanamt are normally distributed

Now spliting the dataset into X(independent features) and Y(dependent feature)

In [43]:
Y = loanprep[["status_Y"]]
X = loanprep.iloc[: , :-1]

In [44]:
print(X)
print(Y)

      ch    income   loanamt  married_Yes
1    1.0 -0.128073 -0.194250            1
2    1.0 -0.392077 -0.971015            1
3    1.0 -0.461621 -0.294478            1
4    1.0  0.108246 -0.031380            0
5    1.0  0.011017  1.547205            1
..   ...       ...       ...          ...
609  1.0 -0.408754 -0.908372            0
610  1.0 -0.207624 -1.296754            1
611  1.0  0.453802  1.371807            1
612  1.0  0.372249  0.544929            1
613  0.0 -0.128073 -0.131608            0

[529 rows x 4 columns]
     status_Y
1           0
2           1
3           1
4           1
5           1
..        ...
609         1
610         1
611         1
612         1
613         0

[529 rows x 1 columns]


Importing train_test_split model to split the data into train and test dataset

In [45]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = \
    train_test_split(X, Y, test_size = 0.3, random_state= 1234 , stratify = Y)

In [46]:
x_train

Unnamed: 0,ch,income,loanamt,married_Yes
17,0.0,-0.307022,-0.845730,0
474,1.0,0.030196,0.231717,0
512,1.0,-0.344880,0.056319,1
459,1.0,0.497497,0.206660,1
405,1.0,-0.134077,-0.357120,0
...,...,...,...,...
289,1.0,0.693290,0.544929,0
386,1.0,-0.501147,-0.144136,0
118,1.0,0.036199,0.394587,1
225,1.0,-0.350383,0.331945,1


In [47]:
x_test

Unnamed: 0,ch,income,loanamt,married_Yes
373,0.0,-0.381737,-0.407234,0
464,0.0,-0.197618,-0.570104,0
3,1.0,-0.461621,-0.294478,1
262,1.0,-0.311858,0.144018,0
401,1.0,-0.432936,-0.983543,0
...,...,...,...,...
360,1.0,-0.104725,-0.783088,0
71,1.0,-0.579698,-0.582632,1
96,1.0,-0.478132,-0.081494,1
388,1.0,-0.503315,-0.094023,1


Now Importing logistic regression model.

In [48]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()

Fitting the data to logistic regression model

In [49]:
lr.fit(x_train,y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

Predicting the dependent variable using logistic regression

In [51]:
Y_predict = lr.predict(x_test)
print(Y_predict)

[0 0 1 1 1 0 1 1 0 1 1 0 1 0 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 0 1
 1 0 1 1 1 1 1 0 0 1 1 1 1 0 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 0 0 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1
 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 0 1
 1 1 1 0 1 1 1 1 1 1 1]


Lets check the model prediction accuracy using accuracy score and confusion matrix

In [52]:
from sklearn.metrics import confusion_matrix

In [53]:
cm = confusion_matrix(y_test, Y_predict)

In [54]:
score = lr.score(x_test,y_test)

Also lets check the precision,recall and F1 score

In [55]:
from sklearn.metrics import classification_report

In [56]:
cr = classification_report(y_test, Y_predict)

In [57]:
print(cm)

[[ 29  20]
 [  2 108]]


As we can see out of 159 record the model as predicted 137 records correctly

In [58]:
print(score)

0.8616352201257862


And the accuracy of this model is 0.8616 or 86.16% 

In [59]:
print(cr)

              precision    recall  f1-score   support

           0       0.94      0.59      0.72        49
           1       0.84      0.98      0.91       110

    accuracy                           0.86       159
   macro avg       0.89      0.79      0.82       159
weighted avg       0.87      0.86      0.85       159



The precising on true values is 84% and recall is 98% and F1 score for this model is 91%