In [331]:
#Import Libraries
import numpy as np
import pandas as pd
import scipy
import statsmodels.api as sm
import matplotlib.pyplot as plt
# NOTE(review): this binds the sklearn.linear_model *module* (not the LinearRegression
# class) to the name LinearRegression; the alias is not used in the cells visible here.
import sklearn.linear_model as LinearRegression
import seaborn as sns
# Apply seaborn's default plot styling to subsequent matplotlib figures
sns.set()

In [332]:
# Load the loan dataset from the CSV file in the working directory
raw_data = pd.read_csv(filepath_or_buffer='01Exercise1.csv')

In [333]:
# Work on an independent (deep) copy so raw_data stays untouched
data = raw_data.copy(deep=True)

In [334]:
# Dimensions of the working copy as (rows, columns)
data.shape

(614, 6)

In [335]:
# Summarize each column: dtype and number of non-null values
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   gender   601 non-null    object 
 1   married  611 non-null    object 
 2   ch       564 non-null    float64
 3   income   614 non-null    int64  
 4   loanamt  592 non-null    float64
 5   status   614 non-null    object 
dtypes: float64(2), int64(1), object(3)
memory usage: 28.9+ KB


In [336]:
# Count the missing values in each column
data.isna().sum()

gender     13
married     3
ch         50
income      0
loanamt    22
status      0
dtype: int64

In [337]:
# Number of axes of the frame
data.ndim

2

In [338]:
# Keep only the rows that have no missing value in any column
data1 = data.dropna(axis='index', how='any')

In [339]:
# Column dtypes after dropping the incomplete rows
data1.dtypes

gender      object
married     object
ch         float64
income       int64
loanamt    float64
status      object
dtype: object

In [340]:
# Confirm that no missing values remain in any column
data1.isna().sum()

gender     0
married    0
ch         0
income     0
loanamt    0
status     0
dtype: int64

In [341]:
# Documentation on datatype change: https://pbpython.com/pandas_dtypes.html
#data1["ch"] = data1['ch'].astype('int')
#data1.dtypes

In [342]:
# Summarize the cleaned frame: row count, dtypes and non-null counts
data1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 529 entries, 1 to 613
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   gender   529 non-null    object 
 1   married  529 non-null    object 
 2   ch       529 non-null    float64
 3   income   529 non-null    int64  
 4   loanamt  529 non-null    float64
 5   status   529 non-null    object 
dtypes: float64(2), int64(1), object(3)
memory usage: 28.9+ KB


In [343]:
# Gender is not part of this analysis, so remove that column
data2 = data1.drop(columns=['gender'])

In [344]:
# Preview the first rows instead of rendering the whole frame
data2.head(10)

Unnamed: 0,married,ch,income,loanamt,status
1,Yes,1.0,4583,128.0,N
2,Yes,1.0,3000,66.0,Y
3,Yes,1.0,2583,120.0,Y
4,No,1.0,6000,141.0,Y
5,Yes,1.0,5417,267.0,Y
6,Yes,1.0,2333,95.0,Y
7,Yes,0.0,3036,158.0,N
8,Yes,1.0,4006,168.0,Y
9,Yes,1.0,12841,349.0,N
10,Yes,1.0,3200,70.0,Y


In [345]:
# Convert the categorical variables to 0/1 dummy columns.
# drop_first=True keeps a single indicator per two-level category (married_Yes, status_Y).
# dtype=int pins the indicators to integers; pandas >= 2.0 would otherwise emit booleans.
data3 = pd.get_dummies(data2, drop_first=True, dtype=int)

In [346]:
# Preview the first rows of the dummy-encoded frame instead of rendering all of it
data3.head(10)

Unnamed: 0,ch,income,loanamt,married_Yes,status_Y
1,1.0,4583,128.0,1,0
2,1.0,3000,66.0,1,1
3,1.0,2583,120.0,1,1
4,1.0,6000,141.0,0,1
5,1.0,5417,267.0,1,1
6,1.0,2333,95.0,1,1
7,0.0,3036,158.0,1,0
8,1.0,4006,168.0,1,1
9,1.0,12841,349.0,1,0
10,1.0,3200,70.0,1,1


In [347]:
# data normalization
#
#from sklearn.preprocessing import MinMaxScaler
#scaler = MinMaxScaler()
#data3['income'] = scaler.fit_transform(data3[['income']])
#data3['loanamt'] = scaler.fit_transform(data3[['loanamt']])
#data3                                        

In [348]:
# Standardization: rescale features to zero mean and unit variance
from sklearn.preprocessing import StandardScaler

scaler_ = StandardScaler(copy=True, with_mean=True, with_std=True)

In [349]:
# Standardize both numeric columns in a single fit. StandardScaler works per column,
# so the transformed values are the same as fitting each column separately, but the
# scaler now retains the statistics of BOTH columns (a second fit_transform call on
# the same scaler would discard the income statistics).
# NOTE(review): the scaler is fit on all rows before the train/test split, so test-set
# statistics influence the features; consider fitting on the training split only.
scaled_cols = ['income', 'loanamt']
data3[scaled_cols] = scaler_.fit_transform(data3[scaled_cols])

In [350]:
# Preview the first rows of the frame after standardizing income and loanamt
data3.head(10)

Unnamed: 0,ch,income,loanamt,married_Yes,status_Y
1,1.0,-0.128073,-0.19425,1,0
2,1.0,-0.392077,-0.971015,1,1
3,1.0,-0.461621,-0.294478,1,1
4,1.0,0.108246,-0.03138,0,1
5,1.0,0.011017,1.547205,1,1
6,1.0,-0.503315,-0.607689,1,1
7,0.0,-0.386073,0.181603,1,0
8,1.0,-0.224302,0.306888,1,1
9,1.0,1.249148,2.574538,1,0
10,1.0,-0.358722,-0.920901,1,1


In [351]:
# Separate the predictors from the target before splitting into train/test sets.
# x (independent variables): every column except the target
# y (dependent variable): the status_Y indicator, kept as a one-column DataFrame
x = data3.drop(columns=['status_Y'])
y = data3.loc[:, ['status_Y']]

In [352]:
from sklearn.model_selection import train_test_split

# x holds the independent variables and y the dependent variable.
# test_size=0.3 holds out 30% of the rows for testing (a 70/30 split).
# random_state can be any number; fixing it makes the shuffle reproducible.
# The *_train variables are used to fit the model, the *_test variables to evaluate it.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=365,
)

In [353]:
# Size of the training target as (rows, columns)
y_train.shape

(370, 1)

In [354]:
# Size of the test target as (rows, columns)
y_test.shape

(159, 1)

In [355]:
# Build the logistic regression classifier (library default hyperparameters)

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

In [356]:
# Fit on the training split. The target is passed as a 1-D array: y_train is a
# one-column DataFrame, and fitting on a column vector makes scikit-learn emit a
# DataConversionWarning before flattening it internally.
lr.fit(x_train, y_train.values.ravel())

  return f(**kwargs)


LogisticRegression()

In [357]:
# Predict the class label for every row of the test features
y_predict = lr.predict(X=x_test)

In [358]:
# Build a confusion matrix to see the true-positive, true-negative,
# false-positive and false-negative counts.
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_true=y_test, y_pred=y_predict)
cm

# Rows are the actual classes and columns the predicted classes (0 = rejected, 1 = approved):
#   22  true negatives  - loans rejected in y_test and predicted as rejected
#   112 true positives  - loans approved in y_test and predicted as approved
#   23  false positives - negative in y_test but predicted as positive
#   2   false negatives - positive in y_test but predicted as negative
# False positives and false negatives are the incorrect predictions made by the model.

array([[ 22,  23],
       [  2, 112]], dtype=int64)

In [359]:
# Accuracy: fraction of test rows whose predicted label matches the true label
lr.score(X=x_test, y=y_test)

0.8427672955974843

In [360]:
#Precision, Recall, Accuracy, ROC, and AUC
#Precision — Also called Positive predictive value
#The ratio of correct positive predictions to the total predicted positives.

#Precision = TP/(TP+FP)

#Recall — Also called Sensitivity, Probability of Detection, True Positive Rate
#The ratio of correct positive predictions to the total positives examples.

# Recall = TP/(TP+FN)

#Accuracy - is defined as the ratio of correctly predicted examples to the total number of examples.
#Accuracy = (TP+TN)/(TP+TN+FP+FN)
#i.e Accuracy = Correct predictions/total predictions

#A ROC curve (receiver operating characteristic curve) - graph shows the performance of a classification model
#at all classification thresholds.
#i.e. the curve plots TPR (Recall) against FPR as the classification threshold is varied
#(Using thresholds: Say, if you want to compute TPR and FPR for the threshold equal to 0.7, you apply the model to each example,
# get the score, and, if the score is higher than or equal to 0.7, you predict the positive class; otherwise, you predict the negative class)
 
#AUC ( Area under ROC) - It provides an aggregate measure of performance across all possible classification thresholds.

#The higher the area under the ROC curve (AUC), the better the classifier. A perfect classifier would have an AUC of 1. Usually, if your model behaves 
#well, you obtain a good classifier by selecting the value of the threshold that gives TPR close to 1 while keeping FPR near 0.

In [369]:
# Precision and recall of the test-set predictions
from sklearn.metrics import precision_score, recall_score

precision = precision_score(y_true=y_test, y_pred=y_predict)
recall = recall_score(y_true=y_test, y_pred=y_predict)
print(f"precision: {precision}", f"Recall: {recall}", sep="\n")


precision: 0.8296296296296296
Recall: 0.9824561403508771
