## Logistic Regression and Decision Tree Classification

In [2]:
# Importing libraries

import pandas as pd
import numpy as np

### Loading Data 

In [4]:
loan_data = pd.read_csv("loan-train.csv")

In [5]:
loan_data

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


### Checking Empty Values

In [7]:
loan_data.apply(lambda x: sum(x.isnull()), axis=0)

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [8]:
loan_data.count()

Loan_ID              614
Gender               601
Married              611
Dependents           599
Education            614
Self_Employed        582
ApplicantIncome      614
CoapplicantIncome    614
LoanAmount           592
Loan_Amount_Term     600
Credit_History       564
Property_Area        614
Loan_Status          614
dtype: int64

In [9]:
loan_data.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

### Filling Empty or Null Values

In [11]:
loan_data["Gender"].value_counts()

Gender
Male      489
Female    112
Name: count, dtype: int64

The higher value is Male here, so the missing value is replaced by Male here.

In [13]:
loan_data.Gender = loan_data.Gender.fillna("Male")

In [14]:
loan_data.apply(lambda x: sum(x.isnull()),axis=0)

Loan_ID               0
Gender                0
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

Data is updated correctly. Gender does not show any missing values.

In [16]:
loan_data.Married.value_counts()

Married
Yes    398
No     213
Name: count, dtype: int64

In [17]:
loan_data.Married = loan_data.Married.fillna("Yes")

In [18]:
loan_data.apply(lambda x: sum(x.isnull()),axis=0)

Loan_ID               0
Gender                0
Married               0
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [19]:
loan_data.Dependents.value_counts()

Dependents
0     345
1     102
2     101
3+     51
Name: count, dtype: int64

In [20]:
loan_data.Dependents = loan_data.Dependents.fillna("0")

In [21]:
loan_data.apply(lambda x: sum(x.isnull()),axis=0)

Loan_ID               0
Gender                0
Married               0
Dependents            0
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [22]:
loan_data.Self_Employed.value_counts()

Self_Employed
No     500
Yes     82
Name: count, dtype: int64

In [23]:
loan_data.Self_Employed = loan_data.Self_Employed.fillna('No')

In [24]:
loan_data.apply(lambda x: sum(x.isnull()),axis=0)

Loan_ID               0
Gender                0
Married               0
Dependents            0
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

#### Filling Loan Amount Data

loan_data.LoanAmount

In [27]:
loan_data.LoanAmount.min()

9.0

In [28]:
loan_data.LoanAmount.max()

700.0

In [29]:
loan_data.LoanAmount.mean()

146.41216216216216

In [30]:
loan_data.LoanAmount = loan_data.LoanAmount.fillna(loan_data.LoanAmount.mean())

#### Filling Loan Amount Term Data

In [32]:
loan_data.Loan_Amount_Term.isnull().sum()

14

In [33]:
loan_data.Loan_Amount_Term.value_counts()

Loan_Amount_Term
360.0    512
180.0     44
480.0     15
300.0     13
240.0      4
84.0       4
120.0      3
60.0       2
36.0       2
12.0       1
Name: count, dtype: int64

In [34]:
loan_data.Loan_Amount_Term = loan_data.Loan_Amount_Term.fillna(360.0)

#### Working with Credit History

In [36]:
loan_data.Credit_History.isnull().sum()

50

In [37]:
loan_data.Credit_History.value_counts()

Credit_History
1.0    475
0.0     89
Name: count, dtype: int64

In [38]:
loan_data.Credit_History = loan_data.Credit_History.fillna(1.0)

### Rechecking the Dataset for Missing Values

In [40]:
loan_data.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

### Now we have good dataset for Machine Lerning 

In [42]:
loan_data.sample(10)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
230,LP001765,Male,Yes,1,Graduate,No,2491,2054.0,104.0,360.0,1.0,Semiurban,Y
201,LP001677,Male,No,2,Graduate,No,4923,0.0,166.0,360.0,0.0,Semiurban,Y
376,LP002219,Male,Yes,3+,Graduate,No,8750,4996.0,130.0,360.0,1.0,Rural,Y
587,LP002917,Female,No,0,Not Graduate,No,2165,0.0,70.0,360.0,1.0,Semiurban,Y
70,LP001243,Male,Yes,0,Graduate,No,3208,3066.0,172.0,360.0,1.0,Urban,Y
65,LP001225,Male,Yes,0,Graduate,No,5726,4595.0,258.0,360.0,1.0,Semiurban,N
533,LP002729,Male,No,1,Graduate,No,11250,0.0,196.0,360.0,1.0,Semiurban,N
347,LP002131,Male,Yes,2,Not Graduate,No,3083,2168.0,126.0,360.0,1.0,Urban,Y
311,LP002004,Male,No,0,Not Graduate,No,2927,2405.0,111.0,360.0,1.0,Semiurban,Y
272,LP001892,Male,No,0,Graduate,No,2833,1857.0,126.0,360.0,1.0,Rural,Y


In [43]:
loan_data.shape

(614, 13)

In [44]:
loan_data.head(1)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,146.412162,360.0,1.0,Urban,Y


### Splitting training data

In [46]:
# Loan_ID is not selected as it does not provide value to the ML process

all_ml_columns = loan_data.iloc[ : , 1 : 12].values
all_status_column = loan_data.iloc[:, 12].values

In [47]:
all_ml_columns[0]

array(['Male', 'No', '0', 'Graduate', 'No', 5849, 0.0, 146.41216216216216,
       360.0, 1.0, 'Urban'], dtype=object)

In [48]:
all_status_column

array(['Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'Y',
       'N', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'N', 'Y', 'N', 'N', 'N', 'Y',
       'Y', 'Y', 'N', 'Y', 'N', 'N', 'N', 'Y', 'N', 'Y', 'N', 'Y', 'Y',
       'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y',
       'N', 'N', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'N',
       'N', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'N', 'N',
       'N', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y',
       'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'N', 'N', 'N', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'N', 'N', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'N',
       'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'N', 'N

#### Generating Training and Test Dataset

In [50]:
from sklearn.model_selection import train_test_split
# test_size = 33% or 1/3

In [51]:
x_train, x_test, y_train, y_test = train_test_split(all_ml_columns, all_status_column, test_size = 1/3, random_state= 0)

In [52]:
x_train

array([['Male', 'Yes', '3+', ..., 360.0, 1.0, 'Rural'],
       ['Male', 'Yes', '0', ..., 360.0, 1.0, 'Rural'],
       ['Male', 'Yes', '3+', ..., 180.0, 1.0, 'Rural'],
       ...,
       ['Male', 'Yes', '3+', ..., 360.0, 1.0, 'Semiurban'],
       ['Male', 'Yes', '0', ..., 360.0, 1.0, 'Urban'],
       ['Female', 'Yes', '0', ..., 360.0, 1.0, 'Semiurban']], dtype=object)

In [53]:
x_train.shape

(409, 11)

In [54]:
x_test

array([['Male', 'No', '0', ..., 360.0, 1.0, 'Semiurban'],
       ['Female', 'No', '0', ..., 360.0, 1.0, 'Semiurban'],
       ['Male', 'Yes', '0', ..., 360.0, 1.0, 'Urban'],
       ...,
       ['Male', 'Yes', '0', ..., 180.0, 1.0, 'Rural'],
       ['Male', 'Yes', '2', ..., 180.0, 0.0, 'Urban'],
       ['Male', 'Yes', '0', ..., 360.0, 1.0, 'Rural']], dtype=object)

In [55]:
x_test.shape

(205, 11)

In [56]:
y_train

array(['Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'N',
       'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N',
       'N', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'N', 'Y', 'N', 'N', 'N',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'N', 'N', 'Y', 'Y', 'N', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y',
       'Y', 'Y', 'N', 'N', 'Y', 'N', 'Y', 'N', 'N', 'N', 'Y', 'N', 'Y',
       'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'N',
       'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'N',
       'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'N',
       'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'N', 'N', 'N',
       'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'N', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y

In [57]:
y_train.shape

(409,)

In [58]:
y_test

array(['Y', 'N', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'N', 'N', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y',
       'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N',
       'Y', 'N', 'Y', 'N', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'N', 'N', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'N', 'Y', 'N', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'N', 'Y',
       'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N',
       'Y', 'Y', 'N', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y',
       'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'N',
       'N', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y

In [59]:
y_test.shape

(205,)

### Logistic Regression (A Quick test to show why you need encoding)

In [61]:
from sklearn.linear_model import LogisticRegression

In [62]:
classifier = LogisticRegression( random_state = 0 )

In [63]:
classifier.fit(x_train,y_train)

ValueError: could not convert string to float: 'Male'

#### Encoding categorical data and independent vars

In [116]:
x_train[0]

array(['Male', 'Yes', '3+', 'Not Graduate', 'Yes', 5703, 0.0, 130.0,
       360.0, 1.0, 'Rural'], dtype=object)

We need to work on 6 columns, or we can say encoding 6 columns.

In [119]:
from sklearn.preprocessing import LabelEncoder

In [121]:
labelencoder_X = LabelEncoder()

In [123]:
for i in range(0, 5):
    x_train[: ,i] = labelencoder_X.fit_transform(x_train[:, i])

In [125]:
x_train[:,10] = labelencoder_X.fit_transform(x_train[:,10])

In [127]:
x_train[120]

array([1, 1, 2, 0, 0, 3510, 4416.0, 243.0, 360.0, 1.0, 0], dtype=object)

In [129]:
x_train

array([[1, 1, 3, ..., 360.0, 1.0, 0],
       [1, 1, 0, ..., 360.0, 1.0, 0],
       [1, 1, 3, ..., 180.0, 1.0, 0],
       ...,
       [1, 1, 3, ..., 360.0, 1.0, 1],
       [1, 1, 0, ..., 360.0, 1.0, 2],
       [0, 1, 0, ..., 360.0, 1.0, 1]], dtype=object)

#### Dependent Variable or Y Encoding

##### y=f(x)

In [133]:
labelencoder_y = LabelEncoder()

In [135]:
y_train = labelencoder_y.fit_transform(y_train)

In [136]:
y_train

array([1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1,

In [137]:
y_train[120]

1

#### Test Dataeset Encoding

In [142]:
for i in range(0, 5):
    x_test[:, i] = labelencoder_X.fit_transform(x_test[:, i])
x_test[:,10] = labelencoder_X.fit_transform(x_test[:, 10])

In [144]:
x_test

array([[1, 0, 0, ..., 360.0, 1.0, 1],
       [0, 0, 0, ..., 360.0, 1.0, 1],
       [1, 1, 0, ..., 360.0, 1.0, 2],
       ...,
       [1, 1, 0, ..., 180.0, 1.0, 0],
       [1, 1, 2, ..., 180.0, 0.0, 2],
       [1, 1, 0, ..., 360.0, 1.0, 0]], dtype=object)

In [146]:
x_test[40]

array([0, 0, 0, 0, 0, 3244, 0.0, 80.0, 360.0, 1.0, 2], dtype=object)

In [148]:
y_test = labelencoder_y.fit_transform(y_test)

In [150]:
y_test[100]

1

### Re-applying Logistics Regression

In [153]:
lr_classifier = LogisticRegression(random_state =0)

In [155]:
lr_classifier.fit(x_train,y_train,)

In [157]:
# Predicting the Test Set Results
y_pred_logistic_regression = lr_classifier.predict(x_test)

In [159]:
y_pred_logistic_regression

array([1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 1, 1, 1, 1, 0, 1])

### Model Accuracy

In [162]:
from sklearn import metrics

In [164]:
print(' Logistic Regression Accuracy (%) is: ' ,metrics.accuracy_score(y_pred_logistic_regression, y_test))

 Logistic Regression Accuracy (%) is:  0.824390243902439


### Confusion Matrix

In [167]:
from sklearn.metrics import confusion_matrix

In [169]:
lr_confusion_matrix = confusion_matrix(y_test, y_pred_logistic_regression)

In [171]:
lr_confusion_matrix

array([[ 26,  34],
       [  2, 143]], dtype=int64)

### Using Decision Tree Algorithm

In [174]:
# Fitting Decision Tree Classification to the Training set
from sklearn.tree import DecisionTreeClassifier

In [176]:
dt_classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)

In [178]:
dt_classifier.fit(x_train,y_train)

In [180]:
y_pred_decision_tree = dt_classifier.predict(x_test)

In [182]:
y_pred_decision_tree

array([0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 1, 1, 0, 0, 1])

### Decision Tree Accuracy

In [185]:
print('Decision Tree Accuracy (%) is:', metrics.accuracy_score(y_pred_decision_tree, y_test))

Decision Tree Accuracy (%) is: 0.7219512195121951


### Confusion Matrics

In [188]:
dt_confusion_matrics = confusion_matrix(y_test, y_pred_decision_tree)

In [190]:
dt_confusion_matrics

array([[ 32,  28],
       [ 29, 116]], dtype=int64)

### Improving Decision Tree Accuracy

#### Setting max_depth parameters in the decision tree classifier

In [194]:
dt_classifier_improved = DecisionTreeClassifier(criterion = 'entropy', random_state=0, max_depth=5)

In [196]:
dt_classifier_improved.fit(x_train, y_train)

In [198]:
y_pred_decision_tree_improved = dt_classifier_improved.predict(x_test)

In [200]:
print('Improved Decision Tree Accuracy (%) is:', metrics.accuracy_score(y_pred_decision_tree_improved, y_test))

Improved Decision Tree Accuracy (%) is: 0.8048780487804879


### Prediction Using a Customer test data

#### Getting a row from X_test Dataset(or Building your own)

In [204]:
x_test[100]

array([1, 1, 0, 0, 0, 5923, 2054.0, 211.0, 360.0, 1.0, 0], dtype=object)

In [206]:
y_test[100]

1

In [208]:
sample_data = [[1, 1, 0, 0, 0, 5923, 2054.0, 211.0, 360.0, 1.0, 0]]

In [210]:
type(sample_data)

list

In [212]:
# We have already imported the NumPy library at the start; we are importing it again for clarity.
import numpy as np

In [214]:
sample_data_item = np.array(sample_data, dtype=object)

In [216]:
sample_data_item

array([[1, 1, 0, 0, 0, 5923, 2054.0, 211.0, 360.0, 1.0, 0]], dtype=object)

In [218]:
pred_result = dt_classifier.predict(sample_data_item)

In [220]:
pred_result

array([1])

In [222]:
pred_result[0]

1

In [224]:
dt_classifier_improved.predict(sample_data_item)

array([1])

### Saving the Model to Disk

In [227]:
import pickle

In [229]:
filename = 'loan_prediction_model1.bin'
pickle.dump(dt_classifier_improved, open(filename, 'wb'))

In [231]:
!dir loan_prediction_model1.bin

 Volume in drive C is Windows
 Volume Serial Number is 3A2B-0237

 Directory of C:\Users\tanishka\Loan Prediction scikit-learn

03/07/2025  22:55             3,147 Loan_Prediction_Model1.bin
               1 File(s)          3,147 bytes
               0 Dir(s)  131,380,817,920 bytes free


### Reading Model from the Disk and Performing the Prediction

In [234]:
loaded_loan_prediction_model1 = pickle.load(open(filename, 'rb'))

In [236]:
loaded_loan_prediction_model1

In [238]:
sample_data = [[1, 1, 0, 0, 0, 5923, 2054.0, 211.0, 360.0, 1.0, 0]]

In [248]:
result = loaded_loan_prediction_model1.predict(sample_data)

In [250]:
result

array([1])