## Adding Feature Engineering

In [2]:
# Importing libraries

import pandas as pd
import numpy as np

### Loading Data 

In [5]:
loan_data = pd.read_csv("loan-train.csv")

In [7]:
loan_data

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


### Checking Empty Values

In [10]:
loan_data.apply(lambda x: sum(x.isnull()), axis=0)

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [12]:
loan_data.count()

Loan_ID              614
Gender               601
Married              611
Dependents           599
Education            614
Self_Employed        582
ApplicantIncome      614
CoapplicantIncome    614
LoanAmount           592
Loan_Amount_Term     600
Credit_History       564
Property_Area        614
Loan_Status          614
dtype: int64

In [14]:
loan_data.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

### Filling Empty or Null Values

In [17]:
loan_data["Gender"].value_counts()

Gender
Male      489
Female    112
Name: count, dtype: int64

The higher value is Male here, so the missing value is replaced by Male here.

In [20]:
loan_data.Gender = loan_data.Gender.fillna("Male")

In [22]:
loan_data.apply(lambda x: sum(x.isnull()),axis=0)

Loan_ID               0
Gender                0
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

Data is updated correctly. Gender does not show any missing values.

In [25]:
loan_data.Married.value_counts()

Married
Yes    398
No     213
Name: count, dtype: int64

In [27]:
loan_data.Married = loan_data.Married.fillna("Yes")

In [29]:
loan_data.apply(lambda x: sum(x.isnull()),axis=0)

Loan_ID               0
Gender                0
Married               0
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [31]:
loan_data.Dependents.value_counts()

Dependents
0     345
1     102
2     101
3+     51
Name: count, dtype: int64

In [33]:
loan_data.Dependents = loan_data.Dependents.fillna("0")

In [35]:
loan_data.apply(lambda x: sum(x.isnull()),axis=0)

Loan_ID               0
Gender                0
Married               0
Dependents            0
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [37]:
loan_data.Self_Employed.value_counts()

Self_Employed
No     500
Yes     82
Name: count, dtype: int64

In [39]:
loan_data.Self_Employed = loan_data.Self_Employed.fillna('No')

In [41]:
loan_data.apply(lambda x: sum(x.isnull()),axis=0)

Loan_ID               0
Gender                0
Married               0
Dependents            0
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

#### Filling Loan Amount Data

loan_data.LoanAmount

In [45]:
loan_data.LoanAmount.min()

9.0

In [47]:
loan_data.LoanAmount.max()

700.0

In [49]:
loan_data.LoanAmount.mean()

146.41216216216216

In [51]:
loan_data.LoanAmount = loan_data.LoanAmount.fillna(loan_data.LoanAmount.mean())

#### Filling Loan Amount Term Data

In [54]:
loan_data.Loan_Amount_Term.isnull().sum()

14

In [56]:
loan_data.Loan_Amount_Term.value_counts()

Loan_Amount_Term
360.0    512
180.0     44
480.0     15
300.0     13
240.0      4
84.0       4
120.0      3
60.0       2
36.0       2
12.0       1
Name: count, dtype: int64

In [58]:
loan_data.Loan_Amount_Term = loan_data.Loan_Amount_Term.fillna(360.0)

#### Working with Credit History

In [61]:
loan_data.Credit_History.isnull().sum()

50

In [63]:
loan_data.Credit_History.value_counts()

Credit_History
1.0    475
0.0     89
Name: count, dtype: int64

In [65]:
loan_data.Credit_History = loan_data.Credit_History.fillna(1.0)

### Rechecking the Dataset for Missing Values

In [68]:
loan_data.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

### Now we have good dataset for Machine Lerning 

In [71]:
loan_data.sample(10)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
200,LP001674,Male,Yes,1,Not Graduate,No,2600,2500.0,90.0,360.0,1.0,Semiurban,Y
52,LP001164,Female,No,0,Graduate,No,4230,0.0,112.0,360.0,1.0,Semiurban,N
35,LP001106,Male,Yes,0,Graduate,No,2275,2067.0,146.412162,360.0,1.0,Urban,Y
425,LP002366,Male,Yes,0,Graduate,No,2666,4300.0,121.0,360.0,1.0,Rural,Y
126,LP001448,Male,Yes,3+,Graduate,No,23803,0.0,370.0,360.0,1.0,Rural,Y
463,LP002489,Female,No,1,Not Graduate,No,5191,0.0,132.0,360.0,1.0,Semiurban,Y
573,LP002862,Male,Yes,2,Not Graduate,No,6125,1625.0,187.0,480.0,1.0,Semiurban,N
155,LP001536,Male,Yes,3+,Graduate,No,39999,0.0,600.0,180.0,0.0,Semiurban,Y
517,LP002682,Male,Yes,0,Not Graduate,No,3074,1800.0,123.0,360.0,0.0,Semiurban,N
232,LP001770,Male,No,0,Not Graduate,No,3189,2598.0,120.0,360.0,1.0,Rural,Y


In [73]:
loan_data.shape

(614, 13)

In [75]:
loan_data.head(1)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,146.412162,360.0,1.0,Urban,Y


### Feature Engineering

In [78]:
for i in [loan_data]:
    loan_data['TotalIncome'] = loan_data['ApplicantIncome'] + loan_data['CoapplicantIncome']

In [80]:
loan_data.shape

(614, 14)

In [82]:
loan_data.sample(10)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,TotalIncome
191,LP001656,Male,No,0,Graduate,No,12000,0.0,164.0,360.0,1.0,Semiurban,N,12000.0
327,LP002082,Male,Yes,0,Graduate,Yes,5818,2160.0,184.0,360.0,1.0,Semiurban,Y,7978.0
131,LP001473,Male,No,0,Graduate,No,2014,1929.0,74.0,360.0,1.0,Urban,Y,3943.0
334,LP002103,Male,Yes,1,Graduate,Yes,9833,1833.0,182.0,180.0,1.0,Urban,Y,11666.0
250,LP001835,Male,Yes,0,Not Graduate,No,1668,3890.0,201.0,360.0,0.0,Semiurban,N,5558.0
427,LP002368,Male,Yes,2,Graduate,No,5935,0.0,133.0,360.0,1.0,Semiurban,Y,5935.0
274,LP001896,Male,Yes,2,Graduate,No,3900,0.0,90.0,360.0,1.0,Semiurban,Y,3900.0
232,LP001770,Male,No,0,Not Graduate,No,3189,2598.0,120.0,360.0,1.0,Rural,Y,5787.0
182,LP001636,Male,Yes,0,Graduate,No,4600,0.0,73.0,180.0,1.0,Semiurban,Y,4600.0
289,LP001935,Male,No,0,Graduate,No,9508,0.0,187.0,360.0,1.0,Rural,Y,9508.0


In [84]:
loan_data['TotalIncome'].describe()

count      614.000000
mean      7024.705081
std       6458.663872
min       1442.000000
25%       4166.000000
50%       5416.500000
75%       7521.750000
max      81000.000000
Name: TotalIncome, dtype: float64

In [86]:
# Make sure to use labels=false otherwise interval will be added
loan_data['TotalIncomeGroups'] = pd.cut(loan_data['TotalIncome'], bins=20, include_lowest=True, labels=False)

In [88]:
loan_data.TotalIncomeGroups.unique()

array([ 1,  0,  2,  5,  3,  4,  9, 12,  8, 10, 15, 19,  6], dtype=int64)

In [90]:
loan_data.sample(10)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,TotalIncome,TotalIncomeGroups
461,LP002484,Male,Yes,3+,Graduate,No,7740,0.0,128.0,180.0,1.0,Urban,Y,7740.0,1
297,LP001954,Female,Yes,1,Graduate,No,4666,0.0,135.0,360.0,1.0,Urban,Y,4666.0,0
539,LP002740,Male,Yes,3+,Graduate,No,6417,0.0,157.0,180.0,1.0,Rural,Y,6417.0,1
17,LP001036,Female,No,0,Graduate,No,3510,0.0,76.0,360.0,0.0,Urban,N,3510.0,0
310,LP002002,Female,No,0,Graduate,No,2917,0.0,84.0,360.0,1.0,Semiurban,Y,2917.0,0
282,LP001915,Male,Yes,2,Graduate,No,2301,985.799988,78.0,180.0,1.0,Urban,Y,3286.799988,0
319,LP002050,Male,Yes,1,Graduate,Yes,10000,0.0,155.0,360.0,1.0,Rural,N,10000.0,2
349,LP002138,Male,Yes,0,Graduate,No,2625,6250.0,187.0,360.0,1.0,Rural,Y,8875.0,1
535,LP002732,Male,No,0,Not Graduate,No,2550,2042.0,126.0,360.0,1.0,Rural,Y,4592.0,0
63,LP001213,Male,Yes,1,Graduate,No,4945,0.0,146.412162,360.0,0.0,Rural,N,4945.0,0


### Splitting training data

In [93]:
loan_data.shape

(614, 15)

In [95]:
loan_data.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status',
       'TotalIncome', 'TotalIncomeGroups'],
      dtype='object')

In [97]:
loan_data = loan_data[['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area',
       'TotalIncome', 'TotalIncomeGroups', 'Loan_Status']]

In [99]:
loan_data.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'TotalIncome',
       'TotalIncomeGroups', 'Loan_Status'],
      dtype='object')

In [101]:
# Loan_ID is not selected as it does not provide value to the ML process

all_ml_columns = loan_data.iloc[ : , 1 : 14].values
all_status_column = loan_data.iloc[:, 14].values

In [103]:
all_ml_columns[0]

array(['Male', 'No', '0', 'Graduate', 'No', 5849, 0.0, 146.41216216216216,
       360.0, 1.0, 'Urban', 5849.0, 1], dtype=object)

In [105]:
all_status_column

array(['Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'Y',
       'N', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'N', 'Y', 'N', 'N', 'N', 'Y',
       'Y', 'Y', 'N', 'Y', 'N', 'N', 'N', 'Y', 'N', 'Y', 'N', 'Y', 'Y',
       'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y',
       'N', 'N', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'N',
       'N', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'N', 'N',
       'N', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y',
       'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'N', 'N', 'N', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'N', 'N', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'N',
       'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'N', 'N

#### Generating Training and Test Dataset

In [108]:
from sklearn.model_selection import train_test_split
# test_size = 33% or 1/3

In [109]:
x_train, x_test, y_train, y_test = train_test_split(all_ml_columns, all_status_column, test_size = 1/3, random_state= 0)

In [112]:
x_train

array([['Male', 'Yes', '3+', ..., 'Rural', 5703.0, 1],
       ['Male', 'Yes', '0', ..., 'Rural', 5970.0, 1],
       ['Male', 'Yes', '3+', ..., 'Rural', 4106.0, 0],
       ...,
       ['Male', 'Yes', '3+', ..., 'Semiurban', 8334.0, 1],
       ['Male', 'Yes', '0', ..., 'Urban', 6033.0, 1],
       ['Female', 'Yes', '0', ..., 'Semiurban', 6486.0, 1]], dtype=object)

In [114]:
x_train.shape

(409, 13)

In [116]:
x_test

array([['Male', 'No', '0', ..., 'Semiurban', 7085.0, 1],
       ['Female', 'No', '0', ..., 'Semiurban', 4230.0, 0],
       ['Male', 'Yes', '0', ..., 'Urban', 10039.0, 2],
       ...,
       ['Male', 'Yes', '0', ..., 'Rural', 3716.0, 0],
       ['Male', 'Yes', '2', ..., 'Urban', 2889.0, 0],
       ['Male', 'Yes', '0', ..., 'Rural', 24996.0, 5]], dtype=object)

In [118]:
x_test.shape

(205, 13)

In [120]:
y_train

array(['Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'N',
       'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N',
       'N', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'N', 'Y', 'N', 'N', 'N',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'N', 'N', 'Y', 'Y', 'N', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y',
       'Y', 'Y', 'N', 'N', 'Y', 'N', 'Y', 'N', 'N', 'N', 'Y', 'N', 'Y',
       'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'N',
       'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'N',
       'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'N',
       'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'N', 'N', 'N',
       'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'N', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y

In [122]:
y_train.shape

(409,)

In [124]:
y_test

array(['Y', 'N', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'N', 'N', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y',
       'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N',
       'Y', 'N', 'Y', 'N', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'N', 'N', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'N', 'Y', 'N', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'N', 'Y',
       'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N',
       'Y', 'Y', 'N', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y',
       'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'N',
       'N', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y

In [126]:
y_test.shape

(205,)

### Logistic Regression (A Quick test to show why you need encoding)

In [129]:
from sklearn.linear_model import LogisticRegression

In [141]:
classifier = LogisticRegression( random_state = 0 )

In [133]:
classifier.fit(x_train,y_train)

ValueError: could not convert string to float: 'Male'

#### Encoding categorical data and independent vars

In [139]:
x_train[0]

array(['Male', 'Yes', '3+', 'Not Graduate', 'Yes', 5703, 0.0, 130.0,
       360.0, 1.0, 'Rural', 5703.0, 1], dtype=object)

We need to work on 6 columns, or we can say encoding 6 columns.

In [143]:
from sklearn.preprocessing import LabelEncoder

In [145]:
labelencoder_X = LabelEncoder()

In [147]:
for i in range(0, 5):
    x_train[: ,i] = labelencoder_X.fit_transform(x_train[:, i])

In [149]:
x_train[:,10] = labelencoder_X.fit_transform(x_train[:,10])

In [151]:
x_train[120]

array([1, 1, 2, 0, 0, 3510, 4416.0, 243.0, 360.0, 1.0, 0, 7926.0, 1],
      dtype=object)

In [153]:
x_train

array([[1, 1, 3, ..., 0, 5703.0, 1],
       [1, 1, 0, ..., 0, 5970.0, 1],
       [1, 1, 3, ..., 0, 4106.0, 0],
       ...,
       [1, 1, 3, ..., 1, 8334.0, 1],
       [1, 1, 0, ..., 2, 6033.0, 1],
       [0, 1, 0, ..., 1, 6486.0, 1]], dtype=object)

#### Dependent Variable or Y Encoding

##### y=f(x)

In [157]:
labelencoder_y = LabelEncoder()

In [159]:
y_train = labelencoder_y.fit_transform(y_train)

In [161]:
y_train

array([1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1,

In [163]:
y_train[120]

1

#### Test Dataeset Encoding

In [166]:
for i in range(0, 5):
    x_test[:, i] = labelencoder_X.fit_transform(x_test[:, i])
x_test[:,10] = labelencoder_X.fit_transform(x_test[:, 10])

In [168]:
x_test

array([[1, 0, 0, ..., 1, 7085.0, 1],
       [0, 0, 0, ..., 1, 4230.0, 0],
       [1, 1, 0, ..., 2, 10039.0, 2],
       ...,
       [1, 1, 0, ..., 0, 3716.0, 0],
       [1, 1, 2, ..., 2, 2889.0, 0],
       [1, 1, 0, ..., 0, 24996.0, 5]], dtype=object)

In [170]:
x_test[40]

array([0, 0, 0, 0, 0, 3244, 0.0, 80.0, 360.0, 1.0, 2, 3244.0, 0],
      dtype=object)

In [172]:
y_test = labelencoder_y.fit_transform(y_test)

In [174]:
y_test[100]

1

### Re-applying Logistics Regression

In [177]:
lr_classifier = LogisticRegression(random_state =0)

In [179]:
lr_classifier.fit(x_train,y_train,)

In [181]:
# Predicting the Test Set Results
y_pred_logistic_regression = lr_classifier.predict(x_test)

In [183]:
y_pred_logistic_regression

array([1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 1, 1, 1, 1, 0, 1])

### Model Accuracy

In [186]:
from sklearn import metrics

In [188]:
print(' Logistic Regression Accuracy (%) is: ' ,metrics.accuracy_score(y_pred_logistic_regression, y_test))

 Logistic Regression Accuracy (%) is:  0.824390243902439


### Confusion Matrix

In [191]:
from sklearn.metrics import confusion_matrix

In [193]:
lr_confusion_matrix = confusion_matrix(y_test, y_pred_logistic_regression)

In [195]:
lr_confusion_matrix

array([[ 26,  34],
       [  2, 143]], dtype=int64)

### Using Decision Tree Algorithm

In [198]:
# Fitting Decision Tree Classification to the Training set
from sklearn.tree import DecisionTreeClassifier

In [200]:
dt_classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)

In [202]:
dt_classifier.fit(x_train,y_train)

In [204]:
y_pred_decision_tree = dt_classifier.predict(x_test)

In [206]:
y_pred_decision_tree

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0,
       1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1,
       0, 1, 1, 0, 1, 0, 1])

### Decision Tree Accuracy

In [209]:
print('Decision Tree Accuracy (%) is:', metrics.accuracy_score(y_pred_decision_tree, y_test))

Decision Tree Accuracy (%) is: 0.7024390243902439


### Confusion Matrics

In [212]:
dt_confusion_matrics = confusion_matrix(y_test, y_pred_decision_tree)

In [214]:
dt_confusion_matrics

array([[ 33,  27],
       [ 34, 111]], dtype=int64)

### Improving Decision Tree Accuracy

#### Setting max_depth parameters in the decision tree classifier

In [218]:
dt_classifier_improved = DecisionTreeClassifier(criterion = 'entropy', random_state=0, max_depth=5)

In [220]:
dt_classifier_improved.fit(x_train, y_train)

In [222]:
y_pred_decision_tree_improved = dt_classifier_improved.predict(x_test)

In [224]:
print('Improved Decision Tree Accuracy (%) is:', metrics.accuracy_score(y_pred_decision_tree_improved, y_test))

Improved Decision Tree Accuracy (%) is: 0.8195121951219512


### Prediction Using a Customer test data

#### Getting a row from X_test Dataset(or Building your own)

In [232]:
x_test[100]

array([1, 1, 0, 0, 0, 5923, 2054.0, 211.0, 360.0, 1.0, 0, 7977.0, 1],
      dtype=object)

In [234]:
y_test[100]

1

In [236]:
sample_data = [[1, 1, 0, 0, 0, 5923, 2054.0, 211.0, 360.0, 1.0, 0, 7977.0, 1]]

In [238]:
type(sample_data)

list

In [240]:
# We have already imported the NumPy library at the start; we are importing it again for clarity.
import numpy as np

In [244]:
sample_data_item = np.array(sample_data, dtype=object)

In [246]:
sample_data_item

array([[1, 1, 0, 0, 0, 5923, 2054.0, 211.0, 360.0, 1.0, 0, 7977.0, 1]],
      dtype=object)

In [248]:
pred_result = dt_classifier.predict(sample_data_item)

In [250]:
pred_result

array([1])

In [252]:
pred_result[0]

1

In [254]:
dt_classifier_improved.predict(sample_data_item)

array([1])