## The Data Augmentation and Perform Machine Learning

In [3]:
# Importing libraries

import pandas as pd
import numpy as np

### Loading Data 

In [6]:
loan_data = pd.read_csv("loan-train.csv")

In [8]:
loan_data

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


### Checking Empty Values

In [11]:
loan_data.apply(lambda x: sum(x.isnull()), axis=0)

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [13]:
loan_data.count()

Loan_ID              614
Gender               601
Married              611
Dependents           599
Education            614
Self_Employed        582
ApplicantIncome      614
CoapplicantIncome    614
LoanAmount           592
Loan_Amount_Term     600
Credit_History       564
Property_Area        614
Loan_Status          614
dtype: int64

In [15]:
loan_data.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

### Filling Empty or Null Values

In [18]:
loan_data["Gender"].value_counts()

Gender
Male      489
Female    112
Name: count, dtype: int64

The higher value is Male here, so the missing value is replaced by Male here.

In [21]:
loan_data.Gender = loan_data.Gender.fillna("Male")

In [23]:
loan_data.apply(lambda x: sum(x.isnull()),axis=0)

Loan_ID               0
Gender                0
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

Data is updated correctly. Gender does not show any missing values.

In [26]:
loan_data.Married.value_counts()

Married
Yes    398
No     213
Name: count, dtype: int64

In [28]:
loan_data.Married = loan_data.Married.fillna("Yes")

In [30]:
loan_data.apply(lambda x: sum(x.isnull()),axis=0)

Loan_ID               0
Gender                0
Married               0
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [32]:
loan_data.Dependents.value_counts()

Dependents
0     345
1     102
2     101
3+     51
Name: count, dtype: int64

In [34]:
loan_data.Dependents = loan_data.Dependents.fillna("0")

In [36]:
loan_data.apply(lambda x: sum(x.isnull()),axis=0)

Loan_ID               0
Gender                0
Married               0
Dependents            0
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [38]:
loan_data.Self_Employed.value_counts()

Self_Employed
No     500
Yes     82
Name: count, dtype: int64

In [40]:
loan_data.Self_Employed = loan_data.Self_Employed.fillna('No')

In [42]:
loan_data.apply(lambda x: sum(x.isnull()),axis=0)

Loan_ID               0
Gender                0
Married               0
Dependents            0
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

#### Filling Loan Amount Data

loan_data.LoanAmount

In [46]:
loan_data.LoanAmount.min()

9.0

In [48]:
loan_data.LoanAmount.max()

700.0

In [50]:
loan_data.LoanAmount.mean()

146.41216216216216

In [52]:
loan_data.LoanAmount = loan_data.LoanAmount.fillna(loan_data.LoanAmount.mean())

#### Filling Loan Amount Term Data

In [55]:
loan_data.Loan_Amount_Term.isnull().sum()

14

In [57]:
loan_data.Loan_Amount_Term.value_counts()

Loan_Amount_Term
360.0    512
180.0     44
480.0     15
300.0     13
240.0      4
84.0       4
120.0      3
60.0       2
36.0       2
12.0       1
Name: count, dtype: int64

In [59]:
loan_data.Loan_Amount_Term = loan_data.Loan_Amount_Term.fillna(360.0)

#### Working with Credit History

In [62]:
loan_data.Credit_History.isnull().sum()

50

In [64]:
loan_data.Credit_History.value_counts()

Credit_History
1.0    475
0.0     89
Name: count, dtype: int64

In [66]:
loan_data.Credit_History = loan_data.Credit_History.fillna(1.0)

### Rechecking the Dataset for Missing Values

In [69]:
loan_data.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

### Now we have good dataset for Machine Lerning 

In [72]:
loan_data.sample(10)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y
538,LP002739,Male,Yes,0,Not Graduate,No,2917,536.0,66.0,360.0,1.0,Rural,N
250,LP001835,Male,Yes,0,Not Graduate,No,1668,3890.0,201.0,360.0,0.0,Semiurban,N
573,LP002862,Male,Yes,2,Not Graduate,No,6125,1625.0,187.0,480.0,1.0,Semiurban,N
453,LP002449,Male,Yes,0,Graduate,No,2483,2466.0,90.0,180.0,0.0,Rural,Y
284,LP001922,Male,Yes,0,Graduate,No,20667,0.0,146.412162,360.0,1.0,Rural,N
311,LP002004,Male,No,0,Not Graduate,No,2927,2405.0,111.0,360.0,1.0,Semiurban,Y
318,LP002043,Female,No,1,Graduate,No,3541,0.0,112.0,360.0,1.0,Semiurban,Y
264,LP001872,Male,No,0,Graduate,Yes,5166,0.0,128.0,360.0,1.0,Semiurban,Y
345,LP002129,Male,Yes,0,Graduate,No,2499,2458.0,160.0,360.0,1.0,Semiurban,Y


In [74]:
loan_data.shape

(614, 13)

In [76]:
loan_data.head(1)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,146.412162,360.0,1.0,Urban,Y


### Merging ZipCode Living Index Dataset based on Loan_ID

In [79]:
living_index = pd.read_csv('zipwithliving_index.csv')

In [81]:
living_index.shape

(614, 3)

In [83]:
living_index

Unnamed: 0,Loan_ID,ZipCode,LivingRank
0,LP001002,94188,1
1,LP001106,94188,1
2,LP001137,94188,1
3,LP001245,94188,1
4,LP001253,94188,1
...,...,...,...
609,LP002201,94121,4
610,LP002231,94121,4
611,LP002364,94121,4
612,LP002813,94121,4


In [85]:
len(living_index['Loan_ID'].unique())

614

In [87]:
updated_loan_data = pd.merge(loan_data, living_index)

In [89]:
updated_loan_data

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,ZipCode,LivingRank
0,LP001002,Male,No,0,Graduate,No,5849,0.0,146.412162,360.0,1.0,Urban,Y,94188,1
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.000000,360.0,1.0,Rural,N,94164,5
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.000000,360.0,1.0,Urban,Y,94120,3
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.000000,360.0,1.0,Urban,Y,94104,4
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.000000,360.0,1.0,Urban,Y,94169,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.000000,360.0,1.0,Rural,Y,94118,1
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.000000,180.0,1.0,Rural,Y,94170,1
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.000000,360.0,1.0,Urban,Y,94167,3
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.000000,360.0,1.0,Urban,Y,94121,4


In [91]:
loan_data

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,146.412162,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.000000,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.000000,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.000000,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.000000,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.000000,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.000000,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.000000,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.000000,360.0,1.0,Urban,Y


#### Finalizing the Source Dataset

In [94]:
updated_loan_data.drop(['ZipCode'], axis = 1, inplace= True)

In [96]:
updated_loan_data

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,LivingRank
0,LP001002,Male,No,0,Graduate,No,5849,0.0,146.412162,360.0,1.0,Urban,Y,1
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.000000,360.0,1.0,Rural,N,5
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.000000,360.0,1.0,Urban,Y,3
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.000000,360.0,1.0,Urban,Y,4
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.000000,360.0,1.0,Urban,Y,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.000000,360.0,1.0,Rural,Y,1
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.000000,180.0,1.0,Rural,Y,1
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.000000,360.0,1.0,Urban,Y,3
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.000000,360.0,1.0,Urban,Y,4


In [98]:
loan_data = updated_loan_data

In [100]:
loan_data

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,LivingRank
0,LP001002,Male,No,0,Graduate,No,5849,0.0,146.412162,360.0,1.0,Urban,Y,1
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.000000,360.0,1.0,Rural,N,5
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.000000,360.0,1.0,Urban,Y,3
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.000000,360.0,1.0,Urban,Y,4
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.000000,360.0,1.0,Urban,Y,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.000000,360.0,1.0,Rural,Y,1
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.000000,180.0,1.0,Rural,Y,1
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.000000,360.0,1.0,Urban,Y,3
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.000000,360.0,1.0,Urban,Y,4


### Feature Engineering

In [128]:
for i in [loan_data]:
    loan_data['TotalIncome'] = loan_data['ApplicantIncome'] + loan_data['CoapplicantIncome']

In [130]:
loan_data.shape

(614, 15)

In [132]:
loan_data.sample(10)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,LivingRank,Loan_Status,TotalIncome
586,LP002916,Male,Yes,0,Graduate,No,2297,1522.0,104.0,360.0,1.0,Urban,3,Y,3819.0
526,LP002705,Male,Yes,0,Graduate,No,3775,0.0,110.0,360.0,1.0,Semiurban,4,Y,3775.0
472,LP002519,Male,Yes,3+,Graduate,No,4691,0.0,100.0,360.0,1.0,Semiurban,3,Y,4691.0
411,LP002319,Male,Yes,0,Graduate,No,6256,0.0,160.0,360.0,1.0,Urban,4,Y,6256.0
293,LP001945,Female,No,0,Graduate,No,5417,0.0,143.0,480.0,0.0,Urban,2,N,5417.0
477,LP002530,Male,Yes,2,Graduate,No,2873,1872.0,132.0,360.0,0.0,Semiurban,3,N,4745.0
268,LP001883,Female,No,0,Graduate,No,3418,0.0,135.0,360.0,1.0,Rural,5,N,3418.0
215,LP001720,Male,Yes,3+,Not Graduate,No,3850,983.0,100.0,360.0,1.0,Semiurban,5,Y,4833.0
354,LP002143,Female,Yes,0,Graduate,No,2423,505.0,130.0,360.0,1.0,Semiurban,1,Y,2928.0
150,LP001528,Male,No,0,Graduate,No,6277,0.0,118.0,360.0,0.0,Rural,1,N,6277.0


In [134]:
loan_data['TotalIncome'].describe()

count      614.000000
mean      7024.705081
std       6458.663872
min       1442.000000
25%       4166.000000
50%       5416.500000
75%       7521.750000
max      81000.000000
Name: TotalIncome, dtype: float64

In [136]:
# Make sure to use labels=false otherwise interval will be added
loan_data['TotalIncomeGroups'] = pd.cut(loan_data['TotalIncome'], bins=20, include_lowest=True, labels=False)

In [138]:
loan_data.TotalIncomeGroups.unique()

array([ 1,  0,  2,  5,  3,  4,  9, 12,  8, 10, 15, 19,  6], dtype=int64)

In [140]:
loan_data.sample(10)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,LivingRank,Loan_Status,TotalIncome,TotalIncomeGroups
497,LP002588,Male,Yes,0,Graduate,No,4625,2857.0,111.0,12.0,1.0,Urban,1,Y,7482.0,1
544,LP002757,Female,Yes,0,Not Graduate,No,3017,663.0,102.0,360.0,1.0,Semiurban,1,Y,3680.0,0
595,LP002940,Male,No,0,Not Graduate,No,3833,0.0,110.0,360.0,1.0,Rural,1,Y,3833.0,0
22,LP001047,Male,Yes,0,Not Graduate,No,2600,1911.0,116.0,360.0,0.0,Semiurban,4,N,4511.0,0
171,LP001585,Male,Yes,3+,Graduate,No,51763,0.0,700.0,300.0,1.0,Urban,4,Y,51763.0,12
563,LP002821,Male,No,0,Not Graduate,Yes,5800,0.0,132.0,360.0,1.0,Semiurban,5,Y,5800.0,1
127,LP001449,Male,No,0,Graduate,No,3865,1640.0,146.412162,360.0,1.0,Rural,1,Y,5505.0,1
411,LP002319,Male,Yes,0,Graduate,No,6256,0.0,160.0,360.0,1.0,Urban,4,Y,6256.0,1
19,LP001041,Male,Yes,0,Graduate,No,2600,3500.0,115.0,360.0,1.0,Urban,4,Y,6100.0,1
128,LP001451,Male,Yes,1,Graduate,Yes,10513,3850.0,160.0,180.0,0.0,Urban,2,N,14363.0,3


### Splitting training data

In [143]:
loan_data.shape

(614, 16)

In [145]:
loan_data.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'LivingRank',
       'Loan_Status', 'TotalIncome', 'TotalIncomeGroups'],
      dtype='object')

In [147]:
loan_data = loan_data[['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'LivingRank',
       'TotalIncome', 'TotalIncomeGroups','Loan_Status']]

In [149]:
loan_data.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'LivingRank',
       'TotalIncome', 'TotalIncomeGroups', 'Loan_Status'],
      dtype='object')

In [151]:
# Loan_ID is not selected as it does not provide value to the ML process

all_ml_columns = loan_data.iloc[ : , 1: 15].values
all_status_column = loan_data.iloc[:, 15].values

In [153]:
all_ml_columns[0]

array(['Male', 'No', '0', 'Graduate', 'No', 5849, 0.0, 146.41216216216216,
       360.0, 1.0, 'Urban', 1, 5849.0, 1], dtype=object)

In [155]:
all_status_column

array(['Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'Y',
       'N', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'N', 'Y', 'N', 'N', 'N', 'Y',
       'Y', 'Y', 'N', 'Y', 'N', 'N', 'N', 'Y', 'N', 'Y', 'N', 'Y', 'Y',
       'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y',
       'N', 'N', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'N',
       'N', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'N', 'N',
       'N', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y',
       'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'N', 'N', 'N', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'N', 'N', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'N',
       'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'N', 'N

#### Generating Training and Test Dataset

In [158]:
from sklearn.model_selection import train_test_split
# test_size = 33% or 1/3

In [159]:
x_train, x_test, y_train, y_test = train_test_split(all_ml_columns, all_status_column, test_size = 1/3, random_state= 0)

In [160]:
x_train

array([['Male', 'Yes', '3+', ..., 3, 5703.0, 1],
       ['Male', 'Yes', '0', ..., 5, 5970.0, 1],
       ['Male', 'Yes', '3+', ..., 1, 4106.0, 0],
       ...,
       ['Male', 'Yes', '3+', ..., 2, 8334.0, 1],
       ['Male', 'Yes', '0', ..., 1, 6033.0, 1],
       ['Female', 'Yes', '0', ..., 1, 6486.0, 1]], dtype=object)

In [161]:
x_train.shape

(409, 14)

In [162]:
x_test

array([['Male', 'No', '0', ..., 1, 7085.0, 1],
       ['Female', 'No', '0', ..., 2, 4230.0, 0],
       ['Male', 'Yes', '0', ..., 2, 10039.0, 2],
       ...,
       ['Male', 'Yes', '0', ..., 1, 3716.0, 0],
       ['Male', 'Yes', '2', ..., 3, 2889.0, 0],
       ['Male', 'Yes', '0', ..., 5, 24996.0, 5]], dtype=object)

In [163]:
x_test.shape

(205, 14)

In [170]:
y_train

array(['Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'N',
       'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N',
       'N', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'N', 'Y', 'N', 'N', 'N',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'N', 'N', 'Y', 'Y', 'N', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y',
       'Y', 'Y', 'N', 'N', 'Y', 'N', 'Y', 'N', 'N', 'N', 'Y', 'N', 'Y',
       'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'N',
       'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'N',
       'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'N',
       'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'N', 'N', 'N',
       'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'N', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y

In [172]:
y_train.shape

(409,)

In [174]:
y_test

array(['Y', 'N', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'N', 'N', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y',
       'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N',
       'Y', 'N', 'Y', 'N', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'N', 'N', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'N', 'Y', 'N', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'N', 'Y',
       'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N',
       'Y', 'Y', 'N', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y',
       'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'N',
       'N', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y

In [176]:
y_test.shape

(205,)

### Logistic Regression (A Quick test to show why you need encoding)

In [179]:
from sklearn.linear_model import LogisticRegression

In [181]:
classifier = LogisticRegression( random_state = 0 )

In [183]:
classifier.fit(x_train,y_train)

ValueError: could not convert string to float: 'Male'

#### Encoding categorical data and independent vars

In [276]:
x_train[0]

array([1, 1, 3, 1, 1, 5703, 0.0, 130.0, 360.0, 1.0, 0, 3, 5703.0, 1],
      dtype=object)

We need to work on 6 columns, or we can say encoding 6 columns.

In [279]:
from sklearn.preprocessing import LabelEncoder

In [281]:
labelencoder_X = LabelEncoder()

In [283]:
for i in range(0, 5):
    x_train[: ,i] = labelencoder_X.fit_transform(x_train[:, i])

In [285]:
x_train[:,10] = labelencoder_X.fit_transform(x_train[:,10])

In [287]:
x_train[120]

array([1, 1, 2, 0, 0, 3510, 4416.0, 243.0, 360.0, 1.0, 0, 3, 7926.0, 1],
      dtype=object)

In [289]:
x_train

array([[1, 1, 3, ..., 3, 5703.0, 1],
       [1, 1, 0, ..., 5, 5970.0, 1],
       [1, 1, 3, ..., 1, 4106.0, 0],
       ...,
       [1, 1, 3, ..., 2, 8334.0, 1],
       [1, 1, 0, ..., 1, 6033.0, 1],
       [0, 1, 0, ..., 1, 6486.0, 1]], dtype=object)

#### Dependent Variable or Y Encoding

##### y=f(x)

In [293]:
labelencoder_y = LabelEncoder()

In [295]:
y_train = labelencoder_y.fit_transform(y_train)

In [297]:
y_train

array([1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1,

In [299]:
y_train[120]

1

#### Test Dataeset Encoding

In [302]:
for i in range(0, 5):
    x_test[:, i] = labelencoder_X.fit_transform(x_test[:, i])
x_test[:,10] = labelencoder_X.fit_transform(x_test[:, 10])

In [304]:
x_test

array([[1, 0, 0, ..., 1, 7085.0, 1],
       [0, 0, 0, ..., 2, 4230.0, 0],
       [1, 1, 0, ..., 2, 10039.0, 2],
       ...,
       [1, 1, 0, ..., 1, 3716.0, 0],
       [1, 1, 2, ..., 3, 2889.0, 0],
       [1, 1, 0, ..., 5, 24996.0, 5]], dtype=object)

In [306]:
x_test[40]

array([0, 0, 0, 0, 0, 3244, 0.0, 80.0, 360.0, 1.0, 2, 3, 3244.0, 0],
      dtype=object)

In [308]:
y_test = labelencoder_y.fit_transform(y_test)

In [310]:
y_test[100]

1

### Re-applying Logistics Regression

In [313]:
lr_classifier = LogisticRegression(random_state =0)

In [315]:
lr_classifier.fit(x_train,y_train,)

In [317]:
# Predicting the Test Set Results
y_pred_logistic_regression = lr_classifier.predict(x_test)

In [319]:
y_pred_logistic_regression

array([1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1], dtype=int64)

### Model Accuracy

In [322]:
from sklearn import metrics

In [324]:
print(' Logistic Regression Accuracy (%) is: ' ,metrics.accuracy_score(y_pred_logistic_regression, y_test))

 Logistic Regression Accuracy (%) is:  0.7707317073170732


### Confusion Matrix

In [327]:
from sklearn.metrics import confusion_matrix

In [329]:
lr_confusion_matrix = confusion_matrix(y_test, y_pred_logistic_regression)

In [331]:
lr_confusion_matrix

array([[ 22,  38],
       [  9, 136]], dtype=int64)

### Using Decision Tree Algorithm

In [334]:
# Fitting Decision Tree Classification to the Training set
from sklearn.tree import DecisionTreeClassifier

In [336]:
dt_classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)

In [338]:
dt_classifier.fit(x_train,y_train)

In [340]:
y_pred_decision_tree = dt_classifier.predict(x_test)

In [342]:
y_pred_decision_tree

array([1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1,
       1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       0, 1, 1, 0, 1, 0, 1], dtype=int64)

### Decision Tree Accuracy

In [345]:
print('Decision Tree Accuracy (%) is:', metrics.accuracy_score(y_pred_decision_tree, y_test))

Decision Tree Accuracy (%) is: 0.6926829268292682


### Confusion Matrics

In [348]:
dt_confusion_matrics = confusion_matrix(y_test, y_pred_decision_tree)

In [350]:
dt_confusion_matrics

array([[ 33,  27],
       [ 36, 109]], dtype=int64)

### Improving Decision Tree Accuracy

#### Setting max_depth parameters in the decision tree classifier

In [354]:
dt_classifier_improved = DecisionTreeClassifier(criterion = 'entropy', random_state=0, max_depth=5)

In [356]:
dt_classifier_improved.fit(x_train, y_train)

In [358]:
y_pred_decision_tree_improved = dt_classifier_improved.predict(x_test)

In [360]:
print('Improved Decision Tree Accuracy (%) is:', metrics.accuracy_score(y_pred_decision_tree_improved, y_test))

Improved Decision Tree Accuracy (%) is: 0.8146341463414634


### Prediction Using a Customer test data

#### Getting a row from X_test Dataset(or Building your own)

In [384]:
x_test[100]

array([1, 1, 0, 0, 0, 5923, 2054.0, 211.0, 360.0, 1.0, 0, 3, 7977.0, 1],
      dtype=object)

In [366]:
y_test[100]

1

In [386]:
sample_data = [[1, 1, 0, 0, 0, 5923, 2054.0, 211.0, 360.0, 1.0, 0, 3, 7977.0, 1]]

In [388]:
type(sample_data)

list

In [390]:
# We have already imported the NumPy library at the start; we are importing it again for clarity.
import numpy as np

In [392]:
sample_data_item = np.array(sample_data, dtype=object)

In [394]:
sample_data_item

array([[1, 1, 0, 0, 0, 5923, 2054.0, 211.0, 360.0, 1.0, 0, 3, 7977.0, 1]],
      dtype=object)

In [396]:
pred_result = dt_classifier.predict(sample_data_item)

In [398]:
pred_result

array([1], dtype=int64)

In [400]:
pred_result[0]

1

In [402]:
dt_classifier_improved.predict(sample_data_item)

array([1], dtype=int64)