In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm, tree

In [2]:
data = pd.read_csv('carInsurance_train.csv', index_col='Id')

In [3]:
data.head()

Unnamed: 0_level_0,Age,Job,Marital,Education,Default,Balance,HHInsurance,CarLoan,Communication,LastContactDay,LastContactMonth,NoOfContacts,DaysPassed,PrevAttempts,Outcome,CallStart,CallEnd,CarInsurance
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,32,management,single,tertiary,0,1218,1,0,telephone,28,jan,2,-1,0,,13:45:20,13:46:30,0
2,32,blue-collar,married,primary,0,1156,1,0,,26,may,5,-1,0,,14:49:03,14:52:08,0
3,29,management,single,tertiary,0,637,1,0,cellular,3,jun,1,119,1,failure,16:30:24,16:36:04,1
4,25,student,single,primary,0,373,1,0,cellular,11,may,2,-1,0,,12:06:43,12:20:22,1
5,30,management,married,tertiary,0,2694,0,0,cellular,3,jun,1,-1,0,,14:35:44,14:38:56,0


In [4]:
data.shape

(4000, 18)

In [5]:
data.columns

Index(['Age', 'Job', 'Marital', 'Education', 'Default', 'Balance',
       'HHInsurance', 'CarLoan', 'Communication', 'LastContactDay',
       'LastContactMonth', 'NoOfContacts', 'DaysPassed', 'PrevAttempts',
       'Outcome', 'CallStart', 'CallEnd', 'CarInsurance'],
      dtype='object')

In [6]:
data.describe

<bound method NDFrame.describe of       Age           Job   Marital  Education  Default  Balance  HHInsurance  \
Id                                                                            
1      32    management    single   tertiary        0     1218            1   
2      32   blue-collar   married    primary        0     1156            1   
3      29    management    single   tertiary        0      637            1   
4      25       student    single    primary        0      373            1   
5      30    management   married   tertiary        0     2694            0   
...   ...           ...       ...        ...      ...      ...          ...   
3996   28    technician    single   tertiary        0        0            1   
3997   49        admin.  divorced  secondary        0      124            1   
3998   27        admin.    single  secondary        0     -400            0   
3999   36  entrepreneur    single   tertiary        0      658            1   
4000   45      ser

In [7]:
data.dtypes

Age                  int64
Job                 object
Marital             object
Education           object
Default              int64
Balance              int64
HHInsurance          int64
CarLoan              int64
Communication       object
LastContactDay       int64
LastContactMonth    object
NoOfContacts         int64
DaysPassed           int64
PrevAttempts         int64
Outcome             object
CallStart           object
CallEnd             object
CarInsurance         int64
dtype: object

In [8]:
data_n = data.drop(data.index[1742])

In [9]:
data_n.isnull().sum()

Age                    0
Job                   19
Marital                0
Education            169
Default                0
Balance                0
HHInsurance            0
CarLoan                0
Communication        902
LastContactDay         0
LastContactMonth       0
NoOfContacts           0
DaysPassed             0
PrevAttempts           0
Outcome             3041
CallStart              0
CallEnd                0
CarInsurance           0
dtype: int64

In [10]:
data_n['Job'] = data_n['Job'].astype(object).fillna(method='pad')
data_n['Education'] = data_n['Education'].astype(object).fillna(method='pad')

In [11]:
data_n['Communication'] = data_n['Communication'].astype(object).fillna('none')
data_n['Outcome'] = data_n['Outcome'].astype(object).fillna('none')

In [12]:
data_n.isnull().sum()

Age                 0
Job                 0
Marital             0
Education           0
Default             0
Balance             0
HHInsurance         0
CarLoan             0
Communication       0
LastContactDay      0
LastContactMonth    0
NoOfContacts        0
DaysPassed          0
PrevAttempts        0
Outcome             0
CallStart           0
CallEnd             0
CarInsurance        0
dtype: int64

In [13]:
data_n['AgeBinned'] = pd.qcut(data_n['Age'], 5, labels=False)
data_n['BalanceBinned'] = pd.qcut(data_n['Balance'], 5, labels=False)

In [14]:
time = pd.DatetimeIndex(data_n['CallStart'])
time_s = (time.hour * 60) + time.minute + (time.second/60)
time_s = pd.DataFrame(time_s)

time2 = pd.DatetimeIndex(data_n['CallEnd'])
time_e = (time2.hour * 60) + time2.minute + (time2.second/60)
time_e = pd.DataFrame(time_e)

In [15]:
data_n['CallStart'] = time_s
data_n['CallEnd'] = time_e

In [16]:
data_n['CallDuration'] = data_n['CallEnd'] - data_n['CallStart']

In [17]:
data_n['CallDurationBinned'] = pd.qcut(data_n['Age'], 5, labels=False)

In [18]:
data_n.drop(['Age', 'Balance', 'CallStart', 'CallEnd', 'CallDuration'], axis=1, inplace=True)

In [19]:
Job = pd.get_dummies(data=data_n['Job'], prefix='Job')
Marital = pd.get_dummies(data=data_n['Marital'], prefix='Marital')
Education = pd.get_dummies(data=data_n['Education'], prefix='Education')
Communication = pd.get_dummies(data=data_n['Communication'], prefix='Communication')
LastContactMonth = pd.get_dummies(data=data_n['LastContactMonth'], prefix='LastContactMonth')
Outcome = pd.get_dummies(data=data_n['Outcome'], prefix='Outcome')

In [20]:
data_n.drop(['Job', 'Marital', 'Education', 'Communication', 'LastContactMonth', 'Outcome'], axis=1, inplace=True)

In [21]:
data = pd.concat([data_n, Job, Marital, Education, LastContactMonth, Communication, Outcome], axis=1)

In [22]:
data.columns

Index(['Default', 'HHInsurance', 'CarLoan', 'LastContactDay', 'NoOfContacts',
       'DaysPassed', 'PrevAttempts', 'CarInsurance', 'AgeBinned',
       'BalanceBinned', 'CallDurationBinned', 'Job_admin.', 'Job_blue-collar',
       'Job_entrepreneur', 'Job_housemaid', 'Job_management', 'Job_retired',
       'Job_self-employed', 'Job_services', 'Job_student', 'Job_technician',
       'Job_unemployed', 'Marital_divorced', 'Marital_married',
       'Marital_single', 'Education_primary', 'Education_secondary',
       'Education_tertiary', 'LastContactMonth_apr', 'LastContactMonth_aug',
       'LastContactMonth_dec', 'LastContactMonth_feb', 'LastContactMonth_jan',
       'LastContactMonth_jul', 'LastContactMonth_jun', 'LastContactMonth_mar',
       'LastContactMonth_may', 'LastContactMonth_nov', 'LastContactMonth_oct',
       'LastContactMonth_sep', 'Communication_cellular', 'Communication_none',
       'Communication_telephone', 'Outcome_failure', 'Outcome_none',
       'Outcome_other', 'Out

In [23]:
X = data.drop(['CarInsurance'], axis=1).values
y = data['CarInsurance'].values

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)

In [25]:
#knn
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
print('knn accuracy score:', accuracy_score(y_test, knn.predict(X_test)))
knn_score = cross_val_score(knn, X, y, cv=10).mean()
print('cross validation score', knn_score)

knn accuracy score: 0.6575
cross validation score 0.6634280839255245


In [26]:
#Logistic Regression
lr = LogisticRegression()
lr.fit(X_train, y_train)
print('lr accuracy score:', accuracy_score(y_test, lr.predict(X_test)))
lr_score = cross_val_score(lr, X, y, cv=10).mean()
print('cross validation score', lr_score)

lr accuracy score: 0.7325




cross validation score 0.7227009403183771




In [27]:
#SVM classifier
SVM = svm.SVC()
SVM.fit(X_train,y_train)
print('SVM accuracy score:', accuracy_score(y_test, SVM.predict(X_test)))
SVM_score = cross_val_score(SVM, X, y, cv=10).mean()
print('cross validation score', SVM_score)



SVM accuracy score: 0.705




cross validation score 0.7107083591147446


In [28]:
# Random Forest
rf = RandomForestClassifier(n_estimators=1000, max_depth=None, min_samples_split=10, class_weight='balanced')
rf.fit(X_train, y_train)
print('rf accuracy score:', accuracy_score(y_test, rf.predict(X_test)))
rf_score = cross_val_score(rf, X, y, cv=10).mean()
print('cross validation score', rf_score)

rf accuracy score: 0.71125
cross validation score 0.7289565981662385


In [29]:
# Decision Trees
dt = tree.DecisionTreeClassifier(random_state=0, class_weight='balanced', min_weight_fraction_leaf=0.01)
dt = dt.fit(X_train, y_train)
print('df accuracy score:', accuracy_score(y_test, dt.predict(X_test)))
dt_score = cross_val_score(dt, X, y, cv=10).mean()
print('cross validation score', dt_score)

df accuracy score: 0.64875
cross validation score 0.6851444712154451
