# Introduction

This notebook contains the code wriiten for my participation in the Data Science Nigeria 2019 challenge insurance prediction hosted on Zindi Africa


# Here is the competition link

- https://zindi.africa/competitions/data-science-nigeria-2019-challenge-1-insurance-prediction/

- You can join the competition (make sure you are registered on zindi before hand) and download the data here

- https://zindi.africa/competitions/data-science-nigeria-2019-challenge-1-insurance-prediction/data

- Also make submissions on

- https://zindi.africa/competitions/data-science-nigeria-2019-challenge-1-insurance-prediction/submissions

- Upload your csv submission file to make submission

# Importing the necessary libraries and Exploring the data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from rgf.sklearn import RGFClassifier
from sklearn.ensemble import IsolationForest, RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, StratifiedKFold, train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier

sns.set()
%matplotlib inline

pd.set_option('max_columns', 500)
pd.set_option('max_rows', 500)
pd.set_option('max_colwidth', 500)

In [2]:
np.random.seed(23)

In [3]:
train = pd.read_csv('data/train_data.csv')
test = pd.read_csv('data/test_data.csv')
vd = pd.read_csv('data/VariableDescription.csv')
submit = pd.read_csv('data/sample_submission.csv')

In [4]:
train.head(20)

Unnamed: 0,Customer Id,YearOfObservation,Insured_Period,Residential,Building_Painted,Building_Fenced,Garden,Settlement,Building Dimension,Building_Type,Date_of_Occupancy,NumberOfWindows,Geo_Code,Claim
0,H14663,2013,1.0,0,N,V,V,U,290.0,1,1960.0,.,1053,0
1,H2037,2015,1.0,0,V,N,O,R,490.0,1,1850.0,4,1053,0
2,H3802,2014,1.0,0,N,V,V,U,595.0,1,1960.0,.,1053,0
3,H3834,2013,1.0,0,V,V,V,U,2840.0,1,1960.0,.,1053,0
4,H5053,2014,1.0,0,V,N,O,R,680.0,1,1800.0,3,1053,0
5,H4977,2012,1.0,0,V,N,O,R,535.0,1,1980.0,3,1143,0
6,H7390,2012,1.0,0,N,V,V,U,2830.0,1,1988.0,.,1143,0
7,H14488,2015,1.0,0,N,V,V,U,4952.0,1,1988.0,.,1160,0
8,H19355,2014,1.0,0,V,N,O,R,2735.0,1,2013.0,3,1173,1
9,H18601,2015,1.0,0,V,N,O,R,520.0,1,2011.0,2,1224,0


In [5]:
train.shape

(7160, 14)

In [6]:
train.describe()

Unnamed: 0,YearOfObservation,Insured_Period,Residential,Building Dimension,Building_Type,Date_of_Occupancy,Claim
count,7160.0,7160.0,7160.0,7054.0,7160.0,6652.0,7160.0
mean,2013.669553,0.909758,0.305447,1883.72753,2.186034,1964.456404,0.228212
std,1.383769,0.239756,0.460629,2278.157745,0.940632,36.002014,0.419709
min,2012.0,0.0,0.0,1.0,1.0,1545.0,0.0
25%,2012.0,0.997268,0.0,528.0,2.0,1960.0,0.0
50%,2013.0,1.0,0.0,1083.0,2.0,1970.0,0.0
75%,2015.0,1.0,1.0,2289.75,3.0,1980.0,0.0
max,2016.0,1.0,1.0,20940.0,4.0,2016.0,1.0


In [7]:
train.info

<bound method DataFrame.info of      Customer Id  YearOfObservation  Insured_Period  Residential  \
0         H14663               2013        1.000000            0   
1          H2037               2015        1.000000            0   
2          H3802               2014        1.000000            0   
3          H3834               2013        1.000000            0   
4          H5053               2014        1.000000            0   
...          ...                ...             ...          ...   
7155       H5290               2012        1.000000            1   
7156       H5926               2013        1.000000            0   
7157       H6204               2016        0.038251            0   
7158       H6537               2013        1.000000            0   
7159       H7470               2014        1.000000            0   

     Building_Painted Building_Fenced Garden Settlement  Building Dimension  \
0                   N               V      V          U               29

In [8]:
vd

Unnamed: 0,Variable,Description
0,Customer Id,Identification number for the Policy holder
1,YearOfObservation,year of observation for the insured policy
2,Insured_Period,"duration of insurance policy in Olusola Insurance. (Ex: Full year insurance, Policy Duration = 1; 6 months = 0.5"
3,Residential,is the building a residential building or not
4,Building_Painted,"is the building painted or not (N-Painted, V-Not Painted)"
5,Building_Fenced,"is the building fence or not (N-Fenced, V-Not Fenced)"
6,Garden,building has garden or not (V-has garden; O-no garden)
7,Settlement,Area where the building is located. (R- rural area; U- urban area)
8,Building Dimension,Size of the insured building in m2
9,Building_Type,"The type of building (Type 1, 2, 3, 4)"


In [9]:
test.head(20)

Unnamed: 0,Customer Id,YearOfObservation,Insured_Period,Residential,Building_Painted,Building_Fenced,Garden,Settlement,Building Dimension,Building_Type,Date_of_Occupancy,NumberOfWindows,Geo_Code
0,H11920,2013,1.0,0,V,N,O,R,300.0,1,1960.0,3,3310
1,H11921,2016,0.997268,0,V,N,O,R,300.0,1,1960.0,3,3310
2,H9805,2013,0.369863,0,V,V,V,U,790.0,1,1960.0,.,3310
3,H7493,2014,1.0,0,V,N,O,R,1405.0,1,2004.0,3,3321
4,H7494,2016,1.0,0,V,N,O,R,1405.0,1,2004.0,3,3321
5,H10545,2012,1.0,0,V,V,V,U,3225.0,2,1988.0,.,4070
6,H8962,2015,0.986301,0,N,V,V,U,31.0,2,,.,4070
7,H1015,2013,1.0,0,V,V,V,U,1400.0,2,1980.0,.,4088
8,H9710,2012,1.0,0,V,V,V,U,1300.0,2,,.,4094
9,H9029,2012,1.0,0,N,V,V,U,1200.0,2,,.,4205


In [10]:
test.shape

(3069, 13)

In [11]:
test.describe()

Unnamed: 0,YearOfObservation,Insured_Period,Residential,Building Dimension,Building_Type,Date_of_Occupancy
count,3069.0,3069.0,3069.0,3056.0,3069.0,2341.0
mean,2013.593679,0.922806,0.224177,1666.790576,2.3594,1966.781717
std,1.372138,0.219066,0.417107,2251.180599,0.998464,25.238702
min,2012.0,0.0,0.0,1.0,1.0,1750.0
25%,2012.0,1.0,0.0,470.0,2.0,1960.0
50%,2013.0,1.0,0.0,900.0,2.0,1968.0
75%,2015.0,1.0,0.0,1957.0,3.0,1980.0
max,2016.0,1.0,1.0,30745.0,4.0,2012.0


In [12]:
test.info

<bound method DataFrame.info of      Customer Id  YearOfObservation  Insured_Period  Residential  \
0         H11920               2013        1.000000            0   
1         H11921               2016        0.997268            0   
2          H9805               2013        0.369863            0   
3          H7493               2014        1.000000            0   
4          H7494               2016        1.000000            0   
...          ...                ...             ...          ...   
3064      H11583               2015        1.000000            0   
3065      H11720               2012        1.000000            0   
3066      H11721               2012        1.000000            0   
3067      H12408               2013        1.000000            0   
3068       H9021               2012        1.000000            0   

     Building_Painted Building_Fenced Garden Settlement  Building Dimension  \
0                   V               N      O          R               30

In [13]:
submit.head()

Unnamed: 0,Customer Id,Claim
0,H0,1
1,H10000,1
2,H10001,1
3,H10002,1
4,H10003,1


In [14]:
submit.shape

(3068, 2)

In [15]:
submit.info

<bound method DataFrame.info of      Customer Id  Claim
0             H0      1
1         H10000      1
2         H10001      1
3         H10002      1
4         H10003      1
...          ...    ...
3063       H9987      1
3064       H9988      1
3065       H9994      1
3066       H9996      1
3067       H9998      1

[3068 rows x 2 columns]>

In [16]:
submit.describe()

Unnamed: 0,Claim
count,3068.0
mean,1.0
std,0.0
min,1.0
25%,1.0
50%,1.0
75%,1.0
max,1.0


In [17]:
# Checking for duplicates
train.duplicated().sum()

0

In [18]:
# Checking for duplicates
test.duplicated().sum()

0

In [19]:
# Dropping columns that do not contribute to predicting our target 
train.drop(['Customer Id','Geo_Code'], inplace= True, axis=1) 

In [20]:
# Doing same in the test data too
test.drop(['Customer Id', 'Geo_Code'], inplace= True, axis=1)

In [21]:
# Checking for missing varibles
train.isnull().sum()

YearOfObservation       0
Insured_Period          0
Residential             0
Building_Painted        0
Building_Fenced         0
Garden                  7
Settlement              0
Building Dimension    106
Building_Type           0
Date_of_Occupancy     508
NumberOfWindows         0
Claim                   0
dtype: int64

In [22]:
test.isnull().sum()

YearOfObservation       0
Insured_Period          0
Residential             0
Building_Painted        0
Building_Fenced         0
Garden                  4
Settlement              0
Building Dimension     13
Building_Type           0
Date_of_Occupancy     728
NumberOfWindows         0
dtype: int64

In [23]:
# Doing forward fill on the missing variables
train.fillna( method='ffill', inplace=True)

In [24]:
test.fillna(method='ffill', inplace= True)

# Feature Engineering

In [25]:
train['NumberOfWindows'].value_counts()

   .    3551
4        939
3        844
5        639
2        363
6        306
7        211
8        116
1         75
>=10      67
9         49
Name: NumberOfWindows, dtype: int64

In [26]:
def window_code(x):
    if x == '>=10':
        return 10
    elif x == '   .':
        return -1
    else:
        return int(x)


train['NumberOfWindows']= train['NumberOfWindows'].apply(window_code)
test['NumberOfWindows']= test['NumberOfWindows'].apply(window_code)

In [27]:
train['NumberOfWindows'].value_counts()

-1     3551
 4      939
 3      844
 5      639
 2      363
 6      306
 7      211
 8      116
 1       75
 10      67
 9       49
Name: NumberOfWindows, dtype: int64

In [28]:
combined = train.append(test, ignore_index=True).copy()

In [29]:
combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10229 entries, 0 to 10228
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   YearOfObservation   10229 non-null  int64  
 1   Insured_Period      10229 non-null  float64
 2   Residential         10229 non-null  int64  
 3   Building_Painted    10229 non-null  object 
 4   Building_Fenced     10229 non-null  object 
 5   Garden              10229 non-null  object 
 6   Settlement          10229 non-null  object 
 7   Building Dimension  10229 non-null  float64
 8   Building_Type       10229 non-null  int64  
 9   Date_of_Occupancy   10229 non-null  float64
 10  NumberOfWindows     10229 non-null  int64  
 11  Claim               7160 non-null   float64
dtypes: float64(4), int64(4), object(4)
memory usage: 959.1+ KB


In [30]:
# Encoding categorical varibles
combined= pd.get_dummies(combined)

In [31]:
combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10229 entries, 0 to 10228
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   YearOfObservation   10229 non-null  int64  
 1   Insured_Period      10229 non-null  float64
 2   Residential         10229 non-null  int64  
 3   Building Dimension  10229 non-null  float64
 4   Building_Type       10229 non-null  int64  
 5   Date_of_Occupancy   10229 non-null  float64
 6   NumberOfWindows     10229 non-null  int64  
 7   Claim               7160 non-null   float64
 8   Building_Painted_N  10229 non-null  uint8  
 9   Building_Painted_V  10229 non-null  uint8  
 10  Building_Fenced_N   10229 non-null  uint8  
 11  Building_Fenced_V   10229 non-null  uint8  
 12  Garden_O            10229 non-null  uint8  
 13  Garden_V            10229 non-null  uint8  
 14  Settlement_R        10229 non-null  uint8  
 15  Settlement_U        10229 non-null  uint8  
dtypes: f

In [32]:
combined.dtypes

YearOfObservation       int64
Insured_Period        float64
Residential             int64
Building Dimension    float64
Building_Type           int64
Date_of_Occupancy     float64
NumberOfWindows         int64
Claim                 float64
Building_Painted_N      uint8
Building_Painted_V      uint8
Building_Fenced_N       uint8
Building_Fenced_V       uint8
Garden_O                uint8
Garden_V                uint8
Settlement_R            uint8
Settlement_U            uint8
dtype: object

# Spliting the data for modelling

In [33]:
train = combined[:7160].copy()
test = combined[7160:].copy()

In [34]:
test.drop('Claim', axis=1, inplace=True)

In [35]:
X = train.drop('Claim', axis=1).copy()
y = train['Claim'].copy()

In [36]:
X.shape,y.shape

((7160, 15), (7160,))

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=5, stratify=y) 

# Modelling

- Training several models and choosing the best performing model
- the evaluation matrix being used for this competition is the `roc_auc` metric 

In [None]:


lg = LogisticRegression(max_iter=10000, random_state=5, n_jobs=-1, verbose=5)
lg.fit(X_train, y_train)
pred = lg.predict_proba(X_test)
pred = [x[1] for x in pred]
print(roc_auc_score(y_test, pred))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


0.7006929913099136


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   10.4s finished


In [None]:
svc = SVC(C=1.2, gamma=0.9, probability=True,random_state=5, verbose=5,kernel='rbf')
svc.fit(X_train, y_train)
pred1 = svc.predict_proba(X_test)
pred1 = [x[1] for x in pred1]
print(roc_auc_score(y_test, pred1))

[LibSVM]0.5859063046207627


In [None]:
xgb = XGBClassifier(random_state=5, learning_rate=0.01)
xgb.fit(X_train, y_train)
pred2 = xgb.predict_proba(X_test)
pred2 = [x[1] for x in pred2]
print(roc_auc_score(y_test, pred2))



0.7072376357056696


In [None]:

lgm = LGBMClassifier(random_state=5,learning_rate=0.01)
lgm.fit(X_train, y_train)
pred3 = lgm.predict_proba(X_test)
pred3 = [x[1] for x in pred3]
print(roc_auc_score(y_test, pred3))

0.7051020408163264


In [None]:
rgf = RGFClassifier(learning_rate= 0.01)
rgf.fit(X_train, y_train)
pred4 = rgf.predict_proba(X_test)
pred4 = [x[1] for x in pred4]
print(roc_auc_score(y_test, pred4))



0.712476305359297


In [None]:
rfc = RandomForestClassifier(random_state=5)
rfc.fit(X_train, y_train)
pred8 = rfc.predict_proba(X_test)
predx = rfc.predict(X_test)
pred8 = [x[1] for x in pred8]
print(roc_auc_score(y_test, pred8))
print(classification_report(y_test, predx))

0.670462322444056
              precision    recall  f1-score   support

         0.0       0.81      0.91      0.86      1658
         1.0       0.47      0.28      0.35       490

    accuracy                           0.76      2148
   macro avg       0.64      0.59      0.60      2148
weighted avg       0.73      0.76      0.74      2148



In [44]:
ctbc = CatBoostClassifier(learning_rate= 0.01)
ctbc.fit(X_train, y_train)
pred4 = ctbc.predict_proba(X_test)
pred4 = [x[1] for x in pred4]
print(roc_auc_score(y_test, pred4))

0:	learn: 0.6885792	total: 171ms	remaining: 2m 50s
1:	learn: 0.6845003	total: 180ms	remaining: 1m 29s
2:	learn: 0.6802274	total: 195ms	remaining: 1m 4s
3:	learn: 0.6761674	total: 205ms	remaining: 51.1s
4:	learn: 0.6720921	total: 222ms	remaining: 44.2s
5:	learn: 0.6677554	total: 239ms	remaining: 39.7s
6:	learn: 0.6636984	total: 255ms	remaining: 36.2s
7:	learn: 0.6594815	total: 270ms	remaining: 33.5s
8:	learn: 0.6559958	total: 285ms	remaining: 31.4s
9:	learn: 0.6520384	total: 300ms	remaining: 29.7s
10:	learn: 0.6482278	total: 315ms	remaining: 28.3s
11:	learn: 0.6446524	total: 333ms	remaining: 27.4s
12:	learn: 0.6409847	total: 370ms	remaining: 28.1s
13:	learn: 0.6372513	total: 387ms	remaining: 27.2s
14:	learn: 0.6338819	total: 408ms	remaining: 26.8s
15:	learn: 0.6306594	total: 418ms	remaining: 25.7s
16:	learn: 0.6276279	total: 429ms	remaining: 24.8s
17:	learn: 0.6243774	total: 446ms	remaining: 24.3s
18:	learn: 0.6212310	total: 458ms	remaining: 23.6s
19:	learn: 0.6181347	total: 471ms	remai

# Making prediction on the test data using stratified Kfold cross validation

In [40]:
StratifiedKFold()


StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

In [42]:
skf = StratifiedKFold( n_splits=5, shuffle=True, random_state=5)

In [None]:
i = 0

for train_index, test_index in skf.split(X, y):
    i+=1
    print('{} of KFold {}'.format(i,skf.n_splits))
    xtr, xvl = X.loc[train_index], X.loc[test_index]
    ytr, yvl = y.loc[train_index], y.loc[test_index]

    #model
    xgbc = XGBClassifier(random_state=5,
                        learning_rate= 0.01, 
                        max_depth=5)
    xgbc.fit(xtr, ytr)
    pred101 = xgbc.predict_proba(xvl)
    pred101 = [x[1] for x in pred101]
    print(roc_auc_score(yvl, pred101))


 

1 of KFold 5




0.690792831072011
1 of KFold 5




0.7185409661394552
1 of KFold 5




0.7192079372327619
1 of KFold 5




0.711550223476829
1 of KFold 5




0.7003265667593784


In [43]:
i=0

for train_index, test_index in skf.split(X, y):
    i+=1
    print('{} of KFold {}'.format(i,skf.n_splits))
    xtr, xvl = X.loc[train_index], X.loc[test_index]
    ytr, yvl = y.loc[train_index], y.loc[test_index]

    #model
    rgfc = RGFClassifier(
                        learning_rate= 0.01
                        )
    rgfc.fit(xtr, ytr)
    pred102 = rgfc.predict_proba(xvl)
    pred102 = [x[1] for x in pred102]
    print(roc_auc_score(yvl, pred102))
 

1 of KFold 5




0.7001991368885832
2 of KFold 5
0.7336156198541519
3 of KFold 5
0.7205294256022805
4 of KFold 5
0.7168610845890931
5 of KFold 5
0.7225677003334855


In [None]:
i=1

for train_index, test_index in skf.split(X, y):
    i+=1
    print('{} of KFold {}'.format(i,skf.n_splits))
    xtr, xvl = X.loc[train_index], X.loc[test_index]
    ytr, yvl = y.loc[train_index], y.loc[test_index]

    #model
    ctb = CatBoostClassifier(random_state=5,
                        learning_rate= 0.01, 
                        max_depth=5)
    ctb.fit(xtr, ytr)
    pred100 = ctb.predict_proba(xvl)
    pred100 = [x[1] for x in pred100]
    print(roc_auc_score(yvl, pred100))
 

1 of KFold 5
0:	learn: 0.6886886	total: 65.6ms	remaining: 1m 5s
1:	learn: 0.6843838	total: 185ms	remaining: 1m 32s
2:	learn: 0.6801509	total: 250ms	remaining: 1m 23s
3:	learn: 0.6758104	total: 361ms	remaining: 1m 29s
4:	learn: 0.6716200	total: 501ms	remaining: 1m 39s
5:	learn: 0.6676551	total: 576ms	remaining: 1m 35s
6:	learn: 0.6635953	total: 660ms	remaining: 1m 33s
7:	learn: 0.6598227	total: 752ms	remaining: 1m 33s
8:	learn: 0.6560091	total: 1.14s	remaining: 2m 5s
9:	learn: 0.6521839	total: 1.25s	remaining: 2m 3s
10:	learn: 0.6485119	total: 1.31s	remaining: 1m 58s
11:	learn: 0.6449673	total: 1.39s	remaining: 1m 54s
12:	learn: 0.6414377	total: 1.46s	remaining: 1m 50s
13:	learn: 0.6379370	total: 1.53s	remaining: 1m 47s
14:	learn: 0.6348574	total: 1.59s	remaining: 1m 44s
15:	learn: 0.6316161	total: 1.74s	remaining: 1m 46s
16:	learn: 0.6283651	total: 1.91s	remaining: 1m 50s
17:	learn: 0.6251121	total: 2.03s	remaining: 1m 50s
18:	learn: 0.6219812	total: 2.52s	remaining: 2m 10s
19:	learn: 

# Saving predictions for models with good performance and making submissions

- Saving it to a csv file and making submission using the csv file on zindi


In [None]:
predy1 = rgfc.predict_proba(test)
predy1 = [x[1] for x in predy1]

In [None]:
test1 = pd.read_csv('data/test_data.csv')

test1['Claim'] = predy1

submit = test1[['Customer Id', 'Claim']].copy()

In [None]:
submit.to_csv('data/Submission1.csv', index=False)

In [None]:
predy2 = ctb.predict_proba(test)
predy2 = [x[1] for x in predy2]


In [None]:
test2 = pd.read_csv('data/test_data.csv')

test2['Claim'] = predy2

submit = test2[['Customer Id', 'Claim']].copy()

In [None]:
submit.to_csv('data/Submission3.csv', index=False)

In [None]:
predy3 = xgbc.predict_proba(test)
predy3 = [x[1] for x in predy3]

In [None]:
test3 = pd.read_csv('data/test_data.csv')

test3['Claim'] = predy3

submit = test3[['Customer Id', 'Claim']].copy()

In [None]:
submit.to_csv('data/Submission3.csv', index=False)

# Ensembling the models using a VotingClassifier and making another submisison
- This should yield better performance


In [None]:
i=1

for train_index, test_index in skf.split(X, y):
    i+=1
    print('{} of KFold {}'.format(i,skf.n_splits))
    xtr, xvl = X.loc[train_index], X.loc[test_index]
    ytr, yvl = y.loc[train_index], y.loc[test_index]
    maxVotingClassifier = VotingClassifier(estimators=[('xgboost', xgbc), ('rgf', rgfc),('catboost', ctb)], voting='soft')
    maxVotingClassifier.fit(xtr, ytr)
    
    predxyz = maxVotingClassifier.predict_proba(xvl)
    predxyz = [x[1] for x in predxyz]
    print(roc_auc_score(yvl, predxyz))
   

1 of KFold 5




0:	learn: 0.6886886	total: 115ms	remaining: 1m 54s
1:	learn: 0.6843838	total: 293ms	remaining: 2m 26s
2:	learn: 0.6801509	total: 408ms	remaining: 2m 15s
3:	learn: 0.6758104	total: 658ms	remaining: 2m 43s
4:	learn: 0.6716200	total: 845ms	remaining: 2m 48s
5:	learn: 0.6676551	total: 1.07s	remaining: 2m 57s
6:	learn: 0.6635953	total: 1.53s	remaining: 3m 37s
7:	learn: 0.6598227	total: 1.7s	remaining: 3m 31s
8:	learn: 0.6560091	total: 1.87s	remaining: 3m 25s
9:	learn: 0.6521839	total: 1.97s	remaining: 3m 14s
10:	learn: 0.6485119	total: 2.09s	remaining: 3m 8s
11:	learn: 0.6449673	total: 2.2s	remaining: 3m 1s
12:	learn: 0.6414377	total: 2.36s	remaining: 2m 58s
13:	learn: 0.6379370	total: 2.62s	remaining: 3m 4s
14:	learn: 0.6348574	total: 2.71s	remaining: 2m 57s
15:	learn: 0.6316161	total: 2.97s	remaining: 3m 2s
16:	learn: 0.6283651	total: 3.38s	remaining: 3m 15s
17:	learn: 0.6251121	total: 3.64s	remaining: 3m 18s
18:	learn: 0.6219812	total: 3.8s	remaining: 3m 16s
19:	learn: 0.6192011	total: 3



0:	learn: 0.6887759	total: 56.8ms	remaining: 56.7s
1:	learn: 0.6844529	total: 98.3ms	remaining: 49s
2:	learn: 0.6801887	total: 137ms	remaining: 45.7s
3:	learn: 0.6759918	total: 179ms	remaining: 44.7s
4:	learn: 0.6719783	total: 220ms	remaining: 43.8s
5:	learn: 0.6678134	total: 264ms	remaining: 43.7s
6:	learn: 0.6642495	total: 295ms	remaining: 41.9s
7:	learn: 0.6605233	total: 336ms	remaining: 41.7s
8:	learn: 0.6569080	total: 379ms	remaining: 41.7s
9:	learn: 0.6533639	total: 420ms	remaining: 41.6s
10:	learn: 0.6496770	total: 460ms	remaining: 41.4s
11:	learn: 0.6466468	total: 493ms	remaining: 40.6s
12:	learn: 0.6432446	total: 535ms	remaining: 40.6s
13:	learn: 0.6398359	total: 578ms	remaining: 40.7s
14:	learn: 0.6367115	total: 618ms	remaining: 40.6s
15:	learn: 0.6333581	total: 661ms	remaining: 40.7s
16:	learn: 0.6303656	total: 699ms	remaining: 40.4s
17:	learn: 0.6272400	total: 788ms	remaining: 43s
18:	learn: 0.6241150	total: 932ms	remaining: 48.1s
19:	learn: 0.6210475	total: 973ms	remaining



0:	learn: 0.6887768	total: 43.6ms	remaining: 43.5s
1:	learn: 0.6845102	total: 97.9ms	remaining: 48.8s
2:	learn: 0.6802895	total: 138ms	remaining: 45.9s
3:	learn: 0.6760750	total: 178ms	remaining: 44.3s
4:	learn: 0.6718825	total: 271ms	remaining: 54s
5:	learn: 0.6679543	total: 424ms	remaining: 1m 10s
6:	learn: 0.6641443	total: 478ms	remaining: 1m 7s
7:	learn: 0.6603008	total: 516ms	remaining: 1m 3s
8:	learn: 0.6564919	total: 559ms	remaining: 1m 1s
9:	learn: 0.6529011	total: 599ms	remaining: 59.3s
10:	learn: 0.6494268	total: 640ms	remaining: 57.6s
11:	learn: 0.6460988	total: 689ms	remaining: 56.7s
12:	learn: 0.6426361	total: 731ms	remaining: 55.5s
13:	learn: 0.6391180	total: 772ms	remaining: 54.3s
14:	learn: 0.6357028	total: 809ms	remaining: 53.1s
15:	learn: 0.6325858	total: 850ms	remaining: 52.3s
16:	learn: 0.6294238	total: 890ms	remaining: 51.5s
17:	learn: 0.6263584	total: 931ms	remaining: 50.8s
18:	learn: 0.6232413	total: 970ms	remaining: 50.1s
19:	learn: 0.6200900	total: 1.01s	remain



0:	learn: 0.6884371	total: 50.1ms	remaining: 50s
1:	learn: 0.6844937	total: 95.9ms	remaining: 47.8s
2:	learn: 0.6802773	total: 171ms	remaining: 56.8s
3:	learn: 0.6760593	total: 215ms	remaining: 53.6s
4:	learn: 0.6719916	total: 282ms	remaining: 56.1s
5:	learn: 0.6680261	total: 335ms	remaining: 55.5s
6:	learn: 0.6640771	total: 389ms	remaining: 55.1s
7:	learn: 0.6603366	total: 463ms	remaining: 57.4s
8:	learn: 0.6566824	total: 588ms	remaining: 1m 4s
9:	learn: 0.6530332	total: 822ms	remaining: 1m 21s
10:	learn: 0.6495117	total: 878ms	remaining: 1m 18s
11:	learn: 0.6459258	total: 945ms	remaining: 1m 17s
12:	learn: 0.6424620	total: 1.03s	remaining: 1m 18s
13:	learn: 0.6389443	total: 1.09s	remaining: 1m 17s
14:	learn: 0.6357381	total: 1.27s	remaining: 1m 23s
15:	learn: 0.6325158	total: 1.36s	remaining: 1m 23s
16:	learn: 0.6292319	total: 1.44s	remaining: 1m 23s
17:	learn: 0.6260471	total: 1.51s	remaining: 1m 22s
18:	learn: 0.6229334	total: 1.58s	remaining: 1m 21s
19:	learn: 0.6200416	total: 1.8



0:	learn: 0.6884209	total: 44.3ms	remaining: 44.3s
1:	learn: 0.6841717	total: 121ms	remaining: 1m
2:	learn: 0.6800023	total: 163ms	remaining: 54.2s
3:	learn: 0.6758179	total: 214ms	remaining: 53.3s
4:	learn: 0.6716806	total: 259ms	remaining: 51.5s
5:	learn: 0.6677169	total: 314ms	remaining: 51.9s
6:	learn: 0.6638342	total: 393ms	remaining: 55.8s
7:	learn: 0.6597498	total: 486ms	remaining: 1m
8:	learn: 0.6561893	total: 652ms	remaining: 1m 11s
9:	learn: 0.6526262	total: 826ms	remaining: 1m 21s
10:	learn: 0.6491537	total: 958ms	remaining: 1m 26s
11:	learn: 0.6458307	total: 1.06s	remaining: 1m 27s
12:	learn: 0.6424306	total: 1.13s	remaining: 1m 26s
13:	learn: 0.6389209	total: 1.19s	remaining: 1m 23s
14:	learn: 0.6355066	total: 1.24s	remaining: 1m 21s
15:	learn: 0.6324096	total: 1.29s	remaining: 1m 19s
16:	learn: 0.6292118	total: 1.36s	remaining: 1m 18s
17:	learn: 0.6261939	total: 1.43s	remaining: 1m 17s
18:	learn: 0.6229104	total: 1.48s	remaining: 1m 16s
19:	learn: 0.6198986	total: 1.56s	r

In [None]:
predy4 = maxVotingClassifier.predict_proba(test)
predy4 = [x[1] for x in predy4]

In [None]:
test4 = pd.read_csv('data/test_data.csv')

test4['Claim'] = pred4

submit = test4[['Customer Id', 'Claim']].copy()

In [45]:
submit.to_csv('data/Submission4.csv', index=False)