In [52]:
import pandas as pd
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [53]:
# Read the training data; the ID column is discarded right away.
df = pd.read_csv('train_Df64byy.csv').drop(columns='ID')
df.head()

Unnamed: 0,City_Code,Region_Code,Accomodation_Type,Reco_Insurance_Type,Upper_Age,Lower_Age,Is_Spouse,Health Indicator,Holding_Policy_Duration,Holding_Policy_Type,Reco_Policy_Cat,Reco_Policy_Premium,Response
0,C3,3213,Rented,Individual,36,36,No,X1,14+,3.0,22,11628.0,0
1,C5,1117,Owned,Joint,75,22,No,X2,,,22,30510.0,0
2,C5,3732,Owned,Individual,32,32,No,,1.0,1.0,19,7450.0,1
3,C24,4378,Owned,Joint,52,48,No,X1,14+,3.0,19,17780.0,0
4,C8,2190,Rented,Individual,44,44,No,X2,3.0,1.0,16,10404.0,0


In [54]:
# (rows, columns) of the training frame, after the ID column was removed
df.shape

(50882, 13)

In [55]:
# Per-column dtype and non-null count
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50882 entries, 0 to 50881
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   City_Code                50882 non-null  object 
 1   Region_Code              50882 non-null  int64  
 2   Accomodation_Type        50882 non-null  object 
 3   Reco_Insurance_Type      50882 non-null  object 
 4   Upper_Age                50882 non-null  int64  
 5   Lower_Age                50882 non-null  int64  
 6   Is_Spouse                50882 non-null  object 
 7   Health Indicator         39191 non-null  object 
 8   Holding_Policy_Duration  30631 non-null  object 
 9   Holding_Policy_Type      30631 non-null  float64
 10  Reco_Policy_Cat          50882 non-null  int64  
 11  Reco_Policy_Premium      50882 non-null  float64
 12  Response                 50882 non-null  int64  
dtypes: float64(2), int64(5), object(6)
memory usage: 5.0+ MB


In [56]:
# Share of missing values per column, in percent
df.isna().mean() * 100.0

City_Code                   0.000000
Region_Code                 0.000000
Accomodation_Type           0.000000
Reco_Insurance_Type         0.000000
Upper_Age                   0.000000
Lower_Age                   0.000000
Is_Spouse                   0.000000
Health Indicator           22.976691
Holding_Policy_Duration    39.799929
Holding_Policy_Type        39.799929
Reco_Policy_Cat             0.000000
Reco_Policy_Premium         0.000000
Response                    0.000000
dtype: float64

In [57]:
# Column labels of the training frame
df.columns

Index(['City_Code', 'Region_Code', 'Accomodation_Type', 'Reco_Insurance_Type',
       'Upper_Age', 'Lower_Age', 'Is_Spouse', 'Health Indicator',
       'Holding_Policy_Duration', 'Holding_Policy_Type', 'Reco_Policy_Cat',
       'Reco_Policy_Premium', 'Response'],
      dtype='object')

In [58]:
# Load the test file and look at its dimensions
dftest = pd.read_csv('test_YCcRUnU.csv')
dftest.shape

(21805, 13)

## Data Cleaning and Preparation

In [59]:
# Imputing null values.
# Every result is assigned back to its column. Calling
# fillna(..., inplace=True) on a column selection acts on an intermediate
# object and is not guaranteed to update `df`: pandas 2.x warns about it and
# under copy-on-write it leaves `df` unchanged.

# Health Indicator: fill gaps with the most frequent category.
df['Health Indicator'] = df['Health Indicator'].fillna(df['Health Indicator'].mode()[0])

# Holding_Policy_Duration: string entries such as '14+' lose the '+' and become
# floats; non-string entries (e.g. NaN) pass through unchanged, so re-running
# this cell is harmless.
df['Holding_Policy_Duration'] = df['Holding_Policy_Duration'].apply(
    lambda x: float(x.replace('+', '')) if isinstance(x, str) else x
)
# Remaining gaps get the median duration (same value as describe()['50%']).
df['Holding_Policy_Duration'] = df['Holding_Policy_Duration'].fillna(
    df['Holding_Policy_Duration'].median()
)

# Holding_Policy_Type: fill gaps with the most frequent type.
df['Holding_Policy_Type'] = df['Holding_Policy_Type'].fillna(df['Holding_Policy_Type'].mode()[0])

In [60]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [61]:
# Encoding and scaling are applied to this copy; `df` keeps the imputed,
# un-encoded values.
dftrain = df.copy()

In [62]:
# Label-encode the categorical columns. A separate fitted encoder is kept per
# column so the identical mapping can be applied to other data later.
le_accomodation = LabelEncoder()  # Accomodation_Type: 'Owned' == 0, 'Rented' == 1
le_reco = LabelEncoder()          # Reco_Insurance_Type: 'Individual' == 0, 'Joint' == 1
le_spouse = LabelEncoder()        # Is_Spouse: 'No' == 0, 'Yes' == 1
le1 = LabelEncoder()              # City_Code
le2 = LabelEncoder()              # Health Indicator

for column, encoder in [
    ('Accomodation_Type', le_accomodation),
    ('Reco_Insurance_Type', le_reco),
    ('Is_Spouse', le_spouse),
    ('City_Code', le1),
    ('Health Indicator', le2),
]:
    dftrain[column] = encoder.fit_transform(dftrain[column])
dftrain.head(5)

Unnamed: 0,City_Code,Region_Code,Accomodation_Type,Reco_Insurance_Type,Upper_Age,Lower_Age,Is_Spouse,Health Indicator,Holding_Policy_Duration,Holding_Policy_Type,Reco_Policy_Cat,Reco_Policy_Premium,Response
0,22,3213,1,0,36,36,0,0,14.0,3.0,22,11628.0,0
1,31,1117,0,1,75,22,0,1,5.0,3.0,22,30510.0,0
2,31,3732,0,0,32,32,0,0,1.0,1.0,19,7450.0,1
3,16,4378,0,1,52,48,0,0,14.0,3.0,19,17780.0,0
4,34,2190,1,0,44,44,0,1,3.0,1.0,16,10404.0,0


In [63]:
# Standardise the numeric columns with a StandardScaler.
# NOTE(review): the scaler is fitted on every row of dftrain, i.e. before the
# train/validation split that happens further down.
numcols = ['Holding_Policy_Duration', 'Reco_Policy_Premium', 'Upper_Age', 'Lower_Age']
scaler = StandardScaler().fit(dftrain[numcols])
dftrain[numcols] = scaler.transform(dftrain[numcols])
dftrain.head()

Unnamed: 0,City_Code,Region_Code,Accomodation_Type,Reco_Insurance_Type,Upper_Age,Lower_Age,Is_Spouse,Health Indicator,Holding_Policy_Duration,Holding_Policy_Type,Reco_Policy_Cat,Reco_Policy_Premium,Response
0,22,3213,1,0,-0.511625,-0.389098,0,0,2.377858,3.0,22,-0.387852,0
1,31,1117,0,1,1.741395,-1.197449,0,1,-0.173242,3.0,22,2.477394,0
2,31,3732,0,0,-0.742704,-0.620055,0,0,-1.307064,1.0,19,-1.021842,1
3,16,4378,0,1,0.412691,0.303775,0,0,2.377858,3.0,19,0.545682,0
4,34,2190,1,0,-0.049467,0.072817,0,1,-0.740153,1.0,16,-0.573588,0


In [64]:
# Separate the target column from the predictors
y = dftrain['Response']
X = dftrain.drop(columns='Response')

In [65]:
# Keep only the predictors that are fed to the models
X = X.loc[:, ['Region_Code', 'City_Code', 'Upper_Age', 'Is_Spouse',
              'Reco_Policy_Cat', 'Holding_Policy_Duration']]

In [66]:
from sklearn.model_selection import train_test_split

# 70% of the rows go to the training split; the seed keeps the split repeatable
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=10,
)

## Data Modeling

### KNN

In [67]:
from sklearn.metrics import roc_auc_score
from sklearn.neighbors import KNeighborsClassifier

In [68]:
# k-NN with k=2, distance-weighted votes and a KD-tree neighbour search
knn = KNeighborsClassifier(n_neighbors=2, weights='distance', algorithm= 'kd_tree')

In [69]:
# Fit on the training split
knn.fit(Xtrain, ytrain)

KNeighborsClassifier(algorithm='kd_tree', n_neighbors=2, weights='distance')

In [70]:
# ROC AUC on the training split, scored with column 1 of predict_proba
knn_train_scores = knn.predict_proba(Xtrain)[:, 1]
roc_auc_score(ytrain, knn_train_scores)

0.9999340315145988

In [71]:
# ROC AUC on the held-out split, scored with column 1 of predict_proba
knn_holdout_scores = knn.predict_proba(Xtest)[:, 1]
roc_auc_score(ytest, knn_holdout_scores)

0.6300133264050165

### Decision Tree

In [72]:
from sklearn.tree import DecisionTreeClassifier

# Gini-based tree capped at depth 20. random_state is fixed because
# scikit-learn permutes the features at every split, so ties between equally
# good splits could otherwise be resolved differently from run to run.
dt = DecisionTreeClassifier(criterion='gini', max_depth=20, random_state=10)

In [73]:
# Fit the tree, then report ROC AUC on the training split
dt.fit(Xtrain, ytrain)
dt_train_scores = dt.predict_proba(Xtrain)[:, 1]
roc_auc_score(ytrain, dt_train_scores)

0.943366531531381

In [74]:
# ROC AUC of the tree on the held-out split
dt_holdout_scores = dt.predict_proba(Xtest)[:, 1]
roc_auc_score(ytest, dt_holdout_scores)

0.5943346518344133

### Bagging using Decision Tree

In [75]:
from sklearn.ensemble import BaggingClassifier

In [76]:
# Bagging ensemble of 100 estimators configured like `dt`.
# The template estimator is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, while the positional form is accepted by both.
bagg = BaggingClassifier(dt, n_estimators=100, random_state=1)
bagg.fit(Xtrain, ytrain)

BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=20),
                  n_estimators=100, random_state=1)

In [77]:
# ROC AUC of the bagged ensemble on the training split
bagg_train_scores = bagg.predict_proba(Xtrain)[:, 1]
roc_auc_score(ytrain, bagg_train_scores)

0.9973978874239968

In [78]:
# ROC AUC of the bagged ensemble on the held-out split
bagg_holdout_scores = bagg.predict_proba(Xtest)[:, 1]
roc_auc_score(ytest, bagg_holdout_scores)

0.6779331582401604

## Scoring the test data with the bagging model

In [79]:
# Read the raw test file again; the ID column is kept because it is written
# to the submission file at the end.
dftest = pd.read_csv('test_YCcRUnU.csv')
dftest.head()

Unnamed: 0,ID,City_Code,Region_Code,Accomodation_Type,Reco_Insurance_Type,Upper_Age,Lower_Age,Is_Spouse,Health Indicator,Holding_Policy_Duration,Holding_Policy_Type,Reco_Policy_Cat,Reco_Policy_Premium
0,50883,C1,156,Owned,Individual,30,30,No,,6.0,3.0,5,11934.0
1,50884,C4,7,Owned,Joint,69,68,Yes,X1,3.0,3.0,18,32204.8
2,50885,C1,564,Rented,Individual,28,28,No,X3,2.0,4.0,17,9240.0
3,50886,C3,1177,Rented,Individual,23,23,No,X3,3.0,3.0,18,9086.0
4,50887,C1,951,Owned,Individual,75,75,No,X3,,,5,22534.0


In [80]:
# (rows, columns) of the test frame
dftest.shape

(21805, 13)

In [81]:
# Prepare the test frame with the imputation values, encoders and scaler
# obtained from the training data.
# fillna results are assigned back to the column: fillna(..., inplace=True) on
# a column selection is not guaranteed to update `dftest` (pandas 2.x warns,
# and under copy-on-write the frame is left unchanged).

# Imputation, using statistics of the training frame `df`.
dftest['Health Indicator'] = dftest['Health Indicator'].fillna(
    df['Health Indicator'].value_counts().index[0]
)
# String durations such as '14+' become floats; non-strings pass through.
dftest['Holding_Policy_Duration'] = dftest['Holding_Policy_Duration'].apply(
    lambda x: float(x.replace('+', '')) if isinstance(x, str) else x
)
dftest['Holding_Policy_Duration'] = dftest['Holding_Policy_Duration'].fillna(
    df['Holding_Policy_Duration'].median()
)
dftest['Holding_Policy_Type'] = dftest['Holding_Policy_Type'].fillna(
    df['Holding_Policy_Type'].value_counts().index[0]
)

# Categorical columns: reuse the encoders fitted on the training data.
dftest['Accomodation_Type'] = le_accomodation.transform(dftest['Accomodation_Type'])
dftest['Reco_Insurance_Type'] = le_reco.transform(dftest['Reco_Insurance_Type'])
dftest['Is_Spouse'] = le_spouse.transform(dftest['Is_Spouse'])
dftest['City_Code'] = le1.transform(dftest['City_Code'])
dftest['Health Indicator'] = le2.transform(dftest['Health Indicator'])

# Numeric columns: reuse the fitted scaler and the same column list (`numcols`)
# it was fitted with, instead of repeating the list by hand.
dftest[numcols] = scaler.transform(dftest[numcols])
dftest.head()

Unnamed: 0,ID,City_Code,Region_Code,Accomodation_Type,Reco_Insurance_Type,Upper_Age,Lower_Age,Is_Spouse,Health Indicator,Holding_Policy_Duration,Holding_Policy_Type,Reco_Policy_Cat,Reco_Policy_Premium
0,50883,0,156,0,0,-0.858243,-0.735534,0,0,0.110214,3.0,5,-0.341418
1,50884,30,7,0,1,1.394777,1.458562,1,0,-0.740153,3.0,18,2.734571
2,50885,0,564,1,0,-0.973783,-0.851013,0,2,-1.023609,4.0,17,-0.750219
3,50886,22,1177,1,0,-1.262631,-1.13971,0,2,-0.740153,3.0,18,-0.773588
4,50887,0,951,0,0,1.741395,1.862738,0,2,-0.173242,3.0,5,1.267077


In [82]:
# This cell used to call `regionLe.transform(dftest.Region_Code)`, but no
# `regionLe` encoder is defined in the visible notebook, so it fails with a
# NameError on a fresh kernel. The 'regions_encoded' column it produced is not
# one of the model's features (X.columns) either, so the step is replaced by a
# check that every feature the model expects exists in the test frame.
missing_features = [col for col in X.columns if col not in dftest.columns]
assert not missing_features, f"test data is missing model features: {missing_features}"

In [83]:
# Score the test rows: column 1 of the ensemble's predict_proba output,
# computed on the same feature columns the model was trained with.
test_features = dftest[X.columns]
dftest['Response'] = bagg.predict_proba(test_features)[:, 1]

In [84]:
# Write ID and predicted Response to the submission CSV, without the index
dftest[['ID', 'Response']].to_csv('Final Submission.csv', index= False)