# Health Insuranc Lead Prediction

### Import Required Library

In [597]:
import pandas as pd 
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
import time
import xgboost as xgb
from sklearn.model_selection import train_test_split
import category_encoders as ce
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import mutual_info_classif
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_auc_score
pd.set_option('display.max_columns', None)

### Load Training data

In [598]:
data = pd.read_csv('https://datahack-prod.s3.amazonaws.com/train_file/train_Df64byy.csv')
data.head()

Unnamed: 0,ID,City_Code,Region_Code,Accomodation_Type,Reco_Insurance_Type,Upper_Age,Lower_Age,Is_Spouse,Health Indicator,Holding_Policy_Duration,Holding_Policy_Type,Reco_Policy_Cat,Reco_Policy_Premium,Response
0,1,C3,3213,Rented,Individual,36,36,No,X1,14+,3.0,22,11628.0,0
1,2,C5,1117,Owned,Joint,75,22,No,X2,,,22,30510.0,0
2,3,C5,3732,Owned,Individual,32,32,No,,1.0,1.0,19,7450.0,1
3,4,C24,4378,Owned,Joint,52,48,No,X1,14+,3.0,19,17780.0,0
4,5,C8,2190,Rented,Individual,44,44,No,X2,3.0,1.0,16,10404.0,0


In [599]:
data['Response'].value_counts()

0    38673
1    12209
Name: Response, dtype: int64

Dataset is imballanced but I will use all the data (no down sampling) as we have parameter called scale_pos_weight in xgbClassifier to deal with imballanced data.

### Train , Validation data split

In [600]:
## train validation split
train_df , val_df = train_test_split(data,test_size = 0.2,random_state=42)
print(train_df.shape , val_df.shape)
print(train_df['Response'].value_counts())
print(val_df['Response'].value_counts())

(40705, 14) (10177, 14)
0    30985
1     9720
Name: Response, dtype: int64
0    7688
1    2489
Name: Response, dtype: int64


### Featurization 
#### Featurization : City_Code and Region_Code(1)
* Feature City_Code and Region_Code both identifies the location of the customer. One way to encode these feature is to create a combined variable (cocatination of string) to reduce the number of levels.
* Then Hashing Encoder will be applied to combined feature because this encoder is best choice for large cardinality features.

In [601]:
## combine city code and region code to reduce the number of levels
lst = [str(x)+str(y) for x,y in zip(train_df['City_Code'],train_df['Region_Code']) ]
train_df['Com_Region'] = lst

## perform hash encoding to 'Com_Region'
## n_components = 10 is selected because large value create sparcity and low value will cause collision
hash_encoder = ce.HashingEncoder(cols='Com_Region',n_components=10)
train_df = hash_encoder.fit_transform(train_df)


## hash ending in validation data
lst = [str(x)+str(y) for x,y in zip(val_df['City_Code'],val_df['Region_Code']) ]
val_df['Com_Region'] = lst
val_df =hash_encoder.transform(val_df)

print(train_df.shape , val_df.shape)

(40705, 24) (10177, 24)


In [602]:
len(train_df['City_Code'].unique())

36

#### Featurization : City_Code(2)
* For feature City_Code we have 36 unique levels. One Hot Encoding may cause sparcity on training data.
* One other way to encode such feature is ranking which is based on frequency of there levels. Level which is most frequent is assigned with 1 and 2,3,4... to small frequencies.
* I will also perform merging that means percentage frequencies 2.344 and 2.564 will be merge to single rank as they are quite similar

In [603]:
### train_df City code frequency of each level
x = train_df['City_Code'].value_counts().to_frame()
freq = x['City_Code'].values
total = freq.sum()
x['percent_frequency'] = 100*freq/total
x

Unnamed: 0,City_Code,percent_frequency
C1,7223,17.744749
C2,6157,15.125906
C3,3901,9.583589
C4,2933,7.205503
C9,1751,4.301683
C6,1578,3.876674
C7,1538,3.778406
C8,1455,3.574499
C10,1280,3.144577
C5,1127,2.768702


In [604]:
## Based on percentage frequency value I decided to give 5 different rank
def levelcombine(x):
    lst = []
    for i in x:
        if i>10:
            lst.append(1) #percentage frequency value > 10 => Rank 1 
        elif i <= 10 and i > 3:
            lst.append(2) #percentage frequency value (3 to 10) => Rank 2
        elif i <= 3 and i>2:
            lst.append(3) #percentage frequency value (2 to 3) => Rank 3
        elif i <=2 and i>1 :
            lst.append(4) #percentage frequency value (1 to 2) => Rank 4
        else:
            lst.append(5) #percentage frequency value (0 to 1) => Rank 5
    return lst

x['New_City_Code'] = levelcombine(x['percent_frequency'])
x

Unnamed: 0,City_Code,percent_frequency,New_City_Code
C1,7223,17.744749,1
C2,6157,15.125906,1
C3,3901,9.583589,2
C4,2933,7.205503,2
C9,1751,4.301683,2
C6,1578,3.876674,2
C7,1538,3.778406,2
C8,1455,3.574499,2
C10,1280,3.144577,2
C5,1127,2.768702,3


In [605]:
## Now we have to make a dictionary that maps city_code to rank
city_code_map_dict = {}
for i,j in zip(x.index , x.New_City_Code):
    city_code_map_dict[i] = j
print(city_code_map_dict)    

{'C1': 1, 'C2': 1, 'C3': 2, 'C4': 2, 'C9': 2, 'C6': 2, 'C7': 2, 'C8': 2, 'C10': 2, 'C5': 3, 'C11': 3, 'C17': 3, 'C15': 3, 'C16': 3, 'C13': 4, 'C20': 4, 'C19': 4, 'C12': 4, 'C18': 4, 'C14': 4, 'C21': 4, 'C23': 4, 'C24': 4, 'C22': 4, 'C26': 5, 'C29': 5, 'C25': 5, 'C27': 5, 'C28': 5, 'C33': 5, 'C32': 5, 'C34': 5, 'C30': 5, 'C35': 5, 'C36': 5, 'C31': 5}


In [606]:
## we will make new column 'New_City_Code' which is rank based on frequency
train_df['New_City_Code'] = [city_code_map_dict[x] for x in train_df['City_Code'].values]
val_df['New_City_Code'] = [city_code_map_dict[x] for x in val_df['City_Code'].values]
train_df.head(2)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,ID,City_Code,Region_Code,Accomodation_Type,Reco_Insurance_Type,Upper_Age,Lower_Age,Is_Spouse,Health Indicator,Holding_Policy_Duration,Holding_Policy_Type,Reco_Policy_Cat,Reco_Policy_Premium,Response,New_City_Code
0,0,0,0,1,0,0,0,0,0,0,34370,C13,946,Owned,Individual,71,71,No,,,,2,15522.0,0,4
1,0,0,0,0,1,0,0,0,0,0,5263,C2,23,Owned,Individual,28,28,No,X1,2.0,3.0,3,10262.0,0,1


#### Featurization : City_Code(3)
* There is one more option to encode the feature City_Code which is based on likelyhood of its levels.
* We will calculate p(Response=1/City_code=level) for each level and assign this value as new feature.
* We can do the same for p(Response=0/City_code=level). I tried this and it was highly correlated to p(Response=1/City_code=level). So we don't use this feature for Response=0.

In [607]:
#### City_Code Response Coding 
train_df_R1 = train_df[train_df['Response']==1]
train_df_R0 = train_df[train_df['Response']==0]

x = train_df_R1['City_Code'].value_counts().to_frame() 
city_code_freq1 = x['City_Code'].values  # freq for each level
total = city_code_freq1.sum() # total entries in training data
x['R1_prob'] = city_code_freq1/total  # liklyhood of each levels
x

Unnamed: 0,City_Code,R1_prob
C1,1769,0.181996
C2,1485,0.152778
C3,912,0.093827
C4,711,0.073148
C9,418,0.043004
C7,392,0.040329
C6,375,0.03858
C8,354,0.03642
C10,286,0.029424
C5,261,0.026852


In [608]:
## We need create a dictionary to map City_Code to its likelyhood value
city_code_R1_prob_dict = {}
for i,j in zip(x.index , x.R1_prob):
    city_code_R1_prob_dict[i] = j
print(city_code_R1_prob_dict)

{'C1': 0.18199588477366255, 'C2': 0.1527777777777778, 'C3': 0.09382716049382717, 'C4': 0.07314814814814814, 'C9': 0.04300411522633745, 'C7': 0.040329218106995884, 'C6': 0.038580246913580245, 'C8': 0.03641975308641975, 'C10': 0.0294238683127572, 'C5': 0.026851851851851852, 'C11': 0.023045267489711935, 'C16': 0.022016460905349793, 'C17': 0.02119341563786008, 'C15': 0.020061728395061727, 'C13': 0.019753086419753086, 'C20': 0.01676954732510288, 'C19': 0.01656378600823045, 'C18': 0.016152263374485595, 'C12': 0.016049382716049384, 'C14': 0.014814814814814815, 'C23': 0.013374485596707819, 'C21': 0.012757201646090535, 'C22': 0.010493827160493827, 'C24': 0.010390946502057612, 'C26': 0.008539094650205761, 'C29': 0.008127572016460905, 'C25': 0.006584362139917695, 'C27': 0.006172839506172839, 'C33': 0.005967078189300412, 'C28': 0.005864197530864198, 'C32': 0.0030864197530864196, 'C34': 0.002880658436213992, 'C30': 0.001337448559670782, 'C35': 0.0011316872427983538, 'C36': 0.00041152263374485596, '

In [609]:
## creating new Feature by maping City_Code to its corresponding likelyhood value 

train_df['R1_prob'] = [city_code_R1_prob_dict[x] for x in train_df['City_Code'].values]
val_df['R1_prob'] = [city_code_R1_prob_dict[x] for x in val_df['City_Code'].values]
train_df.head(2)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,ID,City_Code,Region_Code,Accomodation_Type,Reco_Insurance_Type,Upper_Age,Lower_Age,Is_Spouse,Health Indicator,Holding_Policy_Duration,Holding_Policy_Type,Reco_Policy_Cat,Reco_Policy_Premium,Response,New_City_Code,R1_prob
0,0,0,0,1,0,0,0,0,0,0,34370,C13,946,Owned,Individual,71,71,No,,,,2,15522.0,0,4,0.019753
1,0,0,0,0,1,0,0,0,0,0,5263,C2,23,Owned,Individual,28,28,No,X1,2.0,3.0,3,10262.0,0,1,0.152778


#### Featurization : Region_Code
* Featurization Region_Code is little tricky as it has large cardinality.
* We have already used hash Encoding for combined (City_Code, Region_Code) feature so here I will use encoding based on frequency. I will simply use percentage freq and multyply it by 100 as these values are very small.

In [610]:
### train_df Region_Code frequency of each level
x = train_df['Region_Code'].value_counts().to_frame()

freq = x['Region_Code'].values # freq for each level
total = freq.sum()  # total frequ
x['percent_frequency'] = 100*freq/total # percentage frequency
x.head(10)

Unnamed: 0,Region_Code,percent_frequency
1,81,0.198993
6,61,0.149859
2,60,0.147402
4,56,0.137575
10,56,0.137575
5,55,0.135119
7,53,0.130205
8,49,0.120378
20,47,0.115465
17,46,0.113008


In [611]:
## Now we have to create a dict to map Region_Code to its freq
Region_code_per_frq_dict = {}
for i,j in zip(x.index , x.percent_frequency):
    Region_code_per_frq_dict[i] = j*100 # multipling with 100 as it is very small value

# creating new feature by mapping Region_Code with its freq value    
train_df['Region_Code_freq'] = [Region_code_per_frq_dict[x] for x in train_df['Region_Code'].values]

# creating new feature by mapping Region_Code with its freq value for validation data
temp = []
for i in val_df['Region_Code'].values:
    if i in Region_code_per_frq_dict.keys():
        temp.append(Region_code_per_frq_dict[i])
    else :
        temp.append(0)

val_df['Region_Code_freq'] = temp
val_df.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,ID,City_Code,Region_Code,Accomodation_Type,Reco_Insurance_Type,Upper_Age,Lower_Age,Is_Spouse,Health Indicator,Holding_Policy_Duration,Holding_Policy_Type,Reco_Policy_Cat,Reco_Policy_Premium,Response,New_City_Code,R1_prob,Region_Code_freq
0,0,1,0,0,0,0,0,0,0,0,19809,C4,3283,Rented,Individual,25,25,No,X2,,,14,7068.0,0,2,0.073148,1.22835
1,0,1,0,0,0,0,0,0,0,0,17701,C4,1308,Rented,Individual,23,23,No,X1,,,22,4080.0,0,2,0.073148,2.948041
2,0,0,1,0,0,0,0,0,0,0,24040,C2,168,Rented,Joint,26,24,Yes,X2,1.0,3.0,13,16486.4,0,1,0.152778,2.948041
3,0,1,0,0,0,0,0,0,0,0,23030,C2,256,Rented,Individual,54,54,No,X4,11.0,1.0,22,19214.0,1,1,0.152778,4.667731
4,0,0,0,1,0,0,0,0,0,0,44333,C7,4006,Owned,Individual,75,75,No,X3,3.0,3.0,21,20376.0,0,2,0.040329,0.73701


#### Featurization : Accomodation_Type
* This feature has only two level so will use simply label encoder.

In [612]:
### Label encoder 'Accomodation_Type'
le_Accomodation_Type = LabelEncoder()
encoded_Accomodation_Type = le_Accomodation_Type.fit_transform(train_df['Accomodation_Type'].values)

# Create new label encoded feature for Accomodation_Type
train_df['encoded_Accomodation_Type'] = encoded_Accomodation_Type

# Create new label encoded feature for Accomodation_Type for validation data
encoded_Accomodation_Type_val = le_Accomodation_Type.transform(val_df['Accomodation_Type'].values)
val_df['encoded_Accomodation_Type'] = encoded_Accomodation_Type_val

train_df.head(2)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,ID,City_Code,Region_Code,Accomodation_Type,Reco_Insurance_Type,Upper_Age,Lower_Age,Is_Spouse,Health Indicator,Holding_Policy_Duration,Holding_Policy_Type,Reco_Policy_Cat,Reco_Policy_Premium,Response,New_City_Code,R1_prob,Region_Code_freq,encoded_Accomodation_Type
0,0,0,0,1,0,0,0,0,0,0,34370,C13,946,Owned,Individual,71,71,No,,,,2,15522.0,0,4,0.019753,4.176391,0
1,0,0,0,0,1,0,0,0,0,0,5263,C2,23,Owned,Individual,28,28,No,X1,2.0,3.0,3,10262.0,0,1,0.152778,10.072473,0


#### Featurization : Reco_Insurance_Type
* This feature has only two level so will use simply label encoder.

In [613]:
### Label encoder 'Reco_Insurance_Type'
le_Reco_Insurance_Type = LabelEncoder()
encoded_Reco_Insurance_Type = le_Reco_Insurance_Type.fit_transform(train_df['Reco_Insurance_Type'].values)

# Create new label encoded feature for Reco_Insurance_Type
train_df['encoded_Reco_Insurance_Type'] = encoded_Reco_Insurance_Type

# Create new label encoded feature for Reco_Insurance_Type for validation data
encoded_Reco_Insurance_Type_val = le_Reco_Insurance_Type.transform(val_df['Reco_Insurance_Type'].values)
val_df['encoded_Reco_Insurance_Type'] = encoded_Reco_Insurance_Type_val

train_df.head(2)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,ID,City_Code,Region_Code,Accomodation_Type,Reco_Insurance_Type,Upper_Age,Lower_Age,Is_Spouse,Health Indicator,Holding_Policy_Duration,Holding_Policy_Type,Reco_Policy_Cat,Reco_Policy_Premium,Response,New_City_Code,R1_prob,Region_Code_freq,encoded_Accomodation_Type,encoded_Reco_Insurance_Type
0,0,0,0,1,0,0,0,0,0,0,34370,C13,946,Owned,Individual,71,71,No,,,,2,15522.0,0,4,0.019753,4.176391,0,0
1,0,0,0,0,1,0,0,0,0,0,5263,C2,23,Owned,Individual,28,28,No,X1,2.0,3.0,3,10262.0,0,1,0.152778,10.072473,0,0


#### Featurization : Is_Spouse
* This feature has only two level so will use simply label encoder.

In [614]:
### Label encoder 'Is_Spouse'
le_Is_Spouse = LabelEncoder()
encoded_Is_Spouse = le_Is_Spouse.fit_transform(train_df['Is_Spouse'].values)

# Create new label encoded feature for Is_Spouse
train_df['encoded_Is_Spouse'] = encoded_Is_Spouse

# Create new label encoded feature for Is_Spouse for validation data
encoded_Is_Spouse_val = le_Is_Spouse.transform(val_df['Is_Spouse'].values)
val_df['encoded_Is_Spouse'] = encoded_Is_Spouse_val

train_df.head(2)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,ID,City_Code,Region_Code,Accomodation_Type,Reco_Insurance_Type,Upper_Age,Lower_Age,Is_Spouse,Health Indicator,Holding_Policy_Duration,Holding_Policy_Type,Reco_Policy_Cat,Reco_Policy_Premium,Response,New_City_Code,R1_prob,Region_Code_freq,encoded_Accomodation_Type,encoded_Reco_Insurance_Type,encoded_Is_Spouse
0,0,0,0,1,0,0,0,0,0,0,34370,C13,946,Owned,Individual,71,71,No,,,,2,15522.0,0,4,0.019753,4.176391,0,0,0
1,0,0,0,0,1,0,0,0,0,0,5263,C2,23,Owned,Individual,28,28,No,X1,2.0,3.0,3,10262.0,0,1,0.152778,10.072473,0,0,0


#### Information Gain 
* I suspect that above three binary label encoded feature is not helpful for classification. To make sure let's quickly calculate information gain. 
* Information gain is the measure of mutual information between dependent and independent variable. High information gain tell high dependency and high mutual information.

In [615]:
# lets check information gain of Response with respect to binary feature 
# 'encoded_Accomodation_Type'  , 'encoded_Reco_Insurance_Type' , 'encoded_Is_Spouse'
independent_feature = train_df['encoded_Accomodation_Type'].values.reshape(-1,1)
dependent_feature = train_df['Response'].values.reshape(-1,1)
score = mutual_info_classif(independent_feature , dependent_feature)
print(f'mutual information between feature Accomodation_Type and Response : {score}')

###################
independent_feature = train_df['encoded_Reco_Insurance_Type'].values.reshape(-1,1)
score = mutual_info_classif(independent_feature , dependent_feature)
print(f'mutual information between feature Reco_Insurance_Type and Response : {score}')

##################
independent_feature = train_df['encoded_Is_Spouse'].values.reshape(-1,1)
score = mutual_info_classif(independent_feature , dependent_feature)
print(f'mutual information between feature Is_Spouse and Response : {score}')

mutual information between feature Accomodation_Type and Response : [7.44229723e-05]
mutual information between feature Reco_Insurance_Type and Response : [0]
mutual information between feature Is_Spouse and Response : [0]


The information gain is very small means it may not very helpfull for classification.

#### Null value imputation
* Feature 'Health Indicator' , 'Holding_Policy_Duration' and 'Holding_Policy_Type' containd null value which is imputed using KNN. 

In [616]:
### built knn1 for 'Health Indicator'
##prepare data_set
removable_columns  = ['ID', 'City_Code','Region_Code', 'Accomodation_Type', 'Reco_Insurance_Type',
                      'Is_Spouse','Holding_Policy_Duration','Holding_Policy_Type']
data = train_df.drop(removable_columns , axis = 1)
train_data = data[data['Health Indicator'].notnull()] # data without null value 
test_data = data[data['Health Indicator'].isnull()] # data with null value (test data)

#prepare knn trainable data
y_train_knn = train_data['Health Indicator'].values
train_data = train_data.drop(['Health Indicator'],axis = 1)

# converting string into levels
leknn1 = LabelEncoder()
y_train = leknn1.fit_transform(y_train_knn)

# train knn classifier
knn_health_indicator = KNeighborsClassifier()
knn_health_indicator.fit(train_data.values,y_train)

# Now predict missing value for test data
test_data = test_data.drop(['Health Indicator'],axis = 1)
classes = knn_health_indicator.predict(test_data.values)

test_data['Health Indicator'] = leknn1.inverse_transform(classes)



In [617]:
# imputation on traing data
# this helper fuction will impute missing value with prediced value
for i in test_data.index:
    try:
        impute = test_data.loc[i]['Health Indicator']
        train_df.loc[i ,'Health Indicator'] = impute
    except:
        pass

In [618]:
##imputation for validation data
knn1_val = val_df.drop(removable_columns , axis = 1)

test_data = knn1_val[knn1_val['Health Indicator'].isnull()]

test_data = test_data.drop(['Health Indicator'],axis=1)

classes = knn_health_indicator.predict(test_data.values)

test_data['Health Indicator'] = leknn1.inverse_transform(classes)


for i in test_data.index:
    try:
        impute = test_data.loc[i]['Health Indicator']
        val_df.loc[i ,'Health Indicator'] = impute
    except:
        pass

Since we have imputes null value now we will perform One Hot Encoding for feature 'Health Indicator'

In [619]:
### One hot encoding for Health Idicatore

train_df = train_df.reset_index(drop=True)
HI_ohe = OneHotEncoder(sparse=False)

encoded = HI_ohe.fit_transform(train_df['Health Indicator'].values.reshape(-1,1))
temp_df = pd.DataFrame(encoded , columns=HI_ohe.categories_).reset_index(drop=True)
train_df = pd.concat([train_df,temp_df],axis = 1)
train_df.head(2)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,ID,City_Code,Region_Code,Accomodation_Type,Reco_Insurance_Type,Upper_Age,Lower_Age,Is_Spouse,Health Indicator,Holding_Policy_Duration,Holding_Policy_Type,Reco_Policy_Cat,Reco_Policy_Premium,Response,New_City_Code,R1_prob,Region_Code_freq,encoded_Accomodation_Type,encoded_Reco_Insurance_Type,encoded_Is_Spouse,"(X1,)","(X2,)","(X3,)","(X4,)","(X5,)","(X6,)","(X7,)","(X8,)","(X9,)"
0,0,0,0,1,0,0,0,0,0,0,34370,C13,946,Owned,Individual,71,71,No,X1,,,2,15522.0,0,4,0.019753,4.176391,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,0,0,1,0,0,0,0,0,5263,C2,23,Owned,Individual,28,28,No,X1,2.0,3.0,3,10262.0,0,1,0.152778,10.072473,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [620]:
## One hot encoding for Health Idicatore for validation
encoded_val = HI_ohe.transform(val_df['Health Indicator'].values.reshape(-1,1))
temp_df = pd.DataFrame(encoded_val , columns=HI_ohe.categories_).reset_index(drop=True)
val_df = pd.concat([val_df,temp_df],axis = 1)
val_df.head(2)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,ID,City_Code,Region_Code,Accomodation_Type,Reco_Insurance_Type,Upper_Age,Lower_Age,Is_Spouse,Health Indicator,Holding_Policy_Duration,Holding_Policy_Type,Reco_Policy_Cat,Reco_Policy_Premium,Response,New_City_Code,R1_prob,Region_Code_freq,encoded_Accomodation_Type,encoded_Reco_Insurance_Type,encoded_Is_Spouse,"(X1,)","(X2,)","(X3,)","(X4,)","(X5,)","(X6,)","(X7,)","(X8,)","(X9,)"
0,0,1,0,0,0,0,0,0,0,0,19809,C4,3283,Rented,Individual,25,25,No,X2,,,14,7068.0,0,2,0.073148,1.22835,1,0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,1,0,0,0,0,0,0,0,0,17701,C4,1308,Rented,Individual,23,23,No,X1,,,22,4080.0,0,2,0.073148,2.948041,1,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Missing value imputation for feature 'Holding_Policy_Duration'


In [621]:
## first we will will replace 14+ with 15
x = train_df[train_df['Holding_Policy_Duration']=='14+'].index
for i in x:
    train_df.loc[i , 'Holding_Policy_Duration'] = 15

In [622]:
### built knn 2 Holding policy duration
##prepare data_set
removable_columns  = ['ID', 'City_Code','Region_Code', 'Accomodation_Type', 'Reco_Insurance_Type',
                      'Is_Spouse','Health Indicator','Holding_Policy_Type']
data = train_df.drop(removable_columns , axis = 1)

train_data = data[data['Holding_Policy_Duration'].notnull()]
test_data = data[data['Holding_Policy_Duration'].isnull()]

train = train_data.drop(['Holding_Policy_Duration'],axis=1).values
y_train = train_data['Holding_Policy_Duration'].values
y_train = y_train.astype(np.float)
knn2 = KNeighborsRegressor()

knn2.fit(train,y_train)


KNeighborsRegressor()

In [623]:
## here we will predict missing data
test = test_data.drop(['Holding_Policy_Duration'],axis=1).values
classes = knn2.predict(test)
classes = np.floor(classes)
test_data['Holding_Policy_Duration'] = classes

In [625]:
## impute predicted data onto missing data
for i in test_data.index:
    try:
        impute = test_data.loc[i]['Holding_Policy_Duration']
        train_df.loc[i , 'Holding_Policy_Duration'] = impute
    except:
        pass

In [626]:
### imputing validation data Holding_Policy_Duration for validation data
## 14+ to 15

x = val_df[val_df['Holding_Policy_Duration']=='14+'].index
for i in x:
    val_df.loc[i ,'Holding_Policy_Duration'] = 15
    
removable_columns  = ['ID', 'City_Code','Region_Code', 'Accomodation_Type', 'Reco_Insurance_Type',
                      'Is_Spouse','Health Indicator','Holding_Policy_Type']
data = val_df.drop(removable_columns , axis = 1)

## here we will predict missing data
train_data = data[data['Holding_Policy_Duration'].notnull()]
test_data = data[data['Holding_Policy_Duration'].isnull()]

test = test_data.drop(['Holding_Policy_Duration'],axis=1).values


In [628]:
classes = knn2.predict(test)
classes = np.floor(classes)

test_data['Holding_Policy_Duration'] = classes


## impute predicted data onto missing data
for i in test_data.index:
    try:
        impute = test_data.loc[i]['Holding_Policy_Duration']
        val_df.loc[i , 'Holding_Policy_Duration'] = impute
    except:
        pass

#### Missing value imputation for feature 'Holding_Policy_Type'

In [629]:
### built knn 3 Holding policy type
##prepare data_set
removable_columns  = ['ID', 'City_Code','Region_Code', 'Accomodation_Type', 'Reco_Insurance_Type',
                      'Is_Spouse','Health Indicator']
data = train_df.drop(removable_columns , axis = 1)

train_data = data[data['Holding_Policy_Type'].notnull()]
test_data = data[data['Holding_Policy_Type'].isnull()]


train = train_data.drop(['Holding_Policy_Type'],axis=1).values
y_train = train_data['Holding_Policy_Type'].values
y_train = y_train.astype(np.float)
knn3 = KNeighborsRegressor()

knn3.fit(train,y_train)

test = test_data.drop(['Holding_Policy_Type'],axis=1).values
classes = knn3.predict(test)
classes = np.floor(classes)
test_data['Holding_Policy_Type'] = classes

In [630]:
for i in test_data.index:
    try:
        impute = test_data.loc[i]['Holding_Policy_Type']
        train_df.loc[i , 'Holding_Policy_Type'] = impute
    except:
        pass

In [631]:
### imputing validation data Holding_Policy_Type for validation data
removable_columns  = ['ID', 'City_Code','Region_Code', 'Accomodation_Type', 'Reco_Insurance_Type',
                      'Is_Spouse','Health Indicator']
data = val_df.drop(removable_columns , axis = 1)
train_data = data[data['Holding_Policy_Type'].notnull()]
test_data = data[data['Holding_Policy_Type'].isnull()]

# pedict missing value for validation data
test = test_data.drop(['Holding_Policy_Type'],axis=1).values
classes = knn3.predict(test)
classes = np.floor(classes)

test_data['Holding_Policy_Type'] = classes

In [632]:
## impute predicted value on validation data
for i in test_data.index:
    try:
        impute = test_data.loc[i]['Holding_Policy_Type']
        val_df.loc[i , 'Holding_Policy_Type'] = impute
    except:
        pass

### Training model

In [633]:
## Prepare trainable data and classes
removable_columns = ['ID','City_Code','Region_Code','Accomodation_Type','Reco_Insurance_Type',
                    'Is_Spouse','Health Indicator','Response']
X_train = train_df.drop(removable_columns,axis = 1).values
y_train = train_df['Response'].values

X_val = val_df.drop(removable_columns,axis=1).values
y_val = val_df['Response'].values

print(X_train.shape,X_val.shape,y_train.shape,y_val.shape)

(40705, 31) (10177, 31) (40705,) (10177,)


In [635]:
## Training xgbClassifier

start = time.time()
params = {'max_depth':[5,6,7,9,11,13],
          'min_child_weight':[0.001,0.01,0.1,1,2,3],
          'n_estimators':[10,20,50,100,200,500]}
#scale_pos_weight=3.187 comes from sum(negative instances) / sum(positive instances)
model  = xgb.XGBClassifier(booster='gbtree',scale_pos_weight=3.187)
grid = RandomizedSearchCV(model, param_distributions=params, scoring ='roc_auc', \
                    cv=5,return_train_score=True) 
                                                
grid.fit(X_train,y_train) 
print('time taken to train the model in sec:',time.time() - start)

time taken to train the model in sec: 145.70877313613892


In [636]:
grid.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=9,
              min_child_weight=0.1, missing=nan, monotone_constraints='()',
              n_estimators=20, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=3.187, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [637]:
xgb_model = grid.best_estimator_
xgb_model.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=9,
              min_child_weight=0.1, missing=nan, monotone_constraints='()',
              n_estimators=20, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=3.187, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [638]:
actual_label = y_train
predicted_label = xgb_model.predict(X_train)
print('roc_auc score for training data is :',roc_auc_score(actual_label,predicted_label))

actual_label_val = y_val
predicted_label_val = xgb_model.predict(X_val)
print('roc_auc score for cross validation data is :',roc_auc_score(actual_label_val,predicted_label_val))

roc_auc score for training data is : 0.7345439117958975
roc_auc score for cross validation data is : 0.6026797043306886


### Predict on test data

In [641]:
test_df = pd.read_csv('https://datahack-prod.s3.amazonaws.com/test_file/test_YCcRUnU.csv')
test_df['Response'] = np.zeros(len(test_df))

In [644]:
lst = [str(x)+str(y) for x,y in zip(test_df['City_Code'],test_df['Region_Code']) ]
test_df['Com_Region'] = lst
test_df = hash_encoder.transform(test_df)

In [645]:
test_df['New_City_Code'] = [city_code_map_dict[x] for x in test_df['City_Code'].values]

In [646]:
test_df['R1_prob'] = [city_code_R1_prob_dict[x] for x in test_df['City_Code'].values]

In [647]:
test_df['Region_Code_freq'] = [Region_code_per_frq_dict[x] for x in test_df['Region_Code'].values]

KeyError: 5232