## Initiation

In [1]:
#importing all libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('fivethirtyeight')
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

# Data


In [3]:
#uploading train and test files to google colab
from google.colab import files
uploaded = files.upload()

Saving train_Df64byy.csv to train_Df64byy.csv
Saving test_YCcRUnU.csv to test_YCcRUnU.csv


In [62]:
#storing train and test data in respective dataframes
train = pd.read_csv(r'./train_Df64byy.csv')
test = pd.read_csv(r'./test_YCcRUnU.csv')
X = train.drop(['Response','ID'],axis=1)
y = train['Response']

# Preprocessing

In [49]:
X.isnull().sum().to_frame().sort_values(0).tail(3).rename(columns={0:'Count of features\' missing values'})   #count of missing features

Unnamed: 0,Count of features' missing values
Health Indicator,11691
Holding_Policy_Duration,20251
Holding_Policy_Type,20251


In [50]:
X['Holding_Policy_Duration'].fillna(0, inplace=True)  #Setting duration to 0 i.e. no policy
X.Holding_Policy_Type.fillna(0, inplace=True)         #Setting the policy type to 0 where it is not available

In [51]:
#creating new features
X['Is_Child'] = (X['Upper_Age'] != X['Lower_Age'])
X['Is_Child'] = X['Is_Child'] ^ X['Is_Spouse'] 
X['Affluent'] = (X['Accomodation_Type']=='Owned') & (X['Reco_Insurance_Type']=='Individual')
X['Thrifty'] = (X['Accomodation_Type']=='Rented') & (X['Reco_Insurance_Type']=='Joint')
X.Is_Child = X.Is_Child.astype('int32')
X.Affluent = X.Affluent.astype('int32')
X.Thrifty = X.Thrifty.astype('int32')

We need to convert the categorical to numerical data before using KNNImputer

In [52]:
#converting categorical to numeric data
X.Accomodation_Type = X.Accomodation_Type.replace({'Owned':0,'Rented':1})
X.Reco_Insurance_Type = X.Reco_Insurance_Type.replace({'Joint':0,'Individual':1})
X.Median_Age = (X['Lower_Age']+X['Upper_Age'])/2
e1 = LabelEncoder()
X['City_Code'] = e1.fit_transform(X.City_Code)
X.Holding_Policy_Duration = X.Holding_Policy_Duration.replace({'14+':'15'})
X['Health Indicator'] = X['Health Indicator'].str.lstrip('X')
X.Is_Spouse = X.Is_Spouse.replace({'No':0,'Yes':1})

X['Health Indicator'].replace({0:np.nan}, inplace=True)

  after removing the cwd from sys.path.


In [54]:
#Binning the premium and policy age data as they can be converted into ordinal data
bins= [0,5,10,14,17]
X["Hold_duration"] = pd.cut(X.Holding_Policy_Duration.astype('float'),bins,labels=np.arange(1,5), include_lowest=True)
X["Policy prem"] = pd.cut(X.Reco_Policy_Premium,bins=np.linspace(min(X.Reco_Policy_Premium),max(X.Reco_Policy_Premium),20),labels=np.arange(1,20), include_lowest=True)
X.drop(['Reco_Policy_Premium','Holding_Policy_Duration'], axis=1, inplace=True)

NaN values are only remaining in the 'Health Indicator' feauture. I will now apply KNNImputer to transform based on all other features and 5 neighbors

In [55]:
#Imputing missing data in Health Indicator
imputer = KNNImputer(n_neighbors=5, weights='distance', metric='nan_euclidean')
imputer.fit(X)
Xtrans = imputer.transform(X)

# Machine learning model

In [56]:
model = RandomForestClassifier(n_estimators=101, verbose=0, max_depth=16, class_weight='balanced_subsample',criterion='gini', max_features= 15)
model.fit(Xtrans,y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                       class_weight='balanced_subsample', criterion='gini',
                       max_depth=16, max_features=15, max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=101, n_jobs=None, oob_score=False,
                       random_state=None, verbose=0, warm_start=False)

# Applying preprocessing to test set and making predictions

In [63]:
test['Holding_Policy_Duration'].fillna('0', inplace=True)  #Setting duration to 0 i.e. no policy
test.Holding_Policy_Type.fillna('0', inplace=True)         #Setting the policy type to 0 where it is not available
#Creating new features
id=test['ID']
test.drop(['ID'], axis=1,inplace=True)
test['Is_Child'] = (test['Upper_Age'] != test['Lower_Age'])
test['Is_Child'] = test['Is_Child'] ^ test['Is_Spouse'] 
test['Affluent'] = (test['Accomodation_Type']=='Owned') & (test['Reco_Insurance_Type']=='Individual')
test['Thrifty'] = (test['Accomodation_Type']=='Rented') & (test['Reco_Insurance_Type']=='Joint')

test.Accomodation_Type = test.Accomodation_Type.replace({'Owned':0,'Rented':1})
test.Reco_Insurance_Type = test.Reco_Insurance_Type.replace({'Joint':0,'Individual':1})
test.Median_Age = (test['Lower_Age']+test['Upper_Age'])/2
e2 = LabelEncoder()
test['City_Code'] = e2.fit_transform(test.City_Code)
test.Holding_Policy_Duration = test.Holding_Policy_Duration.replace({'14+':'15'})
test['Health Indicator'] = test['Health Indicator'].str.lstrip('X')
test.Is_Spouse = test.Is_Spouse.replace({'No':0,'Yes':1})
test.Is_Child = test.Is_Child.astype('int32')
test.Affluent = test.Affluent.astype('int32')
test.Thrifty = test.Thrifty.astype('int32')

test['Health Indicator'].replace({0:np.nan}, inplace=True)

bins= [0,5,10,14,17]
test["Hold_duration"] = pd.cut(test.Holding_Policy_Duration.astype('float'),bins,labels=np.arange(1,5), include_lowest=True)
test["Policy prem"] = pd.cut(test.Reco_Policy_Premium,bins=np.linspace(2280,43350.4,20),labels=np.arange(1,20), include_lowest=True)
test.drop(['Reco_Policy_Premium','Holding_Policy_Duration'], axis=1, inplace=True)


imputer = KNNImputer(n_neighbors=5, weights='distance', metric='nan_euclidean')
imputer.fit(test)
testtrans = imputer.transform(test)

  del sys.path[0]


In [64]:
y_pr = model.predict_proba(testtrans)  #Predicting the class probabilities

In [65]:
sub = pd.concat([pd.DataFrame(id),pd.DataFrame(y_pr[:,1])],axis=1)   #Keeping probabilities of class 1
sub.columns=['ID','Response']  #Naming the columns
sub.to_csv('./output.csv', index=False) #Writing the output to a file

In [66]:
from google.colab import files
files.download('output.csv') #Downloading from Google colab

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>