## CREATING MACHINE LEARNING MODEL TO PREDICT INSURANCE CLAIM 

## 1.0 IMPORT RELEVANT LIBRARIES

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error


## 2.0 LOAD THE DATA

In [3]:
train_df = pd.read_csv('C:/Users/user/Desktop/DS Projects/Insurance-Prediction/insured_prepro_train.csv')
test_df = pd.read_csv('C:/Users/user/Desktop/DS Projects/Insurance-Prediction/insure_prepro_test.csv')



In [4]:
train_df.info()
train_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7160 entries, 0 to 7159
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Unnamed: 0                    7160 non-null   int64  
 1   Building_Painted              7160 non-null   object 
 2   Building Dimension            7160 non-null   float64
 3   Building_Type                 7160 non-null   int64  
 4   Geo_Code                      7160 non-null   object 
 5   Claim                         7160 non-null   int64  
 6   Windows                       7160 non-null   int64  
 7   Building Dimensionis missing  7160 non-null   bool   
 8   Date_of_Occupancyis missing   7160 non-null   bool   
 9   Insured_time                  7160 non-null   int64  
 10  Building_Class                7160 non-null   object 
 11  Building_category             7160 non-null   object 
 12  Building_age                  7160 non-null   float64
dtypes: 

Unnamed: 0.1,Unnamed: 0,Building_Painted,Building Dimension,Building_Type,Geo_Code,Claim,Windows,Building Dimensionis missing,Date_of_Occupancyis missing,Insured_time,Building_Class,Building_category,Building_age
0,0,N,290.0,1,1053,0,0,False,False,4,old,Urban Non-Residential,53.0
1,1,V,490.0,1,1053,0,2,False,False,4,modern,Rural Non-Residential,85.0
2,2,N,595.0,1,1053,0,0,False,False,4,old,Urban Non-Residential,54.0
3,3,V,2840.0,1,1053,0,0,False,False,4,old,Urban Non-Residential,53.0
4,4,V,680.0,1,1053,0,1,False,False,4,modern,Rural Non-Residential,84.0


In [5]:
test_df.info()
test_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3069 entries, 0 to 3068
Data columns (total 12 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Unnamed: 0                    3069 non-null   int64  
 1   Building_Painted              3069 non-null   object 
 2   Building Dimension            3069 non-null   float64
 3   Building_Type                 3069 non-null   int64  
 4   Geo_Code                      3069 non-null   object 
 5   Windows                       3069 non-null   int64  
 6   Building Dimensionis missing  3069 non-null   bool   
 7   Date_of_Occupancyis missing   3069 non-null   bool   
 8   Insured_time                  3069 non-null   int64  
 9   Building_Class                3069 non-null   object 
 10  Building_category             3069 non-null   object 
 11  Building_age                  3069 non-null   float64
dtypes: bool(2), float64(2), int64(4), object(4)
memory usage: 245.

Unnamed: 0.1,Unnamed: 0,Building_Painted,Building Dimension,Building_Type,Geo_Code,Windows,Building Dimensionis missing,Date_of_Occupancyis missing,Insured_time,Building_Class,Building_category,Building_age
0,0,V,300.0,1,3310,1,False,False,4,modern,Rural Non-Residential,53.0
1,1,V,300.0,1,3310,1,False,False,3,modern,Rural Non-Residential,56.0
2,2,V,790.0,1,3310,0,False,False,1,old,Urban Non-Residential,53.0
3,3,V,1405.0,1,3321,1,False,False,4,modern,Rural Non-Residential,10.0
4,4,V,1405.0,1,3321,1,False,False,4,modern,Rural Non-Residential,12.0


In [6]:
# drop the 'Unnamed:0' column

train_df.drop('Unnamed: 0', axis=1, inplace=True)
test_df.drop('Unnamed: 0', axis=1, inplace=True)

In [7]:
# Drop the Geo_Code feature as it contains over 1000 unique values and might not be relevant to our model
train_df.drop('Geo_Code', axis=1, inplace=True)
test_df.drop('Geo_Code', axis=1, inplace=True)

In [8]:
train_df.info()
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7160 entries, 0 to 7159
Data columns (total 11 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Building_Painted              7160 non-null   object 
 1   Building Dimension            7160 non-null   float64
 2   Building_Type                 7160 non-null   int64  
 3   Claim                         7160 non-null   int64  
 4   Windows                       7160 non-null   int64  
 5   Building Dimensionis missing  7160 non-null   bool   
 6   Date_of_Occupancyis missing   7160 non-null   bool   
 7   Insured_time                  7160 non-null   int64  
 8   Building_Class                7160 non-null   object 
 9   Building_category             7160 non-null   object 
 10  Building_age                  7160 non-null   float64
dtypes: bool(2), float64(2), int64(4), object(3)
memory usage: 517.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306

## 3.0 DEFINE THE TARGET AND THE INPUT VARIABLES

In [9]:
target = train_df['Claim']
input_var = train_df.drop('Claim', axis=1)

In [10]:
input_var.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7160 entries, 0 to 7159
Data columns (total 10 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Building_Painted              7160 non-null   object 
 1   Building Dimension            7160 non-null   float64
 2   Building_Type                 7160 non-null   int64  
 3   Windows                       7160 non-null   int64  
 4   Building Dimensionis missing  7160 non-null   bool   
 5   Date_of_Occupancyis missing   7160 non-null   bool   
 6   Insured_time                  7160 non-null   int64  
 7   Building_Class                7160 non-null   object 
 8   Building_category             7160 non-null   object 
 9   Building_age                  7160 non-null   float64
dtypes: bool(2), float64(2), int64(3), object(3)
memory usage: 461.6+ KB


In [11]:
target

0       0
1       0
2       0
3       0
4       0
       ..
7155    0
7156    1
7157    0
7158    0
7159    0
Name: Claim, Length: 7160, dtype: int64

## 4.0 SPLIT DATA INTO TRAIN AND TEST

In [12]:
X_train, X_test, y_train, y_test = train_test_split(input_var,target)

## 5.0 CATEGORICAL ENCODING AND FEATURE SCALING

In [13]:
#get the categorical features

cat = (X_train.dtypes == 'object')
cat = list(cat[cat].index)

In [14]:
#get the numerical features

num = (X_train.dtypes != 'object')
num = list(num[num].index)

In [15]:
# Use sickit-learn column transformer to perform both operations
from sklearn.compose import make_column_transformer

column_trans = make_column_transformer(
         (OneHotEncoder(),cat), 
         (StandardScaler(),num), 
         remainder='passthrough')



## 6.0 DECISON TREE ALGORITHM

In [16]:
#ceate a pipeline for preprocessing and creating an instance of the model
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

pipe = Pipeline([('preprocess',column_trans),('DecTree',DecisionTreeClassifier())])

In [17]:
#check the model performance using the train and test data set
from sklearn.model_selection import cross_val_score

train_score = cross_val_score(pipe,X_train,y_train,cv=10,scoring='accuracy').mean()
test_score =  cross_val_score(pipe,X_test,y_test,cv=10,scoring='accuracy').mean()


print('train score is {0} \ntest score is {1}'.format(train_score,test_score))

train score is 0.6918063314711359 
test score is 0.6731843575418994


## 6.1 FIT PIPELINE INTO TRAIN DATA


In [18]:
pipe.fit(X_train,y_train);

## 6.2 MAKE PREDICTION WITH THE MODEL

In [19]:
y_predict = pipe.predict(X_test)

mae = mean_absolute_error(y_predict,y_test)
r2 = r2_score(y_predict,y_test)

print('mean absolute error is {} \nr2_score is {}'.format(mae,r2))

mean absolute error is 0.31620111731843575 
r2_score is -0.7379952516228196


## 6.3 OPTIMISING THE MODEL USING HYPERPARAMETER

In [20]:
# declare hyperparameters to be used

param_grids = [
    {
        'DecTree__criterion':['gini','entropy'],
        'DecTree__splitter':['best','random'],
        'DecTree__max_depth':np.linspace(1,35,5),
        #'DecTree__min_sample_split':np.linspace(0.1,1,5),
        #'DecTree__min_sample_leaf':np.linspace(0.1,0.5,5),
        'DecTree__max_features':list(range(1,X_train.shape[0]))
            
}
]

In [21]:
from sklearn.model_selection import GridSearchCV

best_fit = GridSearchCV(pipe,param_grid=param_grids,cv=2,verbose=10,n_jobs=-1)

In [22]:
best_fit.fit(X_train,y_train)

Fitting 2 folds for each of 107380 candidates, totalling 214760 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1894s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1217s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done 100 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done 144 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done 248 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done 308 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done 368 tasks      | elapsed:    6.6s
[Parallel(n_jo

[Parallel(n_jobs=-1)]: Done 34180 tasks      | elapsed:  6.9min
[Parallel(n_jobs=-1)]: Done 34704 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done 35236 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done 35768 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 36308 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 36848 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done 37396 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done 37944 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 38500 tasks      | elapsed:  7.7min
[Parallel(n_jobs=-1)]: Done 39056 tasks      | elapsed:  7.8min
[Parallel(n_jobs=-1)]: Done 39620 tasks      | elapsed:  8.0min
[Parallel(n_jobs=-1)]: Done 40184 tasks      | elapsed:  8.1min
[Parallel(n_jobs=-1)]: Done 40756 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done 41328 tasks      | elapsed:  8.3min
[Parallel(n_jobs=-1)]: Done 41908 tasks      | elapsed:  8.4min
[Parallel(n_jobs=-1)]: Done 42488 tasks 

[Parallel(n_jobs=-1)]: Done 197080 tasks      | elapsed: 27.7min
[Parallel(n_jobs=-1)]: Done 199136 tasks      | elapsed: 27.9min
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0567s.) Setting batch_size=1.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1919s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0947s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1396s.) Setting batch_size=8.
[Parallel(n_jobs=-1)]: Done 201044 tasks      | elapsed: 28.1min
[Parallel(n_jobs=-1)]: Done 203116 tasks      | elapsed: 28.3min
[Parallel(n_jobs=-1)]: Done 205204 tasks      | elapsed: 28.6min
[Parallel(n_jobs=-1)]: Done 207292 tasks      | elapsed: 28.8min
[Parallel(n_jobs=-1)]: Done 209396 tasks      | elapsed: 29.0min
[Parallel(n_jobs=-1)]: Done 211500 tasks      | elapsed: 29.2min
[Parallel(n_jobs=-1)]: Done 213620 tasks      | elapsed: 29.5min
[Parallel(n_jobs=-1)]: Done 214760 out of 214760 | elapsed: 29.6min finished


GridSearchCV(cv=2,
             estimator=Pipeline(steps=[('preprocess',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('onehotencoder',
                                                                         OneHotEncoder(),
                                                                         ['Building_Painted',
                                                                          'Building_Class',
                                                                          'Building_category']),
                                                                        ('standardscaler',
                                                                         StandardScaler(),
                                                                         ['Building '
                                                                          'Dimension',
                          

## 7.0 MAKE AND SAVE PREDICTION FOR THE TEST FILE

In [29]:
#create a dataframe with 2 columns - Customer Id and Claim 
# first read in the original test_data to access the Customer Id column

test_data = pd.read_csv('C:/Users/user/Downloads/test_data.csv')
result = pd.DataFrame(test_data['Customer Id'])
prediction = best_fit.predict(test_df)
result['Claim'] = prediction

In [30]:
result.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3069 entries, 0 to 3068
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Customer Id  3069 non-null   object
 1   Claim        3069 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 48.1+ KB


In [33]:
#save the dataframe as a csv file

result.to_csv('dectree_insured_result.csv')

In [34]:
pd.read_csv('dectree_insured_result.csv')

Unnamed: 0.1,Unnamed: 0,Customer Id,Claim
0,0,H11920,0
1,1,H11921,0
2,2,H9805,0
3,3,H7493,0
4,4,H7494,0
...,...,...,...
3064,3064,H11583,0
3065,3065,H11720,0
3066,3066,H11721,0
3067,3067,H12408,0
