## CREATING MACHINE LEARNING MODEL TO PREDICT INSURANCE CLAIM 

## 1.0 IMPORT RELEVANT LIBRARIES

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error


## 2.0 LOAD THE DATA

In [2]:
train_df = pd.read_csv('C:/Users/user/Desktop/DS Projects/Insurance-Prediction/insured_prepro_train.csv')
test_df = pd.read_csv('C:/Users/user/Desktop/DS Projects/Insurance-Prediction/insure_prepro_test.csv')



In [3]:
train_df.info()
train_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7160 entries, 0 to 7159
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Unnamed: 0                    7160 non-null   int64  
 1   Building_Painted              7160 non-null   object 
 2   Building Dimension            7160 non-null   float64
 3   Building_Type                 7160 non-null   int64  
 4   Geo_Code                      7160 non-null   object 
 5   Claim                         7160 non-null   int64  
 6   Windows                       7160 non-null   int64  
 7   Building Dimensionis missing  7160 non-null   bool   
 8   Date_of_Occupancyis missing   7160 non-null   bool   
 9   Insured_time                  7160 non-null   int64  
 10  Building_Class                7160 non-null   object 
 11  Building_category             7160 non-null   object 
 12  Building_age                  7160 non-null   float64
dtypes: 

Unnamed: 0.1,Unnamed: 0,Building_Painted,Building Dimension,Building_Type,Geo_Code,Claim,Windows,Building Dimensionis missing,Date_of_Occupancyis missing,Insured_time,Building_Class,Building_category,Building_age
0,0,N,290.0,1,1053,0,0,False,False,4,old,Urban Non-Residential,53.0
1,1,V,490.0,1,1053,0,2,False,False,4,modern,Rural Non-Residential,85.0
2,2,N,595.0,1,1053,0,0,False,False,4,old,Urban Non-Residential,54.0
3,3,V,2840.0,1,1053,0,0,False,False,4,old,Urban Non-Residential,53.0
4,4,V,680.0,1,1053,0,1,False,False,4,modern,Rural Non-Residential,84.0


In [4]:
test_df.info()
test_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3069 entries, 0 to 3068
Data columns (total 12 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Unnamed: 0                    3069 non-null   int64  
 1   Building_Painted              3069 non-null   object 
 2   Building Dimension            3069 non-null   float64
 3   Building_Type                 3069 non-null   int64  
 4   Geo_Code                      3069 non-null   object 
 5   Windows                       3069 non-null   int64  
 6   Building Dimensionis missing  3069 non-null   bool   
 7   Date_of_Occupancyis missing   3069 non-null   bool   
 8   Insured_time                  3069 non-null   int64  
 9   Building_Class                3069 non-null   object 
 10  Building_category             3069 non-null   object 
 11  Building_age                  3069 non-null   float64
dtypes: bool(2), float64(2), int64(4), object(4)
memory usage: 245.

Unnamed: 0.1,Unnamed: 0,Building_Painted,Building Dimension,Building_Type,Geo_Code,Windows,Building Dimensionis missing,Date_of_Occupancyis missing,Insured_time,Building_Class,Building_category,Building_age
0,0,V,300.0,1,3310,1,False,False,4,modern,Rural Non-Residential,53.0
1,1,V,300.0,1,3310,1,False,False,3,modern,Rural Non-Residential,56.0
2,2,V,790.0,1,3310,0,False,False,1,old,Urban Non-Residential,53.0
3,3,V,1405.0,1,3321,1,False,False,4,modern,Rural Non-Residential,10.0
4,4,V,1405.0,1,3321,1,False,False,4,modern,Rural Non-Residential,12.0


In [5]:
# drop the 'Unnamed:0' column

train_df.drop('Unnamed: 0', axis=1, inplace=True)
test_df.drop('Unnamed: 0', axis=1, inplace=True)

In [6]:
# Drop the Geo_Code feature as it contains over 1000 unique values and might not be relevant to our model
train_df.drop('Geo_Code', axis=1, inplace=True)
test_df.drop('Geo_Code', axis=1, inplace=True)

In [7]:
train_df.info()
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7160 entries, 0 to 7159
Data columns (total 11 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Building_Painted              7160 non-null   object 
 1   Building Dimension            7160 non-null   float64
 2   Building_Type                 7160 non-null   int64  
 3   Claim                         7160 non-null   int64  
 4   Windows                       7160 non-null   int64  
 5   Building Dimensionis missing  7160 non-null   bool   
 6   Date_of_Occupancyis missing   7160 non-null   bool   
 7   Insured_time                  7160 non-null   int64  
 8   Building_Class                7160 non-null   object 
 9   Building_category             7160 non-null   object 
 10  Building_age                  7160 non-null   float64
dtypes: bool(2), float64(2), int64(4), object(3)
memory usage: 517.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306

## 3.0 DEFINE THE TARGET AND THE INPUT VARIABLES

In [8]:
target = train_df['Claim']
input_var = train_df.drop('Claim', axis=1)

In [9]:
input_var.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7160 entries, 0 to 7159
Data columns (total 10 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Building_Painted              7160 non-null   object 
 1   Building Dimension            7160 non-null   float64
 2   Building_Type                 7160 non-null   int64  
 3   Windows                       7160 non-null   int64  
 4   Building Dimensionis missing  7160 non-null   bool   
 5   Date_of_Occupancyis missing   7160 non-null   bool   
 6   Insured_time                  7160 non-null   int64  
 7   Building_Class                7160 non-null   object 
 8   Building_category             7160 non-null   object 
 9   Building_age                  7160 non-null   float64
dtypes: bool(2), float64(2), int64(3), object(3)
memory usage: 461.6+ KB


In [10]:
target

0       0
1       0
2       0
3       0
4       0
       ..
7155    0
7156    1
7157    0
7158    0
7159    0
Name: Claim, Length: 7160, dtype: int64

## 4.0 SPLIT DATA INTO TRAIN AND TEST

In [11]:
X_train, X_test, y_train, y_test = train_test_split(input_var,target)

## 5.0 CATEGORICAL ENCODING AND FEATURE SCALING

In [12]:
#get the categorical features

cat = (X_train.dtypes == 'object')
cat = list(cat[cat].index)

In [13]:
#get the numerical features

num = (X_train.dtypes != 'object')
num = list(num[num].index)

In [14]:
# Use sickit-learn column transformer to perform both operations
from sklearn.compose import make_column_transformer

column_trans = make_column_transformer(
         (OneHotEncoder(),cat), 
         (StandardScaler(),num), 
         remainder='passthrough')



## 6.0 USING ML PIPELINE 

In [15]:
#create a pipeline for preprocessing and creating an instance of the model

from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier


pipe_lr = Pipeline([('preprocess',column_trans),('lr',LogisticRegression())])
pipe_dt = Pipeline([('preprocess',column_trans),('DecTree',DecisionTreeClassifier())])
pipe_svc = Pipeline([('preprocess',column_trans),('svm',SVC())])
pipe_rf = Pipeline([('preprocess',column_trans),('rf',RandomForestClassifier())])

In [16]:
# make a list of the pipelines

pipelines = [pipe_lr,pipe_dt,pipe_svc,pipe_rf]

In [17]:
# save the best accordingly
best_accuracy = 0.0
best_classifier = 0
best_pipelne = ''

In [31]:
# Dictionary of pipelines and classifier for ease of reference

pipe_dict = {0:'Logistic regression',1:'Decision tree', 2:'Support Vector Classifier', 3:'Random forest'}

#fit the pipelines into the train data
for pipe in pipelines:
    pipe.fit(X_train,y_train)

In [32]:
pipe_rf.score(X_test,y_test)
for i in pipelines:
    print(i.score(X_test,y_test))

0.7821229050279329
0.6843575418994413
0.7810055865921788
0.7564245810055866


In [37]:
for i, model in enumerate(pipelines):
    print('test_accuracy of {} is {:.3f}'.format(pipe_dict[i],model.score(X_test,y_test)))

test_accuracy of Logistic regression is 0.782
test_accuracy of Decision tree is 0.684
test_accuracy of Support Vector Classifier is 0.781
test_accuracy of Random forest is 0.756


In [36]:
# checking for the best accuracy

for i, model in enumerate(pipelines):
    if model.score(X_test,y_test) > best_accuracy:
        best_accuracy = model.score(X_test,y_test)
        best_pipeline = model
        best_classifier = i
        
print('Classifier with the best accuracy is {}'.format(pipe_dict[best_classifier]))

Classifier with the best accuracy is Logistic regression


# 7.0 OPTIMISING THE MODEL PERFORMANCE

In [46]:
#import GridSearchCV
from sklearn.model_selection import GridSearchCV

#create a pipeline
pipe = Pipeline([('preprocess',column_trans),('classifier', LogisticRegression())])

#create dicctionary with the different models and their hypeparameters
grid_param = [
                {'classifier':[LogisticRegression()],
                 'classifier__penalty':['l2','l1'],
                  'classifier__C':np.logspace(0,4,10)
                },
    
                {'classifier':[LogisticRegression()],
                 'classifier__penalty':['l2'],
                  'classifier__C':np.logspace(0,4,10),
                  'classifier__solver':['newton-cg','saga','sag','liblinear']
                },
    
                {'classifier':[RandomForestClassifier()],
                 'classifier__n_estimators':[10,100,500,100],
                 'classifier__max_depth':[5,8,15,25,30,50,None],
                 'classifier__min_samples_leaf':[1,2,5,10,15,100],
                 'classifier__max_leaf_nodes':[2,5,10]
                }
                
]

grid_search = GridSearchCV(pipe,grid_param, cv=5, verbose=0,n_jobs=-1)
best_model = grid_search.fit(X_train,y_train)



In [48]:
grid_search.score(X_test,y_test)

0.7821229050279329

In [None]:
#print
print(best_model.best_estimator_)
print()