## CREATING A MACHINE LEARNING MODEL TO PREDICT INSURANCE CLAIMS

## 1.0 IMPORT RELEVANT LIBRARIES

In [1]:
import os
from pathlib import Path

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error


## 2.0 LOAD THE DATA

In [2]:
# Directory holding the pre-processed CSV files. The default is the original
# author's local folder; set the INSURANCE_DATA_DIR environment variable to run
# the notebook on another machine without editing this cell.
DATA_DIR = Path(os.environ.get('INSURANCE_DATA_DIR',
                               'C:/Users/user/Desktop/DS Projects/InsurancePrediction'))

# NOTE(review): the two file names use different prefixes ('insured_' vs
# 'insure_') — kept exactly as in the original; confirm against the files on disk.
train_df = pd.read_csv(DATA_DIR / 'insured_prepro_train.csv')
test_df = pd.read_csv(DATA_DIR / 'insure_prepro_test.csv')



In [3]:
# Print the schema of the training frame, then preview its first five rows.
train_df.info()
train_df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7160 entries, 0 to 7159
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Unnamed: 0                    7160 non-null   int64  
 1   Building_Painted              7160 non-null   object 
 2   Building Dimension            7160 non-null   float64
 3   Building_Type                 7160 non-null   int64  
 4   Geo_Code                      7160 non-null   object 
 5   Claim                         7160 non-null   int64  
 6   Windows                       7160 non-null   int64  
 7   Building Dimensionis missing  7160 non-null   bool   
 8   Date_of_Occupancyis missing   7160 non-null   bool   
 9   Insured_time                  7160 non-null   int64  
 10  Building_Class                7160 non-null   object 
 11  Building_category             7160 non-null   object 
 12  Building_age                  7160 non-null   float64
dtypes: 

Unnamed: 0.1,Unnamed: 0,Building_Painted,Building Dimension,Building_Type,Geo_Code,Claim,Windows,Building Dimensionis missing,Date_of_Occupancyis missing,Insured_time,Building_Class,Building_category,Building_age
0,0,N,290.0,1,1053,0,0,False,False,4,old,Urban Non-Residential,53.0
1,1,V,490.0,1,1053,0,2,False,False,4,modern,Rural Non-Residential,85.0
2,2,N,595.0,1,1053,0,0,False,False,4,old,Urban Non-Residential,54.0
3,3,V,2840.0,1,1053,0,0,False,False,4,old,Urban Non-Residential,53.0
4,4,V,680.0,1,1053,0,1,False,False,4,modern,Rural Non-Residential,84.0


In [4]:
# Print the schema of the test frame, then preview its first five rows.
test_df.info()
test_df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3069 entries, 0 to 3068
Data columns (total 12 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Unnamed: 0                    3069 non-null   int64  
 1   Building_Painted              3069 non-null   object 
 2   Building Dimension            3069 non-null   float64
 3   Building_Type                 3069 non-null   int64  
 4   Geo_Code                      3069 non-null   object 
 5   Windows                       3069 non-null   int64  
 6   Building Dimensionis missing  3069 non-null   bool   
 7   Date_of_Occupancyis missing   3069 non-null   bool   
 8   Insured_time                  3069 non-null   int64  
 9   Building_Class                3069 non-null   object 
 10  Building_category             3069 non-null   object 
 11  Building_age                  3069 non-null   float64
dtypes: bool(2), float64(2), int64(4), object(4)
memory usage: 245.

Unnamed: 0.1,Unnamed: 0,Building_Painted,Building Dimension,Building_Type,Geo_Code,Windows,Building Dimensionis missing,Date_of_Occupancyis missing,Insured_time,Building_Class,Building_category,Building_age
0,0,V,300.0,1,3310,1,False,False,4,modern,Rural Non-Residential,53.0
1,1,V,300.0,1,3310,1,False,False,3,modern,Rural Non-Residential,56.0
2,2,V,790.0,1,3310,0,False,False,1,old,Urban Non-Residential,53.0
3,3,V,1405.0,1,3321,1,False,False,4,modern,Rural Non-Residential,10.0
4,4,V,1405.0,1,3321,1,False,False,4,modern,Rural Non-Residential,12.0


In [5]:
# Drop the 'Unnamed: 0' column (presumably a row index saved with the CSV —
# confirm against the preprocessing step).
# Rebinding the names instead of using inplace=True, together with
# errors='ignore', keeps this cell idempotent: re-running it no longer raises
# KeyError once the column is already gone.
train_df = train_df.drop(columns='Unnamed: 0', errors='ignore')
test_df = test_df.drop(columns='Unnamed: 0', errors='ignore')

In [6]:
# Drop the Geo_Code feature as it contains over 1000 unique values and might
# not be relevant to our model.
# Rebinding (rather than inplace=True) with errors='ignore' makes the cell safe
# to re-run: a second execution no longer raises KeyError.
train_df = train_df.drop(columns='Geo_Code', errors='ignore')
test_df = test_df.drop(columns='Geo_Code', errors='ignore')

In [7]:
# Print the column summary of both frames, training frame first.
for frame in (train_df, test_df):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7160 entries, 0 to 7159
Data columns (total 11 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Building_Painted              7160 non-null   object 
 1   Building Dimension            7160 non-null   float64
 2   Building_Type                 7160 non-null   int64  
 3   Claim                         7160 non-null   int64  
 4   Windows                       7160 non-null   int64  
 5   Building Dimensionis missing  7160 non-null   bool   
 6   Date_of_Occupancyis missing   7160 non-null   bool   
 7   Insured_time                  7160 non-null   int64  
 8   Building_Class                7160 non-null   object 
 9   Building_category             7160 non-null   object 
 10  Building_age                  7160 non-null   float64
dtypes: bool(2), float64(2), int64(4), object(3)
memory usage: 517.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306

## 3.0 DEFINE THE TARGET AND THE INPUT VARIABLES

In [39]:
# Separate the training frame into the feature columns and the 'Claim' label.
input_var = train_df.drop(columns='Claim')
target = train_df['Claim']

In [40]:
# Print the column summary of the feature frame (training data without 'Claim').
input_var.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7160 entries, 0 to 7159
Data columns (total 10 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Building_Painted              7160 non-null   object 
 1   Building Dimension            7160 non-null   float64
 2   Building_Type                 7160 non-null   int64  
 3   Windows                       7160 non-null   int64  
 4   Building Dimensionis missing  7160 non-null   bool   
 5   Date_of_Occupancyis missing   7160 non-null   bool   
 6   Insured_time                  7160 non-null   int64  
 7   Building_Class                7160 non-null   object 
 8   Building_category             7160 non-null   object 
 9   Building_age                  7160 non-null   float64
dtypes: bool(2), float64(2), int64(3), object(3)
memory usage: 461.6+ KB


In [41]:
# Display the target Series (the 'Claim' column) as the cell output.
target

0       0
1       0
2       0
3       0
4       0
       ..
7155    0
7156    1
7157    0
7158    0
7159    0
Name: Claim, Length: 7160, dtype: int64

## 4.0 CATEGORICAL ENCODING AND FEATURE SCALING

In [30]:
# Names of the categorical features, i.e. the columns stored with object dtype.
cat = [column for column, dtype in input_var.dtypes.items() if dtype == 'object']

In [42]:
# Names of the numerical features: every column whose dtype is not object.
# Note that this also picks up the boolean "... is missing" flag columns.
num = [column for column, dtype in input_var.dtypes.items() if dtype != 'object']

In [43]:
# Use a scikit-learn column transformer to perform both operations in one step:
# one-hot encode the categorical columns and standardise the numerical ones.
# (make_column_transformer is imported in the imports cell at the top.)
column_trans = make_column_transformer(
    (OneHotEncoder(), cat),
    (StandardScaler(), num),
    # Any column listed in neither `cat` nor `num` is passed through unchanged.
    remainder='passthrough',
)



In [44]:
# Fit the transformer on the training features and display the encoded/scaled
# array. `input_var` is used because no `X_train` is defined in any cell above
# this one, so the original call raised NameError on Restart & Run All.
# NOTE(review): if a train/validation split is introduced later, fit on the
# training split only to avoid leaking validation statistics.
column_trans.fit_transform(input_var)

array([[ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.51846884,  0.37634072],
       [ 0.        ,  1.        ,  1.        , ...,  0.        ,
         0.51846884,  1.87036142],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.51846884,  0.42302887],
       ...,
       [ 0.        ,  1.        ,  0.        , ...,  0.        ,
        -2.35403844, -0.97761554],
       [ 0.        ,  1.        ,  0.        , ...,  0.        ,
         0.51846884, -0.18391704],
       [ 0.        ,  1.        ,  0.        , ...,  0.        ,
         0.51846884, -1.6312496 ]])