In [98]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

In [99]:
df = pd.read_csv('carclaims.csv')

In [100]:
df.describe()

Unnamed: 0,WeekOfMonth,WeekOfMonthClaimed,Age,PolicyNumber,RepNumber,Deductible,DriverRating,Year
count,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0
mean,2.788586,2.693969,39.855707,7710.5,8.483268,407.70428,2.487808,1994.866472
std,1.287585,1.259115,13.492377,4451.514911,4.599948,43.950998,1.119453,0.803313
min,1.0,1.0,0.0,1.0,1.0,300.0,1.0,1994.0
25%,2.0,2.0,31.0,3855.75,5.0,400.0,1.0,1994.0
50%,3.0,3.0,38.0,7710.5,8.0,400.0,2.0,1995.0
75%,4.0,4.0,48.0,11565.25,12.0,400.0,3.0,1996.0
max,5.0,5.0,80.0,15420.0,16.0,700.0,4.0,1996.0


In [101]:
df.columns

Index(['Month', 'WeekOfMonth', 'DayOfWeek', 'Make', 'AccidentArea',
       'DayOfWeekClaimed', 'MonthClaimed', 'WeekOfMonthClaimed', 'Sex',
       'MaritalStatus', 'Age', 'Fault', 'PolicyType', 'VehicleCategory',
       'VehiclePrice', 'PolicyNumber', 'RepNumber', 'Deductible',
       'DriverRating', 'Days:Policy-Accident', 'Days:Policy-Claim',
       'PastNumberOfClaims', 'AgeOfVehicle', 'AgeOfPolicyHolder',
       'PoliceReportFiled', 'WitnessPresent', 'AgentType',
       'NumberOfSuppliments', 'AddressChange-Claim', 'NumberOfCars', 'Year',
       'BasePolicy', 'FraudFound'],
      dtype='object')

In [102]:
df.head(2)

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange-Claim,NumberOfCars,Year,BasePolicy,FraudFound
0,Dec,5,Wednesday,Honda,Urban,Tuesday,Jan,1,Female,Single,...,26 to 30,No,No,External,none,1 year,3 to 4,1994,Liability,No
1,Jan,3,Wednesday,Honda,Urban,Monday,Jan,4,Male,Single,...,31 to 35,Yes,No,External,none,no change,1 vehicle,1994,Collision,No


# Preprocessessing 

### One hot encode categorical with more than 2 classes

In [103]:
cat_cols = ['Month', 'WeekOfMonth', 'DayOfWeek', 'Make', 'AccidentArea', 'DayOfWeekClaimed', 
                       'MonthClaimed', 'WeekOfMonthClaimed','PolicyType', 'VehicleCategory', 'VehiclePrice', 
                      'RepNumber', 'DriverRating', 'Days:Policy-Accident', 'Days:Policy-Claim', 
                      'PastNumberOfClaims', 'AgeOfVehicle', 'AgeOfPolicyHolder', 'NumberOfSuppliments', 
                      'AddressChange-Claim', 'NumberOfCars', 'Year']
prep = df.copy()
ohe = OneHotEncoder()
for col in cat_cols:
    transformed = ohe.fit_transform(df[[col]])
    a = transformed.toarray().tolist()
    prep.drop(col, axis = 1)
    prep[col] = a
    

### Change binary columns to be 0 or 1

In [104]:
binCols = ['PoliceReportFiled','WitnessPresent','FraudFound']
for col in binCols:
    a = np.where(prep[col] =='No', 0, 1)
    prep.drop(col, axis = 1)
    prep[col] = a

In [105]:
sex = np.where(prep['Sex'] == 'Male', 0, 1)
prep.drop('Sex', axis = 1)
prep['Sex'] = sex
mar = np.where(prep['MaritalStatus'] == 'Married', 0, 1)
prep.drop('MaritalStatus', axis = 1)
prep['MaritalStatus'] = mar
fault = np.where(prep['Fault'] == 'Third Party', 0, 1)
prep.drop('Fault', axis = 1)
prep['Fault'] = fault
agent = np.where(prep['AgentType'] == 'Internal', 0, 1)
prep.drop('AgentType', axis = 1)
prep['AgentType'] = agent
base = np.where(prep['BasePolicy'] == 'Collision', 0, 1)
prep.drop('BasePolicy', axis = 1)
prep['BasePolicy'] = base

In [112]:
prep.head(2)

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange-Claim,NumberOfCars,Year,BasePolicy,FraudFound
0,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 1.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[0.0, 1.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[1.0, 0.0, 0.0, 0.0, 0.0]",1,1,...,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]",0,0,1,"[0.0, 0.0, 0.0, 1.0]","[1.0, 0.0, 0.0, 0.0, 0.0]","[0.0, 0.0, 1.0, 0.0, 0.0]","[1.0, 0.0, 0.0]",1,0
1,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 1.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[0.0, 1.0]","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 1.0, 0.0]",0,1,...,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",1,0,1,"[0.0, 0.0, 0.0, 1.0]","[0.0, 0.0, 0.0, 1.0, 0.0]","[1.0, 0.0, 0.0, 0.0, 0.0]","[1.0, 0.0, 0.0]",0,0


transformer = make_column_transformer(
    (OneHotEncoder(), ['Month', 'WeekOfMonth', 'DayOfWeek', 'Make', 'AccidentArea', 'DayOfWeekClaimed', 
                       'MonthClaimed', 'WeekOfMonthClaimed','PolicyType', 'VehicleCategory', 'VehiclePrice', 
                      'RepNumber', 'DriverRating', 'Days:Policy-Accident', 'Days:Policy-Claim', 
                      'PastNumberOfClaims', 'AgeOfVehicle', 'AgeOfPolicyHolder', 'NumberOfSuppliments', 
                      'AddressChange-Claim', 'NumberOfCars', 'Year']),
    remainder='passthrough')

transformed = transformer.fit_transform(df)
transformed_df = pd.DataFrame(
    transformed, 
    columns=transformer.get_feature_names()
)

### Change age to have no one be 0, change on class mean

In [107]:
yes = prep[prep['FraudFound'] == 1]
no = prep[prep['FraudFound'] == 0]
agey = yes[yes['Age'] > 0]
agen = no[no['Age'] > 0]
my = agey['Age'].mean()
mn = agen['Age'].mean()

In [108]:
for i in range(len(prep)):
    if(prep['Age'][i] == 0):
        if(prep['FraudFound'][i] == 1):
            prep['Age'][i] = my
        else:
            prep['Age'][i] = mn

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prep['Age'][i] = mn


Index(['Month', 'WeekOfMonth', 'DayOfWeek', 'Make', 'AccidentArea',
       'DayOfWeekClaimed', 'MonthClaimed', 'WeekOfMonthClaimed', 'Sex',
       'MaritalStatus', 'Age', 'Fault', 'PolicyType', 'VehicleCategory',
       'VehiclePrice', 'PolicyNumber', 'RepNumber', 'Deductible',
       'DriverRating', 'Days:Policy-Accident', 'Days:Policy-Claim',
       'PastNumberOfClaims', 'AgeOfVehicle', 'AgeOfPolicyHolder',
       'PoliceReportFiled', 'WitnessPresent', 'AgentType',
       'NumberOfSuppliments', 'AddressChange-Claim', 'NumberOfCars', 'Year',
       'BasePolicy', 'FraudFound'],
      dtype='object')

In [114]:
prep.to_csv('preprocessed_data.csv', index = False)

15420