In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# TODO: the final version will read from the database and use environment variables.
# Load the claims file ('?' marks a missing value), keep only rows whose umbrella
# policy limit is non-negative (this removes one row of 1,000), then discard every
# row that still has a missing value in any column.
# NOTE(review): the any-column dropna is aggressive -- the info() output further down
# shows 340 rows remaining; confirm that this much data loss is intended.
df = (
    pd.read_csv('insurance_claims.csv', na_values=['?'])
    .loc[lambda claims: claims['umbrella_limit'] >= 0]
    .dropna(axis=0)
)

In [2]:
# Columns carried forward into the model, in the order they appear in `data`.
# The last entry, 'fraud_reported', is the label; everything before it is a predictor.
col_names = [
    'months_as_customer', 'age',
    'policy_state', 'policy_deductable', 'policy_annual_premium', 'umbrella_limit',
    'insured_zip', 'insured_sex', 'insured_education_level', 'insured_occupation',
    'insured_hobbies', 'insured_relationship',
    'capital-gains', 'capital-loss',
    'incident_type', 'collision_type', 'incident_severity', 'authorities_contacted',
    'incident_state', 'incident_hour_of_the_day', 'number_of_vehicles_involved',
    'property_damage', 'bodily_injuries', 'witnesses', 'police_report_available',
    'total_claim_amount', 'injury_claim', 'property_claim', 'vehicle_claim',
    'fraud_reported',
]
data = df.loc[:, col_names]
data.head()

Unnamed: 0,months_as_customer,age,policy_state,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,insured_sex,insured_education_level,insured_occupation,...,number_of_vehicles_involved,property_damage,bodily_injuries,witnesses,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,fraud_reported
0,328,48,OH,1000,1406.91,0,466132,MALE,MD,craft-repair,...,1,YES,1,2,YES,71610,6510,13020,52080,Y
2,134,29,OH,2000,1413.14,5000000,430632,FEMALE,PhD,sales,...,3,NO,2,3,NO,34650,7700,3850,23100,N
5,256,39,OH,1000,1351.1,0,478456,FEMALE,PhD,tech-support,...,3,NO,0,2,NO,64100,6410,6410,51280,Y
8,27,33,IL,500,1442.99,0,601734,FEMALE,PhD,other-service,...,1,NO,1,1,YES,27700,2770,2770,22160,N
11,447,61,OH,2000,1137.16,0,615561,FEMALE,High School,exec-managerial,...,3,YES,1,2,YES,114920,17680,17680,79560,N


In [3]:
# One-hot encode the categorical predictors. The dummy-column groups are appended in
# the order given here, so keep this list's order stable. 'fraud_reported' is left out
# on purpose: it is the label and is recoded separately.
categorical_cols = [
    'insured_sex',
    'insured_education_level',
    'insured_occupation',
    'police_report_available',
    'insured_hobbies',
    'insured_relationship',
    'incident_type',
    'collision_type',
    'incident_severity',
    'authorities_contacted',
    'incident_state',
    'property_damage',
    'policy_state',
]
data_binary_encoded = pd.get_dummies(data, columns=categorical_cols)
data_binary_encoded.head()

Unnamed: 0,months_as_customer,age,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,capital-gains,capital-loss,incident_hour_of_the_day,number_of_vehicles_involved,...,incident_state_OH,incident_state_PA,incident_state_SC,incident_state_VA,incident_state_WV,property_damage_NO,property_damage_YES,policy_state_IL,policy_state_IN,policy_state_OH
0,328,48,1000,1406.91,0,466132,53300,0,5,1,...,0,0,1,0,0,0,1,0,0,1
2,134,29,2000,1413.14,5000000,430632,35100,0,7,3,...,0,0,0,0,0,1,0,0,0,1
5,256,39,1000,1351.1,0,478456,0,0,19,3,...,0,0,1,0,0,1,0,0,0,1
8,27,33,500,1442.99,0,601734,0,0,21,1,...,0,0,0,0,1,1,0,1,0,0
11,447,61,2000,1137.16,0,615561,0,-51000,21,3,...,0,0,1,0,0,0,1,0,0,1


In [4]:
# The YES/NO columns were each expanded into a *_NO and a *_YES dummy; keep only the
# *_YES indicator for police_report_available and property_damage.
# The frame is reassigned (no inplace=True) and missing labels are ignored so that this
# cell can be re-run safely: with the in-place drop, a second execution raised KeyError
# because the columns had already been removed.
redundant_dummies = ['police_report_available_NO', 'property_damage_NO']
data_binary_encoded = data_binary_encoded.drop(columns=redundant_dummies, errors='ignore')
data_binary_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 340 entries, 0 to 992
Data columns (total 90 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   months_as_customer                      340 non-null    int64  
 1   age                                     340 non-null    int64  
 2   policy_deductable                       340 non-null    int64  
 3   policy_annual_premium                   340 non-null    float64
 4   umbrella_limit                          340 non-null    int64  
 5   insured_zip                             340 non-null    int64  
 6   capital-gains                           340 non-null    int64  
 7   capital-loss                            340 non-null    int64  
 8   incident_hour_of_the_day                340 non-null    int64  
 9   number_of_vehicles_involved             340 non-null    int64  
 10  bodily_injuries                         340 non-null    int64 

In [5]:
# Random-forest classification: build the feature matrix and the target vector.
# Features are every encoded column except the label; the label 'fraud_reported'
# is recoded as 1 for 'Y' and 0 for 'N'.
fraud_flag = {'Y': 1, 'N': 0}
X = data_binary_encoded.drop(columns=['fraud_reported'])
y = data_binary_encoded['fraud_reported'].map(fraud_flag)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [6]:
# Feature scaling: learn the scaling parameters from the training rows only, then
# apply that same fitted transformation to both the training and the test split.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [7]:
# Fit a 100-tree random forest (fixed seed) on the training split, then predict the
# labels of the held-out rows.
classifier = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [8]:
# Report, in order: the confusion matrix, the per-class precision/recall/F1 table,
# and the overall accuracy on the test split.
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, y_pred))

[[45  6]
 [ 8  9]]
              precision    recall  f1-score   support

           0       0.85      0.88      0.87        51
           1       0.60      0.53      0.56        17

    accuracy                           0.79        68
   macro avg       0.72      0.71      0.71        68
weighted avg       0.79      0.79      0.79        68

0.7941176470588235


In [9]:
# Which features carry the most weight toward predicting fraud?
# The fitted forest exposes one importance value per feature; the next cell indexes
# X.columns with a position from this array to recover the feature's name.
importances = classifier.feature_importances_
importances

array([0.03417052, 0.03323503, 0.01242731, 0.04801681, 0.01645574,
       0.02693769, 0.02321233, 0.02367853, 0.0258537 , 0.01192383,
       0.01192484, 0.01014631, 0.03088785, 0.0428986 , 0.03442493,
       0.03641335, 0.00674813, 0.00444757, 0.00597632, 0.00455092,
       0.00340876, 0.0042943 , 0.00490695, 0.00368403, 0.00213064,
       0.00055886, 0.00305287, 0.00228251, 0.00415756, 0.00650483,
       0.00234579, 0.00332179, 0.00406288, 0.0013751 , 0.0020376 ,
       0.00281959, 0.0058504 , 0.0071623 , 0.00251812, 0.00466209,
       0.00189083, 0.00134669, 0.00140429, 0.00150257, 0.00517812,
       0.04648456, 0.0346651 , 0.00177278, 0.00217695, 0.0021086 ,
       0.00164063, 0.00237979, 0.00121909, 0.0022204 , 0.00708088,
       0.00248897, 0.00184807, 0.00237647, 0.00342382, 0.00086518,
       0.00431073, 0.00470963, 0.00501862, 0.00514943, 0.0060832 ,
       0.00538681, 0.00463564, 0.00719727, 0.00770981, 0.00934826,
       0.00800305, 0.13030893, 0.03691326, 0.03954125, 0.00343

In [10]:
# Name of the feature with the largest importance. The position is looked up with
# argmax rather than hard-coded (it was 71), so this stays correct if the encoded
# columns or the fitted model change.
# Importances are unsigned: this shows that incident_severity_Major Damage is the most
# informative feature for predicting fraud, not the direction of its effect.
X.columns[np.argmax(importances)]

'incident_severity_Major Damage'

In [11]:
# ML code above here

In [12]:
# Row-level flags on the cleaned claims frame.
is_fraud = df['fraud_reported'] == 'Y'
has_umbrella = df['umbrella_limit'] > 0
no_umbrella = df['umbrella_limit'] == 0

# Counts as plain Python ints so the percentages below are ordinary float divisions.
total_count = len(df)
fraud_count = int(is_fraud.sum())
umbrella_count = int(has_umbrella.sum())
fraudumbrella_count = int((is_fraud & has_umbrella).sum())
fraudnonumbrella_count = int((is_fraud & no_umbrella).sum())

# (label, numerator, denominator) for each percentage, printed in this order.
percentage_rows = (
    ('fraud % of total', fraud_count, total_count),
    ('umbrella % of total', umbrella_count, total_count),
    ('fraud % of umbrella', fraudumbrella_count, umbrella_count),
    ('fraud % of non-umbrella', fraudnonumbrella_count, total_count - umbrella_count),
)
for label, part, whole in percentage_rows:
    print(f'{label}: {100*part / whole}')

fraud % of total: 25.58823529411765
umbrella % of total: 17.941176470588236
fraud % of umbrella: 36.0655737704918
fraud % of non-umbrella: 23.29749103942652


In [13]:
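# Exploratory note (kept for reference, not executed): the value counts below were
# very evenly distributed, e.g., the three policy states accounted for about 1/3 each.
# NOTE(review): these lines cannot run as written against the cells shown here --
# `fraudumbrella` is never defined, `policy_state` has been one-hot encoded out of X,
# and `policy_csl` is not among the selected columns. If they are revived, they
# presumably need to read from `df` instead -- confirm before use.
# umbrella_limit = fraudumbrella['umbrella_limit'].value_counts()
# state = X['policy_state'].value_counts()
# csl = X['policy_csl'].value_counts()
# deductable = X['policy_deductable'].value_counts()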