In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# final version will read from the database and use environment variables
CLAIMS_CSV = 'insurance_claims.csv'

# Load and clean in a single chain, without in-place mutation of a filtered frame:
#   * '?' cells are parsed as NaN (na_values), so dropna() removes every row
#     that has a '?' or an otherwise missing value in any column;
#   * only rows with a non-negative umbrella limit are kept -- this removes one
#     row (of 1,000) with a negative umbrella policy limit (the comparison is
#     also False for a missing limit).
df = (
    pd.read_csv(CLAIMS_CSV, na_values=['?'])
    .loc[lambda claims: claims['umbrella_limit'] >= 0]
    .dropna(axis=0)
)

# dropna() inspects every column, so a single all-empty column would silently
# discard the whole dataset; fail here rather than deep inside the model cells.
if df.empty:
    raise ValueError(f'No complete rows left after cleaning {CLAIMS_CSV!r}')

In [None]:
# Columns carried into the model: the feature columns plus the target label
# ('fraud_reported', last). The order is kept stable because it determines the
# column order of everything derived from `data`.
col_names = [
    'months_as_customer',
    'age',
    'policy_state',
    'policy_deductable',
    'policy_annual_premium',
    'umbrella_limit',
    'insured_zip',
    'insured_sex',
    'insured_education_level',
    'insured_occupation',
    'insured_hobbies',
    'insured_relationship',
    'capital-gains',
    'capital-loss',
    'incident_type',
    'collision_type',
    'incident_severity',
    'authorities_contacted',
    'incident_state',
    'incident_hour_of_the_day',
    'number_of_vehicles_involved',
    'property_damage',
    'bodily_injuries',
    'witnesses',
    'police_report_available',
    'total_claim_amount',
    'injury_claim',
    'property_claim',
    'vehicle_claim',
    'fraud_reported',
]

# Restrict the cleaned claims frame to the modelling columns and preview it.
data = df.loc[:, col_names]
data.head()

In [None]:
# Columns to one-hot encode. The order of this list is kept as-is because it
# determines the order of the generated indicator columns.
categorical_cols = [
    'insured_sex',
    'insured_education_level',
    'insured_occupation',
    'police_report_available',
    'insured_hobbies',
    'insured_relationship',
    'incident_type',
    'collision_type',
    'incident_severity',
    'authorities_contacted',
    'incident_state',
    'property_damage',
    'policy_state',
]

# Replace each listed column with one '<column>_<value>' indicator per value;
# all other columns (including the target) pass through unchanged.
data_binary_encoded = pd.get_dummies(data, columns=categorical_cols)
data_binary_encoded.head()

In [None]:
# Remove the '_NO' indicators of the two report/damage flags.
# NOTE(review): presumably each is the exact complement of its '_YES' column
# (rows with '?' were dropped at load time) -- confirm no other values remain.
redundant_dummies = ['police_report_available_NO', 'property_damage_NO']
data_binary_encoded = data_binary_encoded.drop(columns=redundant_dummies)

# Summary of the remaining columns, dtypes and non-null counts.
data_binary_encoded.info()

In [None]:
# Using Random Forest for Classification
# split dataset in features and target variable
y = data_binary_encoded['fraud_reported'].map({'Y': 1, 'N': 0})

# .map() silently turns any label other than 'Y'/'N' into NaN; stop here with a
# clear message instead of letting the classifier fail later on a NaN target.
if y.isna().any():
    raise ValueError("fraud_reported contains values other than 'Y' and 'N'")

X = data_binary_encoded.drop(columns=['fraud_reported'])

# Hold out 20% of the rows for evaluation. stratify=y keeps the fraud /
# non-fraud proportion the same in both partitions, so the test metrics are not
# skewed by an unlucky draw of the (usually rarer) fraud class. random_state
# keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

In [None]:
# Feature Scaling
# Learn the scaling statistics from the training rows only, then apply that
# same transformation to both partitions so the test rows never influence it.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [None]:
# Fit a 100-tree random forest on the scaled training data (seeded so repeated
# runs give the same model), then predict labels for the held-out test rows.
classifier = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [None]:
# Evaluate the predictions against the held-out labels, printing in order:
# the confusion matrix, the per-class classification report, and the accuracy.
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, y_pred))

In [None]:
# which features have most weight toward predicting fraud?
# Raw importances from the fitted forest: one value per column of X, in X's
# column order (the model was trained on the scaled values of those columns).
importances = classifier.feature_importances_

# An unlabeled array cannot answer the question above without matching
# positions to column names by hand, so label each value with its feature name
# and show the ten largest, highest first.
importance_by_feature = pd.Series(importances, index=X.columns).sort_values(ascending=False)
importance_by_feature.head(10)

In [None]:
# Name of the feature with the largest importance. The position is looked up
# from the fitted model instead of being hard-coded, so the answer stays correct
# if the column list, the encoding, the split or the data change.
# When this notebook was written the result was 'incident_severity_Major Damage'.
# Note: importance measures how much the forest relies on a feature, not the
# direction of its effect; whether major damage makes fraud more likely has to
# be checked separately (e.g. by comparing fraud rates per severity level).
X.columns[np.argmax(importances)]

In [None]:
# ML code above here

In [None]:
# Fraud and umbrella-coverage rates over the cleaned claims frame.
# Row masks are built once and combined, instead of filtering the frame
# separately for every count.
is_fraud = df['fraud_reported'] == 'Y'
has_umbrella = df['umbrella_limit'] > 0
no_umbrella = df['umbrella_limit'] == 0

# Counts are cast to plain ints so the printed percentages are ordinary floats.
total_count = len(df)
fraud_count = int(is_fraud.sum())
umbrella_count = int(has_umbrella.sum())
fraudumbrella_count = int((is_fraud & has_umbrella).sum())
fraudnonumbrella_count = int((is_fraud & no_umbrella).sum())

print(f'fraud % of total: {100*fraud_count / total_count}')
print(f'umbrella % of total: {100*umbrella_count / total_count}')
print(f'fraud % of umbrella: {100*fraudumbrella_count / umbrella_count}')
# Negative limits were filtered out at load time, so "non-umbrella" (all rows
# minus the umbrella rows) is the set of rows whose limit is exactly 0.
print(f'fraud % of non-umbrella: {100*fraudnonumbrella_count / (total_count - umbrella_count)}')

In [None]:
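# The following values were very evenly distributed, e.g., the three policy states are about 1/3 each.
# Snippets kept for reference only. They are written against `df` because `X` is one-hot encoded
# (it has no plain 'policy_state' column) and never included 'policy_csl'.
# NOTE(review): `fraudumbrella` was not defined in this notebook; it is assumed to be the fraud rows
# with an umbrella limit > 0. Also confirm that `df` still carries a 'policy_csl' column.
# umbrella_limit = df.loc[(df['fraud_reported'] == 'Y') & (df['umbrella_limit'] > 0), 'umbrella_limit'].value_counts()
# state = df['policy_state'].value_counts()
# csl = df['policy_csl'].value_counts()
# deductable = df['policy_deductable'].value_counts()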