In [None]:
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
# from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt

# Load the claims table; the literal '?' in the CSV is read as a missing value.
# TODO: the final version will read from the database and use environment variables.
df = (
    pd.read_csv('insurance_claims.csv', na_values=['?'])
    # Keep only rows with a non-negative umbrella policy limit
    # (per the original note this removes one row of 1,000).
    .loc[lambda claims: claims['umbrella_limit'] >= 0]
    # Discard every remaining row that has a missing value in any column.
    .dropna(axis=0)
)

In [None]:
# --- Machine-learning section starts here: decision-tree classification ---
# Debug aid, left disabled: print(df.isna().sum())
# Separate the label column from the remaining columns of the cleaned frame.
# Note: both names are assigned again, from the one-hot encoded frame, in the
# later "split dataset in features and target variable" cell.
X = df.drop('fraud_reported', axis=1)
y = df.loc[:, 'fraud_reported']

In [None]:
# Columns carried into the model, listed in the order they appear in `data`;
# the last entry, 'fraud_reported', is the label.
col_names = [
    'months_as_customer', 'age', 'policy_state', 'policy_deductable',
    'policy_annual_premium', 'umbrella_limit', 'insured_zip', 'insured_sex',
    'insured_education_level', 'insured_occupation', 'insured_hobbies',
    'insured_relationship', 'capital-gains', 'capital-loss', 'incident_type',
    'collision_type', 'incident_severity', 'authorities_contacted',
    'incident_state', 'incident_hour_of_the_day',
    'number_of_vehicles_involved', 'property_damage', 'bodily_injuries',
    'witnesses', 'police_report_available', 'total_claim_amount',
    'injury_claim', 'property_claim', 'vehicle_claim', 'fraud_reported',
]
# Take just those columns from the cleaned frame (no re-read of the CSV).
data = df.loc[:, col_names]
# Preview the first rows of the selection.
data.head()

In [None]:
# Columns that pd.get_dummies expands into one indicator column per distinct
# value. The order of this list determines the order of the generated columns.
categorical_columns = [
    'insured_sex',
    'insured_education_level',
    'insured_occupation',
    'police_report_available',
    'insured_hobbies',
    'insured_relationship',
    'incident_type',
    'collision_type',
    'incident_severity',
    'authorities_contacted',
    'incident_state',
    'property_damage',
    'policy_state',
]
data_binary_encoded = pd.get_dummies(data, columns=categorical_columns)
# Preview the encoded frame.
data_binary_encoded.head()

In [None]:
# Remove the '_NO' indicator columns of the two yes/no features (presumably the
# matching '_YES' indicators remain and carry the same information -- confirm
# against the .info() listing below).
# The result is reassigned instead of mutated in place, and errors='ignore'
# makes the cell safe to re-run: a second execution finds the columns already
# gone and is a no-op rather than raising KeyError.
data_binary_encoded = data_binary_encoded.drop(
    columns=['police_report_available_NO', 'property_damage_NO'],
    errors='ignore',
)
# Summarise the remaining columns, their dtypes and non-null counts.
data_binary_encoded.info()

In [None]:
# Separate the encoded frame into the feature matrix X and the label series y:
# work on a copy so data_binary_encoded itself is left untouched, then pull
# the label column out of that copy.
X = data_binary_encoded.copy()
y = X.pop('fraud_reported')

In [None]:
# Hold out 30% of the rows for testing and train on the remaining 70%.
# random_state is fixed so the partition is the same on every run.
# NOTE(review): no `stratify` argument is passed, so the class ratio of y is
# not guaranteed to be the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    test_size=0.3,
)

In [None]:
# Create the decision-tree classifier.
# random_state is pinned (to the same seed used for the train/test split)
# because scikit-learn randomly permutes the candidate features at every
# split; without a seed, ties between equally good splits can be broken
# differently on each run, changing the fitted tree and the reported metrics.
clf = DecisionTreeClassifier(random_state=1)

# Train the classifier on the training partition (fit returns the estimator).
clf = clf.fit(X_train, y_train)

# Predict the label for every row of the held-out test partition.
y_pred = clf.predict(X_test)

In [None]:
# Fraction of test rows whose predicted label matches the true label.
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-class text report comparing the true and predicted test labels.
report = metrics.classification_report(y_test, y_pred)
print(report)

In [None]:
# Alias for the tree fitted on all encoded feature columns.
model_all_params = clf

# Prepare a figure/axes pair with a set size and draw onto it explicitly.
fig, ax = plt.subplots(figsize=(20, 10))

# Plot the decision tree, showing the decisive values and the improvements in
# Gini impurity along the way. Passing feature_names and class_names labels
# each node with the real column name and the predicted class instead of
# positional placeholders such as X[12]. Both are converted to plain lists
# of strings, the form the plot_tree documentation describes.
tree.plot_tree(
    model_all_params,
    feature_names=list(X_train.columns),
    class_names=[str(label) for label in model_all_params.classes_],
    filled=True,
    ax=ax,
)

# Display the tree plot figure.
plt.show()

In [None]:
# ML code above here

In [None]:
# Row-level boolean masks over the cleaned claims frame.
is_fraud = df['fraud_reported'] == 'Y'
has_umbrella = df['umbrella_limit'] > 0
no_umbrella = df['umbrella_limit'] == 0

# Row counts for the whole frame and for each subgroup (kept as Python ints).
total_count = len(df)
fraud_count = int(is_fraud.sum())
umbrella_count = int(has_umbrella.sum())
fraudumbrella_count = int((is_fraud & has_umbrella).sum())
fraudnonumbrella_count = int((is_fraud & no_umbrella).sum())

# Report each share as a percentage. The last line divides by
# (total_count - umbrella_count), i.e. the rows not counted as umbrella rows.
print(f'fraud % of total: {100*fraud_count / total_count}')
print(f'umbrella % of total: {100*umbrella_count / total_count}')
print(f'fraud % of umbrella: {100*fraudumbrella_count / umbrella_count}')
print(f'fraud % of non-umbrella: {100*fraudnonumbrella_count / (total_count - umbrella_count)}')

In [None]:
# The following values were very evenly distributed, e.g., three states about 1/3 each:
# umbrella_limit = fraudumbrella['umbrella_limit'].value_counts()
# state = X['policy_state'].value_counts()
# csl = X['policy_csl'].value_counts()
# deductable = X['policy_deductable'].value_counts()