### Problem Statement:
Business case:
Insurance fraud is a huge problem in the industry, and fraudulent claims are difficult to identify. Machine learning is in a unique position to help the auto insurance industry with this problem.

In this project, you are provided with a dataset that contains the details of each insurance policy along with the customer details. It also contains the details of the accident on which each claim is based.

In this example, you will be working with some auto insurance data to demonstrate how you can create a predictive model that predicts if an insurance claim is fraudulent or not. 

In [None]:
#Importing libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.metrics import accuracy_score 
from sklearn.metrics import confusion_matrix, classification_report 
from sklearn.model_selection import train_test_split
from scipy.stats import zscore #to remove outliers
from scipy.stats import skew
import requests
import pandas_profiling
import io
import warnings
warnings.filterwarnings('ignore')

In [None]:
#Importing dataset

In [None]:
df = pd.read_csv("Automobile_insurance_fraud.csv")

# eda

In [None]:
df.head(8)

In [None]:
df

In [None]:
df.shape # check the data dimension

In [None]:
df.dtypes

In [None]:
df.info()

In [None]:
df.nunique()

In [None]:
# Remove the `_c39` and `policy_number` columns from the frame in place.
df.drop(
    columns=["_c39", "policy_number"],
    inplace=True,
)

In [None]:
df.columns.astype(object) # check the column names for EDA

In [None]:
# Bar chart of the number of rows per `policy_state` value.
sns.countplot(data=df, x='policy_state')

In [None]:
# Bar chart of the number of rows per `insured_sex` value.
sns.countplot(data=df, x='insured_sex')

In [None]:
# Bar chart of the number of rows per `insured_education_level` value.
sns.countplot(data=df, x='insured_education_level')

In [None]:
# Bar chart of the number of rows per `incident_type` value.
sns.countplot(data=df, x='incident_type')

In [None]:
# Bar chart of the number of rows per `collision_type` value.
sns.countplot(data=df, x='collision_type')

In [None]:
# Bar chart of the number of rows per `property_damage` value.
sns.countplot(data=df, x='property_damage')

In [None]:
# Bar chart of the number of rows per `bodily_injuries` value.
sns.countplot(data=df, x='bodily_injuries')

In [None]:
# Bar chart of the number of rows per `witnesses` value.
sns.countplot(data=df, x='witnesses')

In [None]:
# Working frame for the encoding steps: an explicit selection of the columns
# of `df`, ending with the `fraud_reported` target.
#
# `.copy()` makes df1 an independent frame. It is modified in place further
# down (columns are dropped with inplace=True); doing that on a bare selection
# of `df` triggers pandas' chained-assignment warning, which the global
# `warnings.filterwarnings('ignore')` at the top of the notebook would hide.
df1 = df[['months_as_customer', 'age', 'policy_bind_date', 'policy_state',
          'policy_csl', 'policy_deductable', 'policy_annual_premium',
          'umbrella_limit', 'insured_zip', 'insured_sex',
          'insured_education_level', 'insured_occupation', 'insured_hobbies',
          'insured_relationship', 'capital-gains', 'capital-loss',
          'incident_date', 'incident_type', 'collision_type', 'incident_severity',
          'authorities_contacted', 'incident_state', 'incident_city',
          'incident_location', 'incident_hour_of_the_day',
          'number_of_vehicles_involved', 'property_damage', 'bodily_injuries',
          'witnesses', 'police_report_available', 'total_claim_amount',
          'injury_claim', 'property_claim', 'vehicle_claim', 'auto_make',
          'auto_model', 'auto_year', 'fraud_reported']].copy()

In [None]:
# Display the policy, insured-person and first incident columns
# (the frame is too wide to inspect in a single view).
df.loc[:, [
    'months_as_customer', 'age', 'policy_bind_date', 'policy_state',
    'policy_csl', 'policy_deductable', 'policy_annual_premium',
    'umbrella_limit', 'insured_zip', 'insured_sex',
    'insured_education_level', 'insured_occupation', 'insured_hobbies',
    'insured_relationship', 'capital-gains', 'capital-loss',
    'incident_date', 'incident_type', 'collision_type', 'incident_severity',
]]

In [None]:
# Display the remaining incident, claim and vehicle columns plus the
# `fraud_reported` target.
df.loc[:, [
    'authorities_contacted', 'incident_state', 'incident_city',
    'incident_location', 'incident_hour_of_the_day',
    'number_of_vehicles_involved', 'property_damage', 'bodily_injuries',
    'witnesses', 'police_report_available', 'total_claim_amount',
    'injury_claim', 'property_claim', 'vehicle_claim', 'auto_make',
    'auto_model', 'auto_year', 'fraud_reported',
]]

In [None]:
# df2 holds the columns of df1 that are handled as numeric features
# (they are excluded from label encoding and joined back afterwards).
df2 = df1.loc[:, [
    'months_as_customer', 'age', 'policy_deductable', 'policy_annual_premium',
    'umbrella_limit', 'capital-gains', 'capital-loss',
    'incident_hour_of_the_day', 'number_of_vehicles_involved',
    'bodily_injuries', 'witnesses', 'total_claim_amount', 'injury_claim',
    'property_claim', 'vehicle_claim', 'auto_year',
]]

In [None]:
# Make sure every column of df2 is numeric without losing precision.
# A blanket `astype(int)` silently truncates any fractional values (for
# example a premium with cents); `pd.to_numeric` keeps integer columns as
# integers and float columns as floats, and still raises on non-numeric data.
df2 = df2.apply(pd.to_numeric)

In [None]:
# df2 already holds the numeric columns, so remove exactly those columns from
# df1; what remains in df1 is label-encoded in the next step.
df1.drop(columns=list(df2.columns), inplace=True)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Label-encode every remaining column of df1 (this includes the
# `fraud_reported` target): each column's values are replaced in that same
# column by integer codes.
#
# One fitted encoder is kept per column so the integer codes can be mapped
# back to the original labels later, e.g.
# `label_encoders['fraud_reported'].inverse_transform(...)`.
label_encoders = {col: LabelEncoder().fit(df1[col]) for col in df1.columns}
df1 = df1.apply(lambda column: label_encoders[column.name].transform(column))

In [None]:
# Put the label-encoded columns (df1) and the numeric columns (df2) back
# side by side; both frames share the same row index.
df1 = pd.concat([df1, df2], axis=1)

In [None]:
df1.info()

## handle null values

In [None]:
df=df1

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Heatmap of the boolean missing-value mask of df (one cell per row/column).
plt.figure(figsize=(10, 10))
sns.heatmap(df.isnull()).set_title("Null Values")
plt.show()

In [None]:
# Draw a filled kernel-density estimate for every column, one figure each,
# printing the column name above its plot.
for col in df.columns:
    print(col)

    plt.figure()
    # `fill=True` replaces kdeplot's deprecated `shade=True` keyword; the data
    # is passed as `x=` so the call does not depend on positional order.
    sns.kdeplot(x=df[col], fill=True)
    plt.show()

In [None]:
#Dataset is imbalanced

In [None]:
df.columns

In [None]:
# Build the pandas-profiling report object for the current frame.
pre_profile = pandas_profiling.ProfileReport(df, title="Automobile_insurance_fraud")

In [None]:
pre_profile

## check for outliers

In [None]:
#remove outliers before skewness check and before x, y split

In [None]:
# Box plot of every column on one wide figure, with a larger bottom margin.
df.boxplot(figsize=(20, 8))
plt.gcf().subplots_adjust(bottom=0.25)
plt.show()

In [None]:
#Removing outliers by z score

In [None]:
from scipy.stats import zscore

# Absolute z-score of every value, computed column by column.
z = np.abs(zscore(df))
# Keep only the rows in which every column lies within 3 standard deviations.
new_df = df.loc[(z < 3).all(axis=1)]

In [None]:
new_df.shape

In [None]:
df.shape

In [None]:
# Percentage of rows removed by the outlier filter, computed from the actual
# frame sizes instead of row counts copied by hand from an earlier run.
# (Run this before `df` is rebound to `new_df`, otherwise the result is 0.)
dataloss = (len(df) - len(new_df)) / len(df) * 100

In [None]:
dataloss

In [None]:
#Outliers are removed since data loss is less than 7%.
df = new_df

## check correlation

In [None]:
# Pairwise correlation between all columns, drawn as an annotated heatmap.
# `cor` is reused by the following cell.
cor = df.corr()
plt.figure(figsize=(22, 12))
sns.heatmap(cor, annot=True)
plt.show()

In [None]:
#Arrange the correlations with the target in descending order. Dropping columns should be the last option to prevent data loss.

In [None]:
# Correlation of every column with the target, strongest positive first.
cor.loc[:, 'fraud_reported'].sort_values(ascending=False)

In [None]:
# Remove `incident_date` from the modelling frame.
# `df` is the same object as `new_df` (the outlier-filtered selection), so the
# column is dropped by rebinding `df` to a new frame rather than in place:
# this leaves `new_df` untouched and avoids pandas' chained-assignment
# warning, which the global warnings filter would otherwise hide.
df = df.drop(columns='incident_date')

In [None]:
df.columns

## check for skewness

In [None]:
# Separate the target column (y) from the predictor columns (x).
y = df['fraud_reported']
x = df.drop(columns='fraud_reported')

In [None]:
x.skew() # check skewness

In [None]:
from sklearn.preprocessing import power_transform

# Apply a power transform to the features to reduce skewness.
# power_transform returns a plain NumPy array, so the DataFrame is rebuilt
# with the original column names AND the original row index. The index matters
# because rows were removed by the outlier filter: without `index=x.index` the
# new frame would get a fresh 0..n-1 index while `y` keeps the original
# labels, so the two would no longer align by label.
# NOTE(review): the transform is fitted on all rows before the train/test
# split, so test rows influence the transformed training features; consider
# fitting it on the training split only.
df_new = pd.DataFrame(power_transform(x), columns=x.columns, index=x.index)

In [None]:
df_new.skew()

In [None]:
# Skewness and kurtosis of each feature, one row per feature.
x.agg(['skew', 'kurtosis']).T

In [None]:
x = df_new

## find best random state

In [None]:
#for classification problems

In [None]:
from sklearn.linear_model import LogisticRegression

# Try train/test split seeds 1..199 and remember the seed whose logistic
# regression reaches the highest accuracy on its 30% test split.
# NOTE(review): the seed is chosen by looking at test-set accuracy, so the
# reported "best accuracy" is an optimistic estimate.
maxAccu, maxRS = 0, 0
for i in range(1, 200):
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=.30, random_state=i
    )
    LR = LogisticRegression()
    LR.fit(x_train, y_train)
    predLR = LR.predict(x_test)
    acc = accuracy_score(y_test, predLR)
    if acc <= maxAccu:
        # Not an improvement; keep the earlier seed.
        continue
    maxAccu, maxRS = acc, i
print("Best accuracy is", maxAccu," on Random State ",maxRS)

In [None]:
#We have found the best random state. We will create our train_test_split using this random state.

## test train split

In [None]:
# Re-create the 70/30 split with the seed found by the search above (`maxRS`)
# instead of a hard-coded number copied from an earlier run, which goes stale
# whenever the data or library versions change.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=maxRS)

## handle data imbalance

In [None]:
# Class balance of the target. The column is passed by keyword, like the other
# count plots in this notebook: current seaborn versions read a lone
# positional argument as `data`, not as the `x` variable.
sns.countplot(x='fraud_reported', data=df)

In [None]:
y.value_counts()

In [None]:
# Install imbalanced-learn, which provides the `imblearn` package imported in
# the next cell for SMOTE.
!pip install imbalanced_learn
# NOTE(review): `delayed` is not imported anywhere in the visible notebook;
# presumably a workaround for a dependency issue. Confirm it is still needed.
!pip install delayed

In [None]:
from imblearn.over_sampling import SMOTE
oversample=SMOTE(k_neighbors=4)
#transform the dataset
x,y=oversample.fit_resample(x,y)

In [None]:
y.value_counts()

### classification 

In [None]:
#logistic_regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings: fit on the training split, then
# report accuracy, confusion matrix and per-class metrics on the test split.
LR = LogisticRegression().fit(x_train, y_train)
predlr = LR.predict(x_test)

print(
    accuracy_score(y_test, predlr),
    confusion_matrix(y_test, predlr),
    classification_report(y_test, predlr),
    sep="\n",
)
# Test accuracy as a percentage, compared with the cross-validation score later.
lr_acc = 100 * accuracy_score(y_test, predlr)
lr_acc

In [None]:
#support vector machines 

In [None]:
from sklearn.svm import SVC

# Support vector classifier with default settings, evaluated on the test split.
svc = SVC()
svc.fit(x_train, y_train)
predsvc = svc.predict(x_test)

# Test accuracy as a percentage, compared with the cross-validation score later.
svc_acc = 100 * accuracy_score(y_test, predsvc)

print(
    accuracy_score(y_test, predsvc),
    confusion_matrix(y_test, predsvc),
    classification_report(y_test, predsvc),
    sep="\n",
)
svc_acc

In [None]:
#decisiontree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default settings, evaluated on the test split.
dt = DecisionTreeClassifier().fit(x_train, y_train)
preddt = dt.predict(x_test)

print(
    accuracy_score(y_test, preddt),
    classification_report(y_test, preddt),
    confusion_matrix(y_test, preddt),
    sep="\n",
)
# Test accuracy as a percentage, compared with the cross-validation score later.
dt_acc = 100 * accuracy_score(y_test, preddt)
dt_acc

In [None]:
#random forest 

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 1000 trees, evaluated on the test split.
rf = RandomForestClassifier(n_estimators=1000)
rf.fit(x_train, y_train)
predrf = rf.predict(x_test)

# Test accuracy as a percentage, compared with the cross-validation score later.
rf_acc = 100 * accuracy_score(y_test, predrf)

print(
    accuracy_score(y_test, predrf),
    classification_report(y_test, predrf),
    confusion_matrix(y_test, predrf),
    sep="\n",
)
rf_acc

## check cross-validation scores

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the logistic regression model on x, y;
# the mean fold score is also kept as a percentage in `lr_cv`.
scr = cross_val_score(estimator=LR, X=x, y=y, cv=5)
lr_cv = 100 * scr.mean()
print("CrossValidation Score of LogisticRegression Model: ", scr.mean())
lr_cv

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the SVC model on x, y;
# the mean fold score is also kept as a percentage in `svc_cv`.
scr = cross_val_score(estimator=svc, X=x, y=y, cv=5)
svc_cv = 100 * scr.mean()
print("CrossValidation Score of SVC Model: ", scr.mean())
svc_cv

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the decision tree model on x, y;
# the mean fold score is also kept as a percentage in `dt_cv`.
scr = cross_val_score(estimator=dt, X=x, y=y, cv=5)
dt_cv = 100 * scr.mean()
print("CrossValidation Score of DecisionTree Model: ", scr.mean())
dt_cv

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the random forest model on x, y;
# the mean fold score is also kept as a percentage in `rf_cv`.
scr = cross_val_score(estimator=rf, X=x, y=y, cv=5)
rf_cv = 100 * scr.mean()
print("CrossValidation Score of RandomForest Model: ", scr.mean())
rf_cv

In [None]:
#Model with least difference between Model accuracy and cross validation is selected as the best model
#LR -> 
lr_acc - lr_cv

In [None]:
# SVM -> 
svc_acc - svc_cv

In [None]:
# DT -> 
dt_acc - dt_cv

In [None]:
# RF -> 
rf_acc - rf_cv

## hyper parameter tuning

In [None]:
### Manual Hyperparameter Tuning
# Random forest with hand-picked settings, fitted on the training split and
# evaluated on the test split.
model = RandomForestClassifier(
    n_estimators=300,
    criterion='entropy',
    max_features='sqrt',
    min_samples_leaf=10,
    random_state=100,
)
model.fit(x_train, y_train)
predictions = model.predict(x_test)
print(
    confusion_matrix(y_test, predictions),
    accuracy_score(y_test, predictions),
    classification_report(y_test, predictions),
    sep="\n",
)

In [None]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV

# Search space for the randomized random-forest search.
# (The comprehension variables are not called `x`, to avoid confusion with the
# feature matrix `x`.)

# Number of trees in random forest
n_estimators = [int(n) for n in np.linspace(start=200, stop=2000, num=10)]
# Number of features to consider at every split.
# 'auto' is deliberately not listed: for classifiers it was only an alias of
# 'sqrt', it was deprecated in scikit-learn 1.1 and removed in 1.3, where
# passing it makes the fit fail.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree
max_depth = [int(d) for d in np.linspace(10, 1000, 10)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 14]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4, 6, 8]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'criterion': ['entropy', 'gini']}
print(random_grid)

In [None]:
# Randomized search over `random_grid`: 100 sampled parameter combinations,
# each scored with 3-fold cross-validation on the training split.
rf = RandomForestClassifier()
rf_randomcv = RandomizedSearchCV(
    rf,
    random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=100,
    n_jobs=-1,
)
### fit the randomized model
rf_randomcv.fit(x_train, y_train)

In [None]:
rf_randomcv.best_params_

In [None]:
rf_randomcv

In [None]:
best_random_grid=rf_randomcv.best_estimator_

In [None]:
from sklearn.metrics import accuracy_score

# Evaluate the best estimator found by the randomized search on the test split.
y_pred = best_random_grid.predict(x_test)
print(
    confusion_matrix(y_test, y_pred),
    "Accuracy Score {}".format(accuracy_score(y_test, y_pred)),
    "Classification report: {}".format(classification_report(y_test, y_pred)),
    sep="\n",
)

## Grid Search CV

In [None]:
rf_randomcv.best_params_

In [None]:
from sklearn.model_selection import GridSearchCV

# Narrow grid centred on the best parameters found by the randomized search.
best_params = rf_randomcv.best_params_

# Candidate values around the best ones. Values scikit-learn rejects are
# filtered out: an integer `min_samples_split` must be at least 2 and
# `n_estimators` at least 1. Without the filter, a best `min_samples_split` of
# 2 would put 0 and 1 in the grid, and a best `n_estimators` of 200 would put
# 0 in it, so those fits would fail.
min_samples_split_grid = [
    best_params['min_samples_split'] + step
    for step in (-2, -1, 0, 1, 2)
    if best_params['min_samples_split'] + step >= 2
]
n_estimators_grid = [
    best_params['n_estimators'] + step
    for step in (-200, -100, 0, 100, 200)
    if best_params['n_estimators'] + step >= 1
]

param_grid = {
    'criterion': [best_params['criterion']],
    'max_depth': [best_params['max_depth']],
    'max_features': [best_params['max_features']],
    'min_samples_leaf': [best_params['min_samples_leaf'] + step for step in (0, 2, 4)],
    'min_samples_split': min_samples_split_grid,
    'n_estimators': n_estimators_grid,
}

print(param_grid)

In [None]:
#### Fit the grid_search to the data
# Exhaustive search over `param_grid`, scored with 10-fold cross-validation
# on the training split.
rf = RandomForestClassifier()
grid_search = GridSearchCV(
    rf,
    param_grid,
    cv=10,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(x_train, y_train)

In [None]:
grid_search.best_estimator_

In [None]:
best_grid=grid_search.best_estimator_

In [None]:
best_grid

In [None]:
# Evaluate the best estimator found by the grid search on the test split.
y_pred = best_grid.predict(x_test)
print(
    confusion_matrix(y_test, y_pred),
    "Accuracy Score {}".format(accuracy_score(y_test, y_pred)),
    "Classification report: {}".format(classification_report(y_test, y_pred)),
    sep="\n",
)

## saving the model

In [None]:
import joblib

# Persist the tuned estimator chosen by the grid search.
joblib.dump(best_grid, "model.pkl") #rename as per project name

# Reload the saved file and predict with it, so the prediction comes from the
# model that was actually saved. Predicting with `model` used the manually
# tuned forest from the hyper-parameter section, not the persisted estimator.
loaded_model = joblib.load("model.pkl")
prediction = loaded_model.predict(x_test)