![image](https://i.ytimg.com/vi/TbEZIZo4PZ8/maxresdefault.jpg)

# Predicting auto and insurance fraud in general:
Insurance is a contract, represented by a policy, in which an individual or entity receives financial protection or reimbursement against losses from an insurance company. The company pools clients' risks to make payments more affordable for the insured.
# Insurance Policy Components
When choosing a policy, it is important to understand how insurance works.

A firm understanding of these concepts goes a long way in helping you choose the policy that best suits your needs. For instance, whole life insurance may or may not be the right type of life insurance for you. There are three components of any type of insurance (premium, policy limit, and deductible) that are crucial.

> The goal of this note, Kuho Hwa, is to give a simplified, structured analysis that explains the dirty process called fraud (and its absence) using exploratory data analysis and machine learning models.

# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns


# Read data 

In [None]:
# Load the claims report CSV into the DataFrame `db`, which every later cell reads.
# NOTE(review): the relative '../input/...' path looks like a Kaggle dataset mount — adjust when running elsewhere.
db=pd.read_csv('../input/insurance-claim-report/insurance_claims_report.csv')

In [None]:
# Preview the first ten rows.
db.head(10)

In [None]:
# (number of rows, number of columns).
db.shape

In [None]:
# Column names, dtypes and non-null counts.
db.info()

In [None]:
# Summary statistics; by default pandas summarises the numeric columns when any are present.
db.describe()

In [None]:
# Column totals. Restricted to numeric columns: summing a text column
# concatenates every value into one long string (or raises when values are
# missing), which is never the intended summary.
db.sum(numeric_only=True)

In [None]:
# Number of non-null values in each column.
db.count()

In [None]:
# Raise the column display limit to 500 so wide frames are not elided, then show the frame.
pd.set_option('display.max_columns', 500)
db

In [None]:
# Column means. numeric_only=True is required on pandas >= 2.0, where
# DataFrame.mean() raises TypeError if any column is non-numeric instead of
# silently skipping it.
db.mean(numeric_only=True)

In [None]:
# Per-column maximum.
db.max()

In [None]:
# Per-column minimum.
db.min()

In [None]:
# Transposed view: original columns become row labels; keep only the first five records.
db.transpose().iloc[:,:5]

In [None]:
# Missing-value check. Count the nulls per column instead of rendering the
# full True/False mask, which the notebook truncates and cannot be read at a
# glance.
db.isnull().sum()

In [None]:
# Summary statistics with one row per column (.T is the transpose accessor).
db.describe().T

In [None]:
# Scatter of 'age' against 'months_as_customer' with a fitted regression line.
ax = sns.regplot(data=db, x='age', y='months_as_customer')

In [None]:
# Frequency of each distinct 'age' value: print the table, then chart the same Series.
p = db['age'].value_counts()
print(p)
p.plot.bar()

In [None]:
# Frequency of each distinct 'months_as_customer' value: print the table, then chart the same Series.
p = db['months_as_customer'].value_counts()
print(p)
p.plot.bar()

In [None]:
# Bar per distinct 'age' value showing how many rows carry it.
sns.countplot(x='age', data=db)

In [None]:
# Tick sizing for the figure, then a 30-bin histogram of the 'age' column.
sns.set_style("ticks", {"xtick.major.size": 8, "ytick.major.size": 8})
db['age'].hist(bins=30, color="red", edgecolor='black', alpha=0.65, lw=1.5)
# The vertical axis of a histogram is the number of rows per bin, not the age.
plt.ylabel('count')
plt.xlabel('age')


In [None]:
sns.jointplot(x = 'age', y = 'months_as_customer',
              data = db, kind = 'kde', color = "red", alpha = 0.65)

In [None]:
sns.jointplot(x = 'age', y = 'policy_deductable',
              data = db, kind = 'kde', color = "red", alpha = 0.65)

In [None]:

sns.jointplot(x = 'age', y = 'injury_claim',
              data = db, kind = 'kde', color = "red", alpha = 0.65)

In [None]:
sns.jointplot(x = 'age', y ='policy_deductable', 
              data = db,hue = 'fraud_reported',kind = 'kde', color = "red", alpha = 0.65)

In [None]:
sns.pairplot(db, hue = 'fraud_reported', palette = 'cool_r')


# Let's apply our classification models one by one:
1) Logistic Regression:

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [None]:
# Show the available column names (the feature list in the next cell is picked from these).
db.columns

In [None]:
X = db[['months_as_customer', 'age', 'policy_annual_premium',
       'insured_education_level', 'auto_year', 'fraud_reported', 'property_damage',
       'umbrella_limit', 'incident_hour_of_the_day', 'incident_severity', 'authorities_contacted']]
y = db['fraud_reported']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [None]:
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
X, y = make_classification(random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
pipe = make_pipeline(StandardScaler(), LogisticRegression())
pipe.fit(X_train, y_train)  # apply scaling on training data

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
lr = LogisticRegression()
lr.fit(X_train,y_train)

In [None]:
predictions1 = lr.predict(X_test)

In [None]:
print(classification_report(y_test,predictions1))
print('\n')
print(confusion_matrix(y_test,predictions1))

In [None]:
from sklearn.metrics import accuracy_score
acclr = accuracy_score(y_test,predictions1)*100
acclr

# Naïve Bayes:

In [None]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)

In [None]:
gnb

In [None]:
predictions2 = gnb.predict(X_test)
print(classification_report(y_test,predictions2))
print('\n')
print(confusion_matrix(y_test,predictions2))

In [None]:
from sklearn.metrics import accuracy_score
accgnb = accuracy_score(y_test,predictions2)*100
accgnb

# Stochastic Gradient Descent:

In [None]:
from sklearn.linear_model import SGDClassifier
sgd = SGDClassifier()
sgd.fit(X_train,y_train)
sgd

In [None]:
predictions3 = sgd.predict(X_test)
print(classification_report(y_test,predictions3))
print('\n')
print(confusion_matrix(y_test,predictions3))

# K-Nearest Neighbours:

In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 15)
knn.fit(X_train,y_train)

In [None]:
predictions4 = knn.predict(X_test)
print(classification_report(y_test,predictions4))
print('\n')
print(confusion_matrix(y_test,predictions4))

In [None]:
from sklearn.metrics import accuracy_score
accknn = accuracy_score(y_test,predictions4)*100
accknn

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier()
dtree.fit(X_train,y_train)

In [None]:
predictions5 = dtree.predict(X_test)
print(classification_report(y_test,predictions5))
print('\n')
print(confusion_matrix(y_test,predictions5))

In [None]:
from sklearn.metrics import accuracy_score
accdtree = accuracy_score(y_test,predictions5)*100
accdtree

# Random Forest:

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=200)
rfc.fit(X_train,y_train)

In [None]:
predictions6 = rfc.predict(X_test)
print(classification_report(y_test,predictions6))
print('\n')
print(confusion_matrix(y_test,predictions6))

In [None]:
from sklearn.metrics import accuracy_score
accrfc = accuracy_score(y_test,predictions6)*100
accrfc

#  Support Vector Machine:

In [None]:
from sklearn.svm import SVC
svc_model = SVC()
svc_model.fit(X_train,y_train)

In [None]:
predictions7 = svc_model.predict(X_test)
print(classification_report(y_test,predictions7))
print('\n')
print(confusion_matrix(y_test,predictions7))

In [None]:
from sklearn.metrics import accuracy_score
accSVM = accuracy_score(y_test,predictions7)*100
accSVM

In [None]:
# Histogram grid for the frame's columns, then enlarge the current figure to 15x15 inches.
db.hist(linewidth=1.2, edgecolor='black')
fig = plt.gcf()
fig.set_size_inches((15, 15))
plt.show()

In [None]:
# 2x2 grid of violin plots comparing a numeric column across the
# 'fraud_reported' groups. Each panel shows a different column; the bottom
# row no longer repeats the two plots from the top row.
plt.figure(figsize=(15,10))
violin_columns = ['age', 'incident_hour_of_the_day',
                  'months_as_customer', 'policy_annual_premium']
for position, column in enumerate(violin_columns, start=1):
    plt.subplot(2, 2, position)
    sns.violinplot(x='fraud_reported', y=column, data=db)

In [None]:
plt.figure(figsize=(20,20))
# Correlate only numeric/boolean columns: text columns have no correlation
# and make DataFrame.corr() raise on pandas >= 2.0.
numeric_db = db.select_dtypes(include=['number', 'bool'])
# Annotated heatmap of the pairwise correlation matrix of the claims data.
sns.heatmap(numeric_db.corr(), annot=True, cmap='cubehelix_r')
plt.show()

# Splitting The Data into Training And Testing Dataset

In [None]:
# Split the full table (features and target together) into two frames.
# test_size=0.3 puts 70% of the rows in `train` and 30% in `test`;
# random_state pins the shuffle so the split is the same on every run, in line
# with the seeded split used for the classifiers earlier in the notebook.
train, test = train_test_split(db, test_size=0.3, random_state=1)
print(train.shape)
print(test.shape)