In [114]:
# import all the tools we need


# Regular EDA libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns


# we want our plots to apear within notebooks
%matplotlib inline
pd.set_option('display.float_format', lambda x: '%.2f' % x)
sns.set_style("darkgrid")


# Models from scikit-learn & XGboost
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from xgboost import plot_importance
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2


# Model Evaluation libraries
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, make_scorer, accuracy_score
from sklearn.metrics import plot_roc_curve

In [115]:
!pip install plotly

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots



In [116]:
from bokeh.io import output_file,show,output_notebook,push_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource,HoverTool,CategoricalColorMapper
from bokeh.layouts import row,column,gridplot
from bokeh.models.widgets import Tabs,Panel
output_notebook()

In [117]:
df = pd.read_csv(r'C:\Users\isarachchand\Documents\git\apf\datasets\fraud\Vehicle_claim_insurance_fraud\data\insurance-fraud.csv')

In [118]:
df.head()

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy
0,Dec,5,Wednesday,Honda,Urban,Tuesday,Jan,1,Female,Single,...,3 years,26 to 30,No,No,External,none,1 year,3 to 4,1994,Liability
1,Jan,3,Wednesday,Honda,Urban,Monday,Jan,4,Male,Single,...,6 years,31 to 35,Yes,No,External,none,no change,1 vehicle,1994,Collision
2,Oct,5,Friday,Honda,Urban,Thursday,Nov,2,Male,Married,...,7 years,41 to 50,No,No,External,none,no change,1 vehicle,1994,Collision
3,Jun,2,Saturday,Toyota,Rural,Friday,Jul,1,Male,Married,...,more than 7,51 to 65,Yes,No,External,more than 5,no change,1 vehicle,1994,Liability
4,Jan,5,Monday,Honda,Urban,Tuesday,Feb,2,Female,Single,...,5 years,31 to 35,No,No,External,none,no change,1 vehicle,1994,Collision


In [119]:
fraud_df = df[df['FraudFound_P']==0].iloc[:923]
not_fraud_df = df[df['FraudFound_P']==1]
bigdata = fraud_df.append(not_fraud_df)
bigdata['FraudFound_P'].value_counts()

0    923
1    923
Name: FraudFound_P, dtype: int64

# EDA

In [120]:
bigdata.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1846 entries, 0 to 15419
Data columns (total 33 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Month                 1846 non-null   object
 1   WeekOfMonth           1846 non-null   int64 
 2   DayOfWeek             1846 non-null   object
 3   Make                  1846 non-null   object
 4   AccidentArea          1846 non-null   object
 5   DayOfWeekClaimed      1846 non-null   object
 6   MonthClaimed          1846 non-null   object
 7   WeekOfMonthClaimed    1846 non-null   int64 
 8   Sex                   1846 non-null   object
 9   MaritalStatus         1846 non-null   object
 10  Age                   1846 non-null   int64 
 11  Fault                 1846 non-null   object
 12  PolicyType            1846 non-null   object
 13  VehicleCategory       1846 non-null   object
 14  VehiclePrice          1846 non-null   object
 15  FraudFound_P          1846 non-null  

In [121]:
bigdata.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
WeekOfMonth,1846.0,2.78,1.29,1.0,2.0,3.0,4.0,5.0
WeekOfMonthClaimed,1846.0,2.67,1.26,1.0,2.0,3.0,4.0,5.0
Age,1846.0,38.57,14.07,0.0,30.0,37.0,47.0,80.0
FraudFound_P,1846.0,0.5,0.5,0.0,0.0,0.5,1.0,1.0
PolicyNumber,1846.0,3923.47,4652.26,1.0,462.25,923.5,7012.75,15420.0
RepNumber,1846.0,8.34,4.58,1.0,5.0,8.0,12.0,16.0
Deductible,1846.0,408.5,43.41,300.0,400.0,400.0,400.0,700.0
DriverRating,1846.0,2.53,1.12,1.0,2.0,3.0,4.0,4.0
Year,1846.0,1994.39,0.69,1994.0,1994.0,1994.0,1995.0,1996.0


In [122]:
bigdata.isna().sum()

Month                   0
WeekOfMonth             0
DayOfWeek               0
Make                    0
AccidentArea            0
DayOfWeekClaimed        0
MonthClaimed            0
WeekOfMonthClaimed      0
Sex                     0
MaritalStatus           0
Age                     0
Fault                   0
PolicyType              0
VehicleCategory         0
VehiclePrice            0
FraudFound_P            0
PolicyNumber            0
RepNumber               0
Deductible              0
DriverRating            0
Days_Policy_Accident    0
Days_Policy_Claim       0
PastNumberOfClaims      0
AgeOfVehicle            0
AgeOfPolicyHolder       0
PoliceReportFiled       0
WitnessPresent          0
AgentType               0
NumberOfSuppliments     0
AddressChange_Claim     0
NumberOfCars            0
Year                    0
BasePolicy              0
dtype: int64

In [123]:
#Categorical columns
cat_col = [col for col in bigdata.columns if bigdata[col].dtypes == "O"]
cat_col

['Month',
 'DayOfWeek',
 'Make',
 'AccidentArea',
 'DayOfWeekClaimed',
 'MonthClaimed',
 'Sex',
 'MaritalStatus',
 'Fault',
 'PolicyType',
 'VehicleCategory',
 'VehiclePrice',
 'Days_Policy_Accident',
 'Days_Policy_Claim',
 'PastNumberOfClaims',
 'AgeOfVehicle',
 'AgeOfPolicyHolder',
 'PoliceReportFiled',
 'WitnessPresent',
 'AgentType',
 'NumberOfSuppliments',
 'AddressChange_Claim',
 'NumberOfCars',
 'BasePolicy']

In [124]:
df_month = bigdata.groupby("Month").agg({"Month":"count"})
df_month.columns = ["Counts"]
df_month.reset_index(inplace=True)

fig = px.bar(df_month, x='Month', y='Counts', title="Number of accidents per month")
fig.show()

In [125]:


# Gender and marital status of the accident victims (consider with all years)
df_sex_maritalstatus = bigdata.groupby(["Sex", "MaritalStatus"]).agg({"Sex":"count"})
df_sex_maritalstatus.columns = ["Counts"]
df_sex_maritalstatus.reset_index(inplace=True)
print(df_sex_maritalstatus.head(10))


fig = px.bar(df_sex_maritalstatus, x="Sex", y="Counts",
             color='MaritalStatus', barmode='group',
             height=400,
            title = "Gender and marital status of the accident victims")
fig.show()


      Sex MaritalStatus  Counts
0  Female      Divorced       2
1  Female       Married     145
2  Female        Single      99
3  Female         Widow       4
4    Male      Divorced       2
5    Male       Married    1135
6    Male        Single     456
7    Male         Widow       3


In [126]:
# Number of accidents by Years and Months

df_year_month = bigdata.groupby(["Year", "Month"]).agg({"Year":"count"})
df_year_month.columns = ["Counts"]
df_year_month.reset_index(inplace=True)
print(df_year_month.head(10))

fig = px.bar(df_year_month, x="Year", y="Counts", color="Month", title="Number of accidents by Years and Months")

   Year Month  Counts
0  1994   Apr     125
1  1994   Aug     111
2  1994   Dec      71
3  1994   Feb     132
4  1994   Jan     182
5  1994   Jul      92
6  1994   Jun     127
7  1994   Mar     157
8  1994   May     129
9  1994   Nov      77


In [127]:
fig.show()

In [128]:
df_area_year = bigdata.groupby(["AccidentArea", "Year"]).agg({"AccidentArea":"count"})
df_area_year.columns = ["Counts"]
df_area_year.reset_index(inplace=True)
print(df_area_year.head(10))


fig = px.bar(df_area_year, x="Year", y="Counts",
             color='AccidentArea', barmode='group',
             height=400,
            title = "Regions where accidents occurred by year")
fig.show()

  AccidentArea  Year  Counts
0        Rural  1994     175
1        Rural  1995      42
2        Rural  1996      18
3        Urban  1994    1157
4        Urban  1995     259
5        Urban  1996     195


In [129]:


df_make_vehicle_age = bigdata.groupby(["Make", "AgeOfVehicle"]).agg({"Make":"count"})
df_make_vehicle_age.columns = ["Counts"]
df_make_vehicle_age.reset_index(inplace=True)
print(df_make_vehicle_age.head(10))
print("------------------------------------------------------------")
print("Car Brands: ")
print(df_make_vehicle_age["Make"].unique())


        Make AgeOfVehicle  Counts
0     Accura      5 years       3
1     Accura      6 years      13
2     Accura      7 years      37
3     Accura  more than 7      27
4        BMW      5 years       1
5  Chevrolet      3 years       1
6  Chevrolet      4 years       3
7  Chevrolet      5 years      12
8  Chevrolet      6 years      48
9  Chevrolet      7 years      66
------------------------------------------------------------
Car Brands: 
['Accura' 'BMW' 'Chevrolet' 'Dodge' 'Ford' 'Honda' 'Jaguar' 'Mazda'
 'Mecedes' 'Mercury' 'Nisson' 'Pontiac' 'Porche' 'Saab' 'Saturn' 'Toyota'
 'VW']


In [130]:


# Ages of vehicles involved in the accident by car brands  (consider with all years)

# Create subplots: use 'domain' type for Pie subplot
fig = make_subplots(rows=1, cols=3, specs=[[{'type':'domain'}, {'type':'domain'}, {'type':'domain'}]])
fig.add_trace(go.Pie(labels=df_make_vehicle_age["AgeOfVehicle"], 
                     values=df_make_vehicle_age.loc[df_make_vehicle_age["Make"] == "Accura"]["Counts"], 
                     name="Accura"),
              1, 1)
fig.add_trace(go.Pie(labels=df_make_vehicle_age["AgeOfVehicle"], 
                     values=df_make_vehicle_age.loc[df_make_vehicle_age["Make"] == "BMW"]["Counts"], 
                     name="BMW"),
              1, 2)
fig.add_trace(go.Pie(labels=df_make_vehicle_age["AgeOfVehicle"], 
                     values=df_make_vehicle_age.loc[df_make_vehicle_age["Make"] == "Chevrolet"]["Counts"], 
                     name="Chevrolet"),
              1, 3)

# Use `hole` to create a donut-like pie chart
fig.update_traces(hole=.4, hoverinfo="label+percent+name")

fig.update_layout(
    title_text="Ages of vehicles involved in the accident by car brands",
    # Add annotations in the center of the donut pies.
    annotations=[dict(text='Accura', x=0.12, y=0.5, font_size=10, showarrow=False),
                 dict(text='BMW', x=0.50, y=0.5, font_size=10, showarrow=False),
                dict(text='Chevrolet', x=0.89, y=0.5, font_size=10, showarrow=False)])
fig.show()



# Modelling

In [131]:
# We have 0 values only in Age column
print(bigdata['Age'].unique()==0)
len(bigdata[bigdata['Age']==0])

[False False False False False False False  True False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False]


65

In [132]:
# Make a copy of original dataframe for preprocessing
df_temp = bigdata.copy()
# Finding columns which contains strings
for labels, content in df_temp.items():
    if pd.api.types.is_string_dtype(content):
        print(labels)

Month
DayOfWeek
Make
AccidentArea
DayOfWeekClaimed
MonthClaimed
Sex
MaritalStatus
Fault
PolicyType
VehicleCategory
VehiclePrice
Days_Policy_Accident
Days_Policy_Claim
PastNumberOfClaims
AgeOfVehicle
AgeOfPolicyHolder
PoliceReportFiled
WitnessPresent
AgentType
NumberOfSuppliments
AddressChange_Claim
NumberOfCars
BasePolicy


In [133]:
# Now Let's Fill 0 value with median of the column
df_temp["Age"] = df_temp["Age"].replace(0,df["Age"].median())

In [134]:
# Let's Check if we have 0 in Age
print(df_temp['Age'].unique()==0)
len(df_temp[df_temp['Age']==0])

[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False]


0

In [135]:
# Now turn all string into categorical features
for labels, content in df_temp.items():
    if pd.api.types.is_string_dtype(content):
        df_temp[labels] = content.astype("category").cat.as_ordered()

In [136]:
# Let's Check how many columns changed into Category
df_temp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1846 entries, 0 to 15419
Data columns (total 33 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   Month                 1846 non-null   category
 1   WeekOfMonth           1846 non-null   int64   
 2   DayOfWeek             1846 non-null   category
 3   Make                  1846 non-null   category
 4   AccidentArea          1846 non-null   category
 5   DayOfWeekClaimed      1846 non-null   category
 6   MonthClaimed          1846 non-null   category
 7   WeekOfMonthClaimed    1846 non-null   int64   
 8   Sex                   1846 non-null   category
 9   MaritalStatus         1846 non-null   category
 10  Age                   1846 non-null   int64   
 11  Fault                 1846 non-null   category
 12  PolicyType            1846 non-null   category
 13  VehicleCategory       1846 non-null   category
 14  VehiclePrice          1846 non-null   category
 15  Fra

In [137]:
# Turn Categorical Features values into numeric
for labels, content in df_temp.items():
    if not pd.api.types.is_numeric_dtype(content):
        # Add binary column to indicate whether sample had missing values 
       #df_temp[labels+"_is_missing"] = pd.isnull(content) # Though we don't have but we could've used if we had missing values
        df_temp[labels] = pd.Categorical(content).codes

In [138]:
df_temp.head()

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy
0,2,5,6,5,1,5,4,1,0,2,...,1,3,0,0,0,3,0,2,1994,2
1,4,3,6,5,1,1,4,4,1,2,...,4,4,1,0,0,3,3,0,1994,1
2,10,5,0,5,1,4,9,2,1,1,...,5,6,0,0,0,3,3,0,1994,1
3,6,2,2,15,0,0,5,1,1,1,...,6,7,1,0,0,2,3,0,1994,2
4,4,5,1,5,1,5,3,2,0,2,...,3,4,0,0,0,3,3,0,1994,1


In [139]:
df_temp.describe()

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy
count,1846.0,1846.0,1846.0,1846.0,1846.0,1846.0,1846.0,1846.0,1846.0,1846.0,...,1846.0,1846.0,1846.0,1846.0,1846.0,1846.0,1846.0,1846.0,1846.0,1846.0
mean,5.39,2.78,2.78,8.36,0.87,3.23,5.45,2.67,0.86,1.31,...,4.76,4.75,0.02,0.0,0.01,2.04,2.86,0.11,1994.39,0.77
std,3.31,1.29,2.04,4.67,0.33,2.29,3.32,1.26,0.34,0.47,...,1.21,1.51,0.13,0.07,0.09,1.13,0.52,0.42,0.69,0.71
min,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1994.0,0.0
25%,3.0,2.0,1.0,5.0,1.0,1.0,3.0,2.0,1.0,1.0,...,4.0,4.0,0.0,0.0,0.0,1.0,3.0,0.0,1994.0,0.0
50%,5.0,3.0,3.0,7.0,1.0,4.0,6.0,3.0,1.0,1.0,...,5.0,5.0,0.0,0.0,0.0,2.0,3.0,0.0,1994.0,1.0
75%,8.0,4.0,5.0,11.0,1.0,5.0,8.0,4.0,1.0,2.0,...,6.0,6.0,0.0,0.0,0.0,3.0,3.0,0.0,1995.0,1.0
max,11.0,5.0,6.0,16.0,1.0,6.0,11.0,5.0,1.0,3.0,...,7.0,8.0,1.0,1.0,1.0,3.0,4.0,4.0,1996.0,2.0


In [140]:
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier

In [141]:
# Split data into X & y
X_1 = df_temp.drop("FraudFound_P",axis = 1)
y_1 = df_temp["FraudFound_P"]

In [142]:
# Setup random seed for reproduction
np.random.seed(42)

# Split data into train and test dataset
X_train, X_test, y_train, y_test = train_test_split(X_1,y_1,
                                                    test_size=0.2)


In [143]:
learning_rates = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]

accuracy = 0

for learning_rate in learning_rates:
    gb_clf = GradientBoostingClassifier(n_estimators=20, learning_rate=learning_rate, max_features=2, max_depth=2, random_state=0)
    gb_clf.fit(X_train, y_train)

    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(gb_clf.score(X_train, y_train)))
    print("Accuracy score (validation): {0:.3f}".format(gb_clf.score(X_test, y_test)))
    
    if gb_clf.score(X_test, y_test) > accuracy:
        best_learning_rate = learning_rate
    
    accuracy = gb_clf.score(X_test, y_test)

print("\nBest learning rate: ", best_learning_rate)



Learning rate:  0.05
Accuracy score (training): 0.958
Accuracy score (validation): 0.957
Learning rate:  0.075
Accuracy score (training): 0.958
Accuracy score (validation): 0.957
Learning rate:  0.1
Accuracy score (training): 0.959
Accuracy score (validation): 0.957
Learning rate:  0.25
Accuracy score (training): 0.961
Accuracy score (validation): 0.957
Learning rate:  0.5
Accuracy score (training): 0.961
Accuracy score (validation): 0.959
Learning rate:  0.75
Accuracy score (training): 0.963
Accuracy score (validation): 0.965
Learning rate:  1
Accuracy score (training): 0.969
Accuracy score (validation): 0.957

Best learning rate:  0.75


Determine the optimum number of trees for this learning rate


In [144]:
number_of_trees = np.arange(20, 81, 10)
accuracy = 0


for n_estimator in number_of_trees:
    gb_clf = GradientBoostingClassifier(n_estimators=n_estimator, learning_rate=best_learning_rate, max_features=2, max_depth=2, random_state=0)
    gb_clf.fit(X_train, y_train)

    print("Number of trees: ", n_estimator)
    print("Accuracy score (training): {0:.3f}".format(gb_clf.score(X_train, y_train)))
    print("Accuracy score (validation): {0:.3f}".format(gb_clf.score(X_test, y_test)))
    
    if gb_clf.score(X_test, y_test) > accuracy:
        optimum_estimators = n_estimator
    
    accuracy = gb_clf.score(X_test, y_test)
    
print("\nOptimum number of trees: ", optimum_estimators)
    


Number of trees:  20
Accuracy score (training): 0.963
Accuracy score (validation): 0.965
Number of trees:  30
Accuracy score (training): 0.972
Accuracy score (validation): 0.965
Number of trees:  40
Accuracy score (training): 0.982
Accuracy score (validation): 0.970
Number of trees:  50
Accuracy score (training): 0.986
Accuracy score (validation): 0.970
Number of trees:  60
Accuracy score (training): 0.990
Accuracy score (validation): 0.968
Number of trees:  70
Accuracy score (training): 0.992
Accuracy score (validation): 0.970
Number of trees:  80
Accuracy score (training): 0.993
Accuracy score (validation): 0.973

Optimum number of trees:  80


RandomForesClassifier has the highest accuracy so we will move forward with that model

In [145]:
maximum_depth = [1, 2, 3, 4, 5, 6, 7, 8, 9]

accuracy = 0


for max_depth in maximum_depth:
    gb_clf = GradientBoostingClassifier(n_estimators=optimum_estimators, learning_rate=best_learning_rate, max_features=2, max_depth=max_depth, random_state=0)
    gb_clf.fit(X_train, y_train)

    print("Maximum depth: ", max_depth)
    print("Accuracy score (training): {0:.3f}".format(gb_clf.score(X_train, y_train)))
    print("Accuracy score (validation): {0:.3f}".format(gb_clf.score(X_test, y_test)))
    
    if gb_clf.score(X_test, y_test) > accuracy:
        optimum_depth = max_depth
    
    accuracy = gb_clf.score(X_test, y_test)
    
print("\nOptimum number of trees: ", optimum_depth)

Maximum depth:  1
Accuracy score (training): 0.977
Accuracy score (validation): 0.965
Maximum depth:  2
Accuracy score (training): 0.993
Accuracy score (validation): 0.973
Maximum depth:  3
Accuracy score (training): 1.000
Accuracy score (validation): 0.973
Maximum depth:  4
Accuracy score (training): 1.000
Accuracy score (validation): 0.957
Maximum depth:  5
Accuracy score (training): 1.000
Accuracy score (validation): 0.965
Maximum depth:  6
Accuracy score (training): 1.000
Accuracy score (validation): 0.959
Maximum depth:  7
Accuracy score (training): 1.000
Accuracy score (validation): 0.968
Maximum depth:  8
Accuracy score (training): 1.000
Accuracy score (validation): 0.965
Maximum depth:  9
Accuracy score (training): 1.000
Accuracy score (validation): 0.965

Optimum number of trees:  7


In [149]:
gb_clf = GradientBoostingClassifier(n_estimators=optimum_estimators, learning_rate=best_learning_rate, max_features=2, max_depth=max_depth, random_state=0)
gb_clf.fit(X_train, y_train)
predictions = gb_clf.predict(X_test)

In [150]:
final_df = X_test.copy()

In [151]:
final_df['Predictions'] = predictions

In [152]:
final_df['Actual'] = y_test

In [153]:
final_df

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy,Predictions,Actual
10392,3,4,1,7,0,6,7,1,1,2,...,0,0,0,3,3,0,1995,1,1,1
3598,8,2,4,15,1,5,8,4,1,1,...,0,0,0,0,3,0,1994,0,1,1
858,1,4,1,11,1,5,1,4,1,2,...,0,0,0,2,3,0,1994,0,0,0
9808,3,1,4,15,1,1,3,1,1,1,...,0,0,0,3,3,0,1995,0,1,1
6087,11,1,0,15,1,5,11,2,1,1,...,0,0,0,3,3,0,1994,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
263,0,1,2,15,1,5,0,1,1,1,...,0,0,0,0,3,0,1994,1,0,0
812,8,4,0,2,1,5,5,3,1,1,...,0,0,0,3,3,0,1994,1,0,0
962,0,1,1,11,1,5,0,1,1,1,...,0,0,0,3,3,0,1994,1,0,0
759,7,1,0,2,1,1,7,1,0,2,...,0,0,0,3,3,0,1994,1,0,0


# Convert data to CSV

In [154]:
df_insurance_fraud = pd.DataFrame(final_df)

In [155]:
file_name = r'C:\Users\isarachchand\Documents\git\apf\output\fraud\predict_insurance_fraud\vehicle_fraud_predictions.csv'
df_insurance_fraud.to_csv(file_name,encoding='utf-8', index=False)