In [39]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import confusion_matrix,precision_score,recall_score,f1_score,roc_auc_score,roc_curve
import matplotlib.pyplot as plt

In [6]:
insurance_df=pd.read_csv('insurance_claims.csv')
insurance_df
insurance_df.shape
y=insurance_df['fraud_reported']
y
insurance_df.drop(columns='fraud_reported',inplace=True)
insurance_df.describe()
insurance_df.corr()

In [14]:
#data cleaning
insurance_df=insurance_df.replace('?',np.NaN)
insurance_df.isnull().sum()


In [22]:
# missing value treatment using fillna

# we will replace the '?' by the most common collision type as we are unaware of the type.
insurance_df['collision_type'].fillna(insurance_df['collision_type'].mode()[0], inplace = True)


# It may be the case that there are no responses for property damage then we might take it as No property damage.
insurance_df['property_damage'].fillna('NO', inplace = True)

# again, if there are no responses fpr police report available then we might take it as No report available
insurance_df['police_report_available'].fillna('NO', inplace = True)

insurance_df.isnull().sum()
insurance_df.isnull().any()
insurance_df.isnull().any().any()

False

In [30]:
# Train Test split
x_train,x_test,y_train,y_test=train_test_split(insurance_df,y,random_state=99,test_size=0.2)
con_cols=[col for col in insurance_df.columns if insurance_df[col].dtype=='int64' or insurance_df[col].dtype=='float64']
cat_cols=[col for col in insurance_df.columns if insurance_df[col].dtype=='object']

In [31]:
# Filling of missing values
for col in con_cols:
    x_train[col].fillna(x_train[col].mean(),inplace=True)
    x_test[col].fillna(x_train[col].mean(),inplace=True)
    
for col in cat_cols:
    x_train[col].fillna(x_train[col].mode()[0],inplace=True)
    x_test[col].fillna(x_train[col].mode()[0],inplace=True)

In [32]:
# Scaling of Variables

scaler=StandardScaler()

for col in con_cols:
    x_train[col]=scaler.fit_transform(np.array(x_train[col]).reshape(-1,1))
    x_test[col]=scaler.transform(np.array(x_test[col]).reshape(-1,1))

In [33]:
# One Hot Encoding of Categorical Variables
cat_encd_train=pd.get_dummies(x_train[cat_cols])
cat_encd_test=pd.get_dummies(x_test[cat_cols])

In [34]:
#aligning train & test data one hot encoded catg columns due to unqual no of columns
cat_encd_train_final,cat_encd_test_final=cat_encd_train.align(cat_encd_test,join='inner',axis=1)
cat_encd_test_final

Unnamed: 0,policy_bind_date_03-01-2004,policy_bind_date_04-05-2000,policy_bind_date_04-06-2000,policy_bind_date_05-08-1992,policy_bind_date_14-07-1997,policy_bind_date_14-12-1991,policy_bind_date_16-05-2008,policy_bind_date_20-07-1991,policy_bind_date_24-06-1990,policy_bind_date_25-05-1990,...,auto_model_Pathfinder,auto_model_RAM,auto_model_RSX,auto_model_Silverado,auto_model_TL,auto_model_Tahoe,auto_model_Ultima,auto_model_Wrangler,auto_model_X5,auto_model_X6
890,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
983,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
107,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
609,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
113,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
780,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
561,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
544,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
846,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
# Concatenating continous an categorical columns 
x_train_final=pd.concat([x_train[con_cols],cat_encd_train_final],axis=1)
x_test_final=pd.concat([x_test[con_cols],cat_encd_test_final],axis=1)

In [36]:
logreg=LogisticRegression()
logreg.fit(x_train_final,y_train)

LogisticRegression()

In [37]:
y_test_pred=logreg.predict(x_test_final)

In [40]:
dtree=DecisionTreeClassifier()
dtree.fit(x_train_final,y_train)

DecisionTreeClassifier()

In [42]:
dtree_test_pred=dtree.predict(x_test_final)
dtree_test_pred

array(['N', 'N', 'Y', 'N', 'Y', 'N', 'Y', 'N', 'N', 'N', 'N', 'Y', 'N',
       'N', 'N', 'Y', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'Y', 'N', 'N',
       'N', 'N', 'Y', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'Y', 'N',
       'N', 'N', 'N', 'N', 'N', 'N', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'N',
       'N', 'N', 'N', 'N', 'Y', 'N', 'Y', 'N', 'N', 'N', 'N', 'Y', 'N',
       'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'Y', 'Y', 'N', 'N', 'N',
       'N', 'N', 'N', 'Y', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'Y',
       'N', 'N', 'N', 'N', 'N', 'Y', 'N', 'Y', 'N', 'Y', 'N', 'N', 'N',
       'N', 'N', 'N', 'N', 'N', 'N', 'N', 'Y', 'N', 'N', 'N', 'N', 'N',
       'N', 'N', 'N', 'Y', 'N', 'N', 'Y', 'N', 'N', 'N', 'Y', 'N', 'Y',
       'N', 'N', 'N', 'N', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'N', 'N', 'Y',
       'N', 'N', 'N', 'N', 'Y', 'N', 'N', 'N', 'N', 'Y', 'N', 'N', 'Y',
       'N', 'N', 'Y', 'N', 'N', 'N', 'N', 'N', 'Y', 'N', 'Y', 'N', 'N',
       'N', 'N', 'N', 'N', 'N', 'N', 'Y', 'Y', 'N', 'N', 'N', 'N