In [0]:
import pandas as pd
from sklearn.preprocessing import label_binarize
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix

import warnings
warnings.filterwarnings("ignore")

from google.colab import drive
drive.mount('/gdrive')

%cd /gdrive


In [16]:
trainfile = r'/gdrive/My Drive/assignment2/Insurance Fraud - TRAIN-3000.csv'
trainData = pd.read_csv(trainfile)
testfile = r'/gdrive/My Drive/assignment2/Insurance Fraud -TEST-12900.csv'
testData = pd.read_csv(testfile)

print(trainData.shape)
print(testData.shape)

trainData.head()
testData.head()

(2999, 32)
(12918, 32)


Unnamed: 0,MONTH,WEEKOFMONTH,DAYOFWEEK,MAKE,ACCIDENTAREA,DAYOFWEEKCLAIMED,MONTHCLAIMED,WEEKOFMONTHCLAIMED,SEX,MARITALSTATUS,AGE,FAULT,POLICYTYPE,VEHICLECATEGORY,VEHICLEPRICE,REPNUMBER,DEDUCTIBLE,DRIVERRATING,DAYS_POLICY_ACCIDENT,DAYS_POLICY_CLAIM,PASTNUMBEROFCLAIMS,AGEOFVEHICLE,AGEOFPOLICYHOLDER,POLICEREPORTFILED,WITNESSPRESENT,AGENTTYPE,NUMBEROFSUPPLIMENTS,ADDRESSCHANGE_CLAIM,NUMBEROFCARS,YEAR,BASEPOLICY,FRAUDFOUND
0,Jul,3,Sunday,Honda,Rural,Wednesday,Jan,4,Male,Married,21,Policy_Holder,Sport-Collision,Sport,more_than_69000,4,400,4,more_than_30,more_than_30,none,4_years,26_to_30,No,No,External,3_to_5,no_change,1-vehicle,1994,Collision,Yes
1,Nov,5,Monday,Mazda,Urban,Wednesday,Dec,1,Male,Single,68,Policy_Holder,Sedan-All_Perils,Sedan,20000_to_29000,9,400,3,more_than_30,more_than_30,2_to_4,5_years,over_65,No,No,External,none,no_change,1-vehicle,1994,All_Perils,Yes
2,Jan,1,Monday,Pontiac,Urban,Wednesday,Jan,1,Male,Married,50,Policy_Holder,Sedan-All_Perils,Sedan,20000_to_29000,8,400,2,more_than_30,more_than_30,none,7_years,41_to_50,No,No,External,none,under_6_months,1-vehicle,1994,All_Perils,Yes
3,Dec,1,Monday,Toyota,Rural,Tuesday,May,3,Male,Married,39,Policy_Holder,Sedan-All_Perils,Sedan,30000_to_39000,1,400,3,more_than_30,more_than_30,none,more_than_7,36_to_40,No,No,External,more_than_5,under_6_months,2-vehicles,1994,All_Perils,Yes
4,Dec,5,Wednesday,Pontiac,Urban,Wednesday,Jan,1,Male,Single,43,Policy_Holder,Sedan-Collision,Sedan,40000_to_59000,1,400,4,more_than_30,more_than_30,2_to_4,7_years,36_to_40,No,No,External,more_than_5,no_change,1-vehicle,1994,Collision,Yes


In [18]:
trainData.columns

Index(['WEEKOFMONTH', 'WEEKOFMONTHCLAIMED', 'AGE', 'REPNUMBER', 'DEDUCTIBLE',
       'DRIVERRATING', 'YEAR', 'FRAUDFOUND', 'MONTH_Apr', 'MONTH_Aug',
       ...
       'ADDRESSCHANGE_CLAIM_no_change', 'ADDRESSCHANGE_CLAIM_under_6_months',
       'NUMBEROFCARS_1-vehicle', 'NUMBEROFCARS_2-vehicles',
       'NUMBEROFCARS_3_to_4', 'NUMBEROFCARS_5_to_8',
       'NUMBEROFCARS_more_than_8', 'BASEPOLICY_All_Perils',
       'BASEPOLICY_Collision', 'BASEPOLICY_Liability'],
      dtype='object', length=145)

In [0]:
categoricalFeatures = ['MONTH', 'DAYOFWEEK', 'MAKE', 'ACCIDENTAREA',
       'DAYOFWEEKCLAIMED', 'MONTHCLAIMED', 'SEX', 'MARITALSTATUS', 'FAULT', 'POLICYTYPE', 'VEHICLECATEGORY',
       'VEHICLEPRICE', 'DAYS_POLICY_ACCIDENT', 'DAYS_POLICY_CLAIM', 'PASTNUMBEROFCLAIMS',
       'AGEOFVEHICLE', 'AGEOFPOLICYHOLDER', 'POLICEREPORTFILED',
       'WITNESSPRESENT', 'AGENTTYPE', 'NUMBEROFSUPPLIMENTS',
       'ADDRESSCHANGE_CLAIM', 'NUMBEROFCARS', 'BASEPOLICY']
combined_Data = pd.concat([trainData,testData], keys=[0,1])
combined_Data = pd.get_dummies(combined_Data,columns=categoricalFeatures)
trainData = combined_Data.xs(0)
testData = combined_Data.xs(1)
y_train = trainData["FRAUDFOUND"]
X_train = trainData.drop(["FRAUDFOUND"], axis=1)
y_test = testData["FRAUDFOUND"]
X_test = testData.drop(["FRAUDFOUND"], axis=1)


In [19]:
#Decision Tree Classifier
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
clf_predict=clf.predict(X_test)

print("accuracy Score (training) for Decision Tree:{0:6f}".format(clf.score(X_test,y_test)))
print("Confusion Matrix for Decision Tree")
print(confusion_matrix(y_test,clf_predict))
print(classification_report(y_test,clf_predict))

accuracy Score (training) for Decision Tree:0.882257
Confusion Matrix for Decision Tree
[[10946  1474]
 [   47   451]]
              precision    recall  f1-score   support

          No       1.00      0.88      0.94     12420
         Yes       0.23      0.91      0.37       498

    accuracy                           0.88     12918
   macro avg       0.62      0.89      0.65     12918
weighted avg       0.97      0.88      0.91     12918



In [20]:
#Hyperparameter tuning done for decision tree classifier
#random search
print("RandomizedSearchCV-Decision tree")
parameters={'max_depth': range(5,50,5),'max_leaf_nodes': range(10,50,10),'criterion':['gini','entropy'],'min_samples_split': range(5,55,10)}
clf_random = RandomizedSearchCV(clf,parameters,n_iter=15,cv=5)
clf_random.fit(X_train, y_train)
grid_parm=clf_random.best_params_
print(grid_parm)
#Now do grid search
print("GridSearchCV-Decision tree")
clf_grid = GridSearchCV(clf,parameters)
clf_grid.fit(X_train, y_train)
grid_parm1=clf_grid.best_params_
print(grid_parm1)

RandomizedSearchCV-Decision tree
{'min_samples_split': 45, 'max_leaf_nodes': 30, 'max_depth': 5, 'criterion': 'entropy'}
GridSearchCV-Decision tree
{'criterion': 'gini', 'max_depth': 5, 'max_leaf_nodes': 10, 'min_samples_split': 5}


In [0]:
#Construct Decision Trees using the best parameters
clf = DecisionTreeClassifier(**grid_parm)
clfr = DecisionTreeClassifier(**grid_parm1)

clf.fit(X_train,y_train)
clf_predict = clf.predict(X_test)
clfr.fit(X_train,y_train)
clfr_predict = clfr.predict(X_test)

In [9]:
#Obtain accuracy, confusion matrix, classification report and AUC values
print("accuracy Score (training) after hypertuning randomized search for Decision Tree:{0:6f}".format(clf.score(X_test,y_test)))
print("accuracy Score (training) after hypertuning grid search for Decision Tree:{0:6f}".format(clfr.score(X_test,y_test)))
print("Confusion Matrix after hypertuning for Decision Tree")
print(confusion_matrix(y_test,clf_predict))
print("=== Classification Report ===")
print(classification_report(y_test,clf_predict))

clf_cv_score = cross_val_score(clf, X_train, y_train, cv=10, scoring="balanced_accuracy")
print(clf_cv_score)

print(confusion_matrix(y_test,clfr_predict))
print("=== Classification Report ===")
print(classification_report(y_test,clfr_predict))

clfr_cv_score = cross_val_score(clfr, X_train, y_train, cv=10, scoring="balanced_accuracy")
print(clfr_cv_score)


accuracy Score (training) after hypertuning randomized search for Decision Tree:0.929246
accuracy Score (training) after hypertuning grid search for Decision Tree:0.929633
Confusion Matrix after hypertuning for Decision Tree
[[11865   555]
 [  359   139]]
=== Classification Report ===
              precision    recall  f1-score   support

          No       0.97      0.96      0.96     12420
         Yes       0.20      0.28      0.23       498

    accuracy                           0.93     12918
   macro avg       0.59      0.62      0.60     12918
weighted avg       0.94      0.93      0.93     12918

[0.67980769 0.76538462 0.84519231 0.65       0.62307692 0.525
 0.49807692 0.5        0.57115385 0.6025641 ]
[[11870   550]
 [  359   139]]
=== Classification Report ===
              precision    recall  f1-score   support

          No       0.97      0.96      0.96     12420
         Yes       0.20      0.28      0.23       498

    accuracy                           0.93     12918


In [10]:
#Normal randomforest

rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
rfc_predict=rfc.predict(X_test)
print("accuracy Score (training) for RandomForest:{0:6f}".format(rfc.score(X_test,y_test)))
print("Confusion Matrix for Random Forest:")
print(confusion_matrix(y_test,rfc_predict))
print(classification_report(y_test,rfc_predict))



accuracy Score (training) for RandomForest:0.949837
Confusion Matrix for Random Forest:
[[11885   535]
 [  113   385]]
              precision    recall  f1-score   support

          No       0.99      0.96      0.97     12420
         Yes       0.42      0.77      0.54       498

    accuracy                           0.95     12918
   macro avg       0.70      0.87      0.76     12918
weighted avg       0.97      0.95      0.96     12918



In [13]:
rand_parameters={'criterion':['gini','entropy'],'max_depth': range(5,50,5),'min_samples_leaf': range(4,24,4),'max_features':[2,3,4]}
#random search with cross-validation
print("RandomizedSearchCV-Decision tree")
rfc_random = RandomizedSearchCV(rfc,rand_parameters,n_iter=30,cv=10)
rfc_random.fit(X_train, y_train)
grid_parm_rfc=rfc_random.best_params_
print(grid_parm_rfc)
#grid search
print("GridSearchCV-Decision tree")
rfc_grid = GridSearchCV(rfc,rand_parameters)
rfc_grid.fit(X_train, y_train)
grid_parm_rfc1=rfc_grid.best_params_
print(grid_parm_rfc1)


RandomizedSearchCV-Decision tree
{'min_samples_leaf': 4, 'max_features': 4, 'max_depth': 30, 'criterion': 'entropy'}
GridSearchCV-Decision tree
{'criterion': 'gini', 'max_depth': 30, 'max_features': 4, 'min_samples_leaf': 4}


In [14]:
rfc= RandomForestClassifier(**grid_parm_rfc)
rfc.fit(X_train,y_train)
rfc_predict = rfc.predict(X_test)
rfcr= RandomForestClassifier(**grid_parm_rfc1)
rfcr.fit(X_train,y_train)
rfcr_predict = rfcr.predict(X_test)
print("accuracy Score (training) after hypertuning random search for Random Forest:{0:6f}".format(rfc.score(X_test,y_test)))
print("accuracy Score (training) after hypertuning grid search for Random Forest:{0:6f}".format(rfcr.score(X_test,y_test)))
print("Confusion Matrix after hypertuning for Random Forest:")
print(confusion_matrix(y_test,rfc_predict))
print("=== Classification Report ===")
print(classification_report(y_test,rfc_predict))
rfc_cv_score = cross_val_score(rfc, X_train, y_train, cv=10)
print(rfc_cv_score)

print(confusion_matrix(y_test,rfcr_predict))
print("=== Classification Report ===")
print(classification_report(y_test,rfcr_predict))
rfcr_cv_score = cross_val_score(rfcr, X_train, y_train, cv=10)
print(rfcr_cv_score)


accuracy Score (training) after hypertuning random search for Random Forest:0.959901
accuracy Score (training) after hypertuning grid search for Random Forest:0.960985
Confusion Matrix after hypertuning for Random Forest:
[[12359    61]
 [  457    41]]
=== Classification Report ===
              precision    recall  f1-score   support

          No       0.96      1.00      0.98     12420
         Yes       0.40      0.08      0.14       498

    accuracy                           0.96     12918
   macro avg       0.68      0.54      0.56     12918
weighted avg       0.94      0.96      0.95     12918

[0.87       0.867      0.86686687]
[[12387    33]
 [  471    27]]
=== Classification Report ===
              precision    recall  f1-score   support

          No       0.96      1.00      0.98     12420
         Yes       0.45      0.05      0.10       498

    accuracy                           0.96     12918
   macro avg       0.71      0.53      0.54     12918
weighted avg       0.9