# Question One - Fraud Detection 

##### Import Required Libraries

In [1]:
# Reading / Writing Files
import pandas as pd

# Preprocessing
from sklearn.model_selection import train_test_split

# Encoding
from sklearn.preprocessing import OneHotEncoder

# Models / Functions / Ensembles
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Accuracy Metrics
from sklearn.metrics import classification_report, confusion_matrix

# Miscellaneous
import time
import warnings
warnings.filterwarnings("ignore")

##### Load in Data, Create Dataframes and Perform Basic Data Cleaning & Operations

In [2]:
insurance_fraud_train_df = pd.read_csv('data/assignment_two/insurance_fraud_train.csv').dropna(axis='columns', how='all')
insurance_fraud_test_df = pd.read_csv('data/assignment_two/insurance_fraud_test.csv').dropna(axis='columns', how='all')

In [3]:
print(f"Observations in Train DF: {len(insurance_fraud_train_df)}.")
print(f"Observations in Test DF: {len(insurance_fraud_test_df)}.")

# Peculiar how the Train DF is like 19% the size of all the data, our model isn't going to be trained very well..
# I'm not going to take some creative freedom on this HW..

insurance_fraud_train_df.head(1)

Observations in Train DF: 2999.
Observations in Test DF: 12918.


Unnamed: 0,MONTH,WEEKOFMONTH,DAYOFWEEK,MAKE,ACCIDENTAREA,DAYOFWEEKCLAIMED,MONTHCLAIMED,WEEKOFMONTHCLAIMED,SEX,MARITALSTATUS,...,AGEOFPOLICYHOLDER,POLICEREPORTFILED,WITNESSPRESENT,AGENTTYPE,NUMBEROFSUPPLIMENTS,ADDRESSCHANGE_CLAIM,NUMBEROFCARS,YEAR,BASEPOLICY,FRAUDFOUND
0,Jul,3,Sunday,Honda,Rural,Wednesday,Jan,4,Male,Married,...,26_to_30,No,No,External,3_to_5,no_change,1-vehicle,1994,Collision,Yes


In [4]:
concat_df = pd.concat([insurance_fraud_train_df,insurance_fraud_test_df])
concat_df['FRAUDFOUND'] = concat_df['FRAUDFOUND'].str.replace('No','0').replace('Yes','1')

# Clearly based on the information above, "One-Hot-Encoding" Will Be Necessary..
# Pandas has gotten significantly better at encoding (since Python 3+)
# Thus, won't even need to use Sklearn's OneHotEncoding function, just use pd.get_dummies
# See below

encoding_columns = concat_df.keys().tolist()

In [5]:
# Train / Test Split using Sklearn's function.
X = concat_df.drop('FRAUDFOUND',axis=1)
X_encoded = pd.get_dummies(X, columns=X.keys().tolist())
y = concat_df[['FRAUDFOUND']]

X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, train_size=0.80, random_state=42)

## Create a Model

#### Sklearn Basic Decision Tree Model

In [6]:
decision_tree_model = DecisionTreeClassifier()
decision_tree_model.fit(X_train, y_train)
X_test_prediction = decision_tree_model.predict(X_test)

print(f"Accuracy Score for Basic Decision Tree: {round(decision_tree_model.score(X_test,y_test),6)}")

print("Confusion Matrix for Decision Tree:")
print(confusion_matrix(y_test,X_test_prediction))

Accuracy Score for Basic Decision Tree: 0.957286
Confusion Matrix for Decision Tree:
[[2911   85]
 [  51  137]]


#####  Hyperparameter Tuning an Sklearn Decision Tree Model

In [7]:
parameters = {
    'min_samples_split' : [i*50 for i in range(1,25)],
    'criterion' : ['gini','entropy'],
    'max_leaf_nodes' : [i*2 for i in range(1,10)]
}

In [8]:
# Grid Search
decision_tree_model_grid = GridSearchCV(decision_tree_model,parameters,verbose=True)
decision_tree_model_grid.fit(X_train, y_train)

grid_parameter_soln = decision_tree_model_grid.best_params_
print(f"Grid Search Optimal Parameters: {grid_parameter_soln}")

Fitting 3 folds for each of 432 candidates, totalling 1296 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Grid Search Optimal Parameters: {'criterion': 'entropy', 'max_leaf_nodes': 16, 'min_samples_split': 250}


[Parallel(n_jobs=1)]: Done 1296 out of 1296 | elapsed:  2.0min finished


In [9]:
# Random Search
decision_tree_model_random = RandomizedSearchCV(decision_tree_model,parameters,n_iter=100,cv=5,verbose=True)
decision_tree_model_random.fit(X_train, y_train)
random_parameter_soln = decision_tree_model_random.best_params_
print(f"Grid Search Optimal Parameters: {random_parameter_soln}")

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 100 candidates, totalling 500 fits
Grid Search Optimal Parameters: {'min_samples_split': 400, 'max_leaf_nodes': 8, 'criterion': 'gini'}


[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:   55.9s finished


##### Using Models with Hyperparameter Solutions

In [10]:
# Grid
hyperparameter_grid = DecisionTreeClassifier(**grid_parameter_soln)

hyperparameter_grid.fit(X_train,y_train)
hyperparameter_grid_results = hyperparameter_grid.predict(X_test)

In [11]:
print(f"Accuracy Score After Hypertuning Grid Search for Decision Tree: {round(hyperparameter_grid.score(X_test,y_test),6)}")

Accuracy Score After Hypertuning Grid Search for Decision Tree: 0.941897


In [12]:
# Random
hyperparameter_random = DecisionTreeClassifier(**random_parameter_soln)

hyperparameter_random.fit(X_train,y_train)
hyperparameter_random_results = hyperparameter_random.predict(X_test)

In [13]:
print(f"Accuracy Score After Hypertuning Random Search for Decision Tree: {round(hyperparameter_random.score(X_test,y_test),6)}")

Accuracy Score After Hypertuning Random Search for Decision Tree: 0.941269


Hyperparameter tuning with Grid Search produced marginally better results than random search results.
This is to be expected, as Grid Search will produce more accurate results at the expense of additional resources (time). Let's see the results of Hyperparameter Tuning a Decision Tree using Grid Search.

In [14]:
print("Confusion Matrix After Hypertuning for Decision Tree")
print(confusion_matrix(y_test,hyperparameter_grid_results))

Confusion Matrix After Hypertuning for Decision Tree
[[2984   12]
 [ 173   15]]


In [15]:
print("=== CLASSIFICATION REPORT ===")
print(classification_report(y_test,hyperparameter_grid_results))

=== CLASSIFICATION REPORT ===
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      2996
           1       0.56      0.08      0.14       188

    accuracy                           0.94      3184
   macro avg       0.75      0.54      0.55      3184
weighted avg       0.92      0.94      0.92      3184



##### Cross Validation for Hyperparameterized Grid Search

In [16]:
classifier_cross_validation_score = cross_val_score(hyperparameter_grid, X_train, y_train, cv=25, scoring="balanced_accuracy")
print(f"Cross Validation for Grid Search:\n {classifier_cross_validation_score}")

Cross Validation for Grid Search:
 [0.54756613 0.53448276 0.51724138 0.55068464 0.53448276 0.53448276
 0.51620188 0.51308338 0.55068464 0.497921   0.51785714 0.55253193
 0.51577814 0.53571429 0.55149243 0.53467478 0.51785714 0.55357143
 0.55149243 0.54837392 0.53363528 0.53363528 0.53467478 0.55149243
 0.51369048]


##### Sklearn Basic Random Forest Model

In [17]:
random_parameters = {
    'min_samples_leaf' : [i*2 for i in range(1,5)],
    'max_depth': [i*5 for i in range(1,5)],
    'max_features':[i*10 for i in range(2,7)],
    'n_estimators':[i*2 for i in range(1,5)]
}

In [18]:
random_forest = RandomForestClassifier()

In [19]:
random_forest.fit(X_train, y_train)
random_forest_soln = random_forest.predict(X_test)

In [20]:
print(f"Accuracy Score for Default Random Forest: {round(random_forest.score(X_test,y_test),6)}")

Accuracy Score for Default Random Forest: 0.97299


In [21]:
print("Confusion Matrix for Default Random Forest:")
print(confusion_matrix(y_test,random_forest_soln))

Confusion Matrix for Default Random Forest:
[[2994    2]
 [  84  104]]


##### Random Forest Grid Search with Cross-Validation

In [22]:
random_forest_grid = GridSearchCV(random_forest,random_parameters,verbose=True)

random_forest_grid.fit(X_train,y_train)
random_forest_grid_soln = random_forest_grid.predict(X_test)

print(random_forest_grid.best_params_)

Fitting 3 folds for each of 320 candidates, totalling 960 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 960 out of 960 | elapsed:  2.1min finished


{'max_depth': 20, 'max_features': 60, 'min_samples_leaf': 2, 'n_estimators': 6}


In [23]:
print(f"Accuracy Score After Hypertuning Grid Search for Decision Tree: {round(random_forest_grid.score(X_test,y_test),6)}")

Accuracy Score After Hypertuning Grid Search for Decision Tree: 0.951005


##### Random Forest Random Search

In [24]:
random_forest_random = RandomizedSearchCV(random_forest,random_parameters,n_iter=10,cv=5)

In [25]:
random_forest_random.fit(X_train, y_train)
random_forest_random_soln = random_forest_random.best_params_

In [26]:
print(random_forest_random_soln)
print(f"Accuracy Score for Random Forest with Random Search: {round(random_forest_random.score(X_test,y_test),6)}")

{'n_estimators': 6, 'min_samples_leaf': 2, 'max_features': 40, 'max_depth': 20}
Accuracy Score for Random Forest with Random Search: 0.950063


In [27]:
print("Confusion Matrix After Hypertuning for Decision Tree")
print(confusion_matrix(y_test,hyperparameter_grid_results))

Confusion Matrix After Hypertuning for Decision Tree
[[2984   12]
 [ 173   15]]


Hyperparameter tuning with Grid Search is once again slightly more accurate than random search. 

In [28]:
best_forest_grid = RandomForestClassifier(**random_forest_grid.best_params_)

best_forest_grid.fit(X_train, y_train)
best_forest_grid_soln = best_forest_grid.predict(X_test)

In [29]:
# Accuracy Check

In [30]:
print("Confusion Matrix Using Best Parameters")
print(confusion_matrix(y_test,best_forest_grid_soln))

Confusion Matrix Using Best Parameters
[[2987    9]
 [ 143   45]]


In [31]:
print("=== CLASSIFICATION REPORT ===")
print(classification_report(y_test,best_forest_grid_soln))

=== CLASSIFICATION REPORT ===
              precision    recall  f1-score   support

           0       0.95      1.00      0.98      2996
           1       0.83      0.24      0.37       188

    accuracy                           0.95      3184
   macro avg       0.89      0.62      0.67      3184
weighted avg       0.95      0.95      0.94      3184



##### Cross-Validation Using Best Parameters from Grid Search

In [32]:
cross_validation_score = cross_val_score(best_forest_grid, X_train, y_train, cv=25, scoring="balanced_accuracy")
print(f"Cross Validation for Grid Search:\n {cross_validation_score}")

Cross Validation for Grid Search:
 [0.61861065 0.61965015 0.56792602 0.63585203 0.5862069  0.61965015
 0.56792602 0.58204889 0.65413291 0.58720671 0.58824621 0.65967478
 0.57142857 0.64077814 0.69434957 0.6239605  0.55253193 0.64077814
 0.64077814 0.63973864 0.65967478 0.64181764 0.64181764 0.67545293
 0.58720238]
