# Question One - Fraud Detection 

##### Import Required Libraries

In [1]:
# Reading / Writing Files
import pandas as pd

# Preprocessing
from sklearn.model_selection import train_test_split

# Encoding
from sklearn.preprocessing import OneHotEncoder

# Models / Functions / Ensembles
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Accuracy Metrics
from sklearn.metrics import classification_report, confusion_matrix

# Miscellaneous
import time
import warnings
warnings.filterwarnings("ignore")

##### Load in Data, Create Dataframes and Perform Basic Data Cleaning & Operations

In [2]:
insurance_fraud_train_df = pd.read_csv('data/assignment_two/insurance_fraud_train.csv').dropna(axis='columns', how='all')
insurance_fraud_test_df = pd.read_csv('data/assignment_two/insurance_fraud_test.csv').dropna(axis='columns', how='all')

In [3]:
print(f"Observations in Train DF: {len(insurance_fraud_train_df)}.")
print(f"Observations in Test DF: {len(insurance_fraud_test_df)}.")

# Peculiar how the Train DF is like 19% the size of all the data, our model isn't going to be trained very well..
# I'm not going to take some creative freedom on this HW..

insurance_fraud_train_df.head(1)

Observations in Train DF: 2999.
Observations in Test DF: 12918.


Unnamed: 0,MONTH,WEEKOFMONTH,DAYOFWEEK,MAKE,ACCIDENTAREA,DAYOFWEEKCLAIMED,MONTHCLAIMED,WEEKOFMONTHCLAIMED,SEX,MARITALSTATUS,...,AGEOFPOLICYHOLDER,POLICEREPORTFILED,WITNESSPRESENT,AGENTTYPE,NUMBEROFSUPPLIMENTS,ADDRESSCHANGE_CLAIM,NUMBEROFCARS,YEAR,BASEPOLICY,FRAUDFOUND
0,Jul,3,Sunday,Honda,Rural,Wednesday,Jan,4,Male,Married,...,26_to_30,No,No,External,3_to_5,no_change,1-vehicle,1994,Collision,Yes


In [4]:
concat_df = pd.concat([insurance_fraud_train_df,insurance_fraud_test_df])
concat_df['FRAUDFOUND'] = concat_df['FRAUDFOUND'].str.replace('No','0').replace('Yes','1')

# Clearly based on the information above, "One-Hot-Encoding" Will Be Necessary..
# Pandas has gotten significantly better at encoding (since Python 3+)
# Thus, won't even need to use Sklearn's OneHotEncoding function, just use pd.get_dummies
# See below

encoding_columns = concat_df.keys().tolist()

In [5]:
# Train / Test Split using Sklearn's function.
X = concat_df.drop('FRAUDFOUND',axis=1)
X_encoded = pd.get_dummies(X, columns=X.keys().tolist())
y = concat_df[['FRAUDFOUND']]

X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, train_size=0.80, random_state=42)

## Create a Model

### Sklearn Basic Decision Tree Model

In [6]:
decision_tree_model = DecisionTreeClassifier()
decision_tree_model.fit(X_train, y_train)
X_test_prediction = decision_tree_model.predict(X_test)

print(f"Accuracy Score for Basic Decision Tree: {round(decision_tree_model.score(X_test,y_test),6)}")

print("Confusion Matrix for Decision Tree:")
print(confusion_matrix(y_test,X_test_prediction))

Accuracy Score for Basic Decision Tree: 0.959171
Confusion Matrix for Decision Tree:
[[2919   77]
 [  53  135]]


#####  Hyperparameter Tuning an Sklearn Decision Tree Model

In [7]:
parameters = {
    'min_samples_split' : [i*5 for i in range(1,20)],
    'criterion' : ['gini','entropy'],
    'max_leaf_nodes' : [i for i in range(5,10)]
}

##### Grid Search

In [8]:
decision_tree_model_grid = GridSearchCV(decision_tree_model,parameters,n_jobs=-1,verbose=True)
decision_tree_model_grid.fit(X_train, y_train)

grid_parameter_soln = decision_tree_model_grid.best_params_
print(f"Grid Search Optimal Parameters: {grid_parameter_soln}")

Fitting 5 folds for each of 190 candidates, totalling 950 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    9.5s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   16.1s


Grid Search Optimal Parameters: {'criterion': 'gini', 'max_leaf_nodes': 9, 'min_samples_split': 35}


[Parallel(n_jobs=-1)]: Done 950 out of 950 | elapsed:   19.9s finished


##### Random Search

In [9]:
decision_tree_model_random = RandomizedSearchCV(decision_tree_model,parameters,n_iter=25,cv=5,n_jobs=-1,verbose=True)
decision_tree_model_random.fit(X_train, y_train)
random_parameter_soln = decision_tree_model_random.best_params_
print(f"Random Search Optimal Parameters: {random_parameter_soln}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    1.0s


Random Search Optimal Parameters: {'min_samples_split': 20, 'max_leaf_nodes': 8, 'criterion': 'gini'}


[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed:    2.2s finished


##### Creating Models with Hyperparameter Solutions

##### Grid

In [10]:
hyperparameter_grid = DecisionTreeClassifier(**grid_parameter_soln)

hyperparameter_grid.fit(X_train,y_train)
hyperparameter_grid_results = hyperparameter_grid.predict(X_test)

In [11]:
print(f"Accuracy Score After Hypertuning Grid Search for Decision Tree: {round(hyperparameter_grid.score(X_test,y_test),6)}")

Accuracy Score After Hypertuning Grid Search for Decision Tree: 0.942211


##### Random

In [12]:
hyperparameter_random = DecisionTreeClassifier(**random_parameter_soln)

hyperparameter_random.fit(X_train,y_train)
hyperparameter_random_results = hyperparameter_random.predict(X_test)

In [13]:
print(f"Accuracy Score After Hypertuning Random Search for Decision Tree: {round(hyperparameter_random.score(X_test,y_test),6)}")

Accuracy Score After Hypertuning Random Search for Decision Tree: 0.942211


Hyperparameter tuning with Grid Search produced marginally better results than random search results.
This is to be expected, as Grid Search will produce more accurate results at the expense of additional resources (time). Let's see the results of Hyperparameter Tuning a Decision Tree using Grid Search.

##### Using Optimal Parameters

In [14]:
print("=== Confusion Matrix After Hypertuning for Decision Tree ===")
print(confusion_matrix(y_test,hyperparameter_grid_results))

=== Confusion Matrix After Hypertuning for Decision Tree ===
[[2990    6]
 [ 178   10]]


In [15]:
print("=== CLASSIFICATION REPORT ===")
print(classification_report(y_test,hyperparameter_grid_results))

=== CLASSIFICATION REPORT ===
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      2996
           1       0.62      0.05      0.10       188

    accuracy                           0.94      3184
   macro avg       0.78      0.53      0.53      3184
weighted avg       0.92      0.94      0.92      3184



##### Cross Validation for Hyperparameterized Grid Search

In [16]:
classifier_cross_validation_score = cross_val_score(hyperparameter_grid, X_train, y_train, cv=25, scoring="balanced_accuracy")
print(f"Cross Validation for Grid Search:\n {classifier_cross_validation_score}")

Cross Validation for Grid Search:
 [0.54860563 0.53448276 0.51724138 0.55172414 0.53448276 0.53448276
 0.51724138 0.53344326 0.53467478 0.497921   0.51785714 0.55357143
 0.51681764 0.53571429 0.55357143 0.53467478 0.51785714 0.55357143
 0.55149243 0.54837392 0.51681764 0.53467478 0.51681764 0.55357143
 0.51515805]


### Random Forest Modeling

##### Sklearn Basic Random Forest Model

In [17]:
random_forest = RandomForestClassifier()

In [18]:
random_forest.fit(X_train, y_train)
random_forest_soln = random_forest.predict(X_test)

In [19]:
print(f"Accuracy Score for Default Random Forest: {round(random_forest.score(X_test,y_test),6)}")

Accuracy Score for Default Random Forest: 0.979271


In [20]:
print("=== Confusion Matrix for Default Random Forest ===")
print(confusion_matrix(y_test,random_forest_soln))

=== Confusion Matrix for Default Random Forest ===
[[2996    0]
 [  66  122]]


##### Random Forest Grid Search

In [21]:
random_parameters = {
    'min_samples_leaf' : [i for i in range(1,3)],
    'max_depth': [i for i in range(20,25)],
    'max_features':[i for i in range(15,25)],
    'n_estimators':[i for i in range(1,10)]
}

In [22]:
random_forest_grid = GridSearchCV(random_forest,random_parameters,n_jobs=-1,verbose=True)

random_forest_grid.fit(X_train,y_train)
random_forest_grid_soln = random_forest_grid.predict(X_test)

print(random_forest_grid.best_params_)

Fitting 5 folds for each of 900 candidates, totalling 4500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 352 tasks      | elapsed:    9.5s
[Parallel(n_jobs=-1)]: Done 852 tasks      | elapsed:   24.5s
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed:   45.8s
[Parallel(n_jobs=-1)]: Done 2452 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 3552 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 4500 out of 4500 | elapsed:  2.2min finished


{'max_depth': 24, 'max_features': 19, 'min_samples_leaf': 1, 'n_estimators': 8}


In [23]:
print(f"Accuracy Score After Hypertuning Grid Search for Decision Tree: {round(random_forest_grid.score(X_test,y_test),6)}")

Accuracy Score After Hypertuning Grid Search for Decision Tree: 0.967651


##### Random Forest Random Search

In [24]:
random_forest_random = RandomizedSearchCV(random_forest,random_parameters,n_iter=25,cv=5,n_jobs=-1,verbose=True)

In [25]:
random_forest_random.fit(X_train, y_train)
random_forest_random_soln = random_forest_random.best_params_

print(random_forest_random_soln)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.5s


{'n_estimators': 9, 'min_samples_leaf': 1, 'max_features': 16, 'max_depth': 24}


[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed:    4.0s finished


In [26]:
print(f"Accuracy Score for Random Forest with Random Search: {round(random_forest_random.score(X_test,y_test),6)}")

Accuracy Score for Random Forest with Random Search: 0.967337


In [27]:
print("=== Confusion Matrix After Hypertuning for Decision Tree ===")
print(confusion_matrix(y_test,hyperparameter_grid_results))

=== Confusion Matrix After Hypertuning for Decision Tree ===
[[2990    6]
 [ 178   10]]


Hyperparameter tuning with Grid Search is once again slightly more accurate than random search. 

##### Using the Best Parameters from GridSearchCV

In [28]:
best_forest_grid = RandomForestClassifier(**random_forest_grid.best_params_)

best_forest_grid.fit(X_train, y_train)
best_forest_grid_soln = best_forest_grid.predict(X_test)

In [29]:
print(f"Accuracy Score for Random Forest with Optimal Grid Search Parameters: {round(best_forest_grid.score(X_test,y_test),6)}")

Accuracy Score for Random Forest with Optimal Grid Search Parameters: 0.968593


In [30]:
print("=== Confusion Matrix Using Best Parameters ===")
print(confusion_matrix(y_test,best_forest_grid_soln))

=== Confusion Matrix Using Best Parameters ===
[[2990    6]
 [  94   94]]


In [31]:
print("=== CLASSIFICATION REPORT ===")
print(classification_report(y_test,best_forest_grid_soln))

=== CLASSIFICATION REPORT ===
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      2996
           1       0.94      0.50      0.65       188

    accuracy                           0.97      3184
   macro avg       0.95      0.75      0.82      3184
weighted avg       0.97      0.97      0.96      3184



##### Cross-Validation Using Best Parameters from Grid Search

In [32]:
cross_validation_score = cross_val_score(best_forest_grid, X_train, y_train, cv=25, scoring="balanced_accuracy")
print(f"Cross Validation for Grid Search:\n {cross_validation_score}")

Cross Validation for Grid Search:
 [0.77482257 0.75758119 0.68757617 0.80930533 0.70689655 0.79310345
 0.74033981 0.75862069 0.83824621 0.80149243 0.80253193 0.73110336
 0.73110336 0.73214286 0.75       0.67857143 0.64285714 0.8739605
 0.76785714 0.76681764 0.73110336 0.76577814 0.71324621 0.69538907
 0.80930316]
