In [1]:
import pandas as pd
import csv
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv("Root_Insurance_data.csv",delimiter = ",")

In [3]:
## Input: Data Frame (in the format of "Root_Insurance_data.csv") and encode_rank (default = False)
## Output: Data Frame (processed by one hot encoding ready to be used to make models)

## This function will one hot encode the following columns in the data frame: 
## Currently Insured, Marital Status, click, rank (optional, determined by encode_rank = True)
## and will drop the respective columns associated to the respective values of above columns:
## unknown, S, False, 5.0 (if encode_rank = True)

import category_encoders as ce

def prepare_data(data:pd.DataFrame(), encode_rank = False, drop_column = False) -> pd.DataFrame():
    temp = data.copy()
    
    if encode_rank:
        one_hot_cols = ["Currently Insured", "Marital Status", "click", "rank"]
        drop_cols = ["Currently Insured_unknown", "Marital Status_S", "rank_5.0", "click_False"]
        column_names = ['Currently Insured_Y', 'Currently Insured_N', 'Number of Vehicles',
                        'Number of Drivers', 'Marital Status_M', 'bid', 'rank_1.0',
                        'rank_2.0', 'rank_3.0', 'rank_4.0', 'click_True', 'policies_sold']
    else:
        one_hot_cols = ["Currently Insured", "Marital Status", "click"]
        drop_cols = ["Currently Insured_unknown", "Marital Status_S", "click_False"]
        column_names = ['Currently Insured_Y', 'Currently Insured_N', 'Number of Vehicles','Number of Drivers',
                        'Marital Status_M', 'bid', 'rank', 'click_True', 'policies_sold']
    
    encoder = ce.OneHotEncoder(cols = one_hot_cols, use_cat_names = True)
    temp = encoder.fit_transform(temp)
    temp = temp.drop(columns = drop_cols)
    temp = temp.reindex(columns=column_names)
    return temp

In [4]:
df = prepare_data(data)

  elif pd.api.types.is_categorical(cols):


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone
from sklearn.metrics import average_precision_score, roc_auc_score, precision_recall_curve, accuracy_score

features = ['Currently Insured_Y',
            'Currently Insured_N',
            'Number of Vehicles',
            'Number of Drivers',
            'Marital Status_M',
            'bid',
            'rank']
predictor_var = "click_True"

df_train, df_test = train_test_split(df,
                                     shuffle=True,
                                     random_state=12345,
                                     test_size=.2,
                                     stratify=df[predictor_var])

In [6]:
## Allows for variantion of resampling with condition resampling = "None", "Up", "Down" or "Balanced"
## Allow for variation of scoring method for this we will use: scoring_method = 'accuracy', 'average_precision', 'roc_auc' 

from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample,shuffle
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Number of trees in random forest
n_estimators = [10, 30, 100, 300, 1000, 3000]
# Number of features to consider at every split
max_features = range(1,len(features) + 1)
# Maximum number of levels in tree
max_depth = range(1,11)
# Method of selecting samples for training each tree

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth}

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=77)

def obtain_rf_param(data:pd.DataFrame(), scoring_method:str) -> []:
    df = data
    rf = RandomForestClassifier(random_state = 808)
    rf_search = GridSearchCV(estimator = rf, 
                             param_grid = random_grid, 
                             cv = kfold, 
                             scoring = scoring_method,
                             verbose=3, 
                             n_jobs = -1)
    rf_search.fit(df[features], df[predictor_var])
    
    return [scoring_method, rf_search.best_score_, rf_search.best_params_]

def obtain_rf_scores(data:pd.DataFrame(), num_of_splits = 5, resampling = "None") -> []:
    
    avg_precision_cross_val = np.zeros(num_of_splits)
    roc_auc_cross_val = np.zeros(num_of_splits) 
    accuracy_cross_val = np.zeros(num_of_splits) 

    kfold = StratifiedKFold(n_splits=num_of_splits, shuffle=True, random_state=77)
    
    i = 0
    for train_index, test_index in kfold.split(df_train[features], df_train[predictor_var]):
        df_train_train = resample_data(data.iloc[train_index], resample_type = resampling)
        df_train_val = data.iloc[test_index]
        
        
        rf_cla = RandomForestClassifier(n_estimators = 300,
                                        max_features = 3,
                                        max_depth=6, 
                                        random_state = 808)
        rf_cla.fit(df_train_train[features], df_train_train[predictor_var])

        avg_precision_cross_val[i] = average_precision_score(df_train_val[predictor_var], 
                                                             rf_cla.predict_proba(df_train_val[features])[:,1])
        roc_auc_cross_val[i] = roc_auc_score(df_train_val[predictor_var], 
                                             rf_cla.predict_proba(df_train_val[features])[:,1])
        accuracy_cross_val[i] = accuracy_score(df_train_val[predictor_var], 
                                               rf_cla.predict(df_train_val[features]))
        i += 1
    
    return [avg_precision_cross_val, roc_auc_cross_val, accuracy_cross_val]

def resample_data(df:pd.DataFrame(), resample_type = "None") -> pd.DataFrame():
    df_minority = df[df[predictor_var] == 1]
    df_majority = df[df[predictor_var] != 1]
    if resample_type == "Up":
        df_minority_up = resample(df_minority,
                                  random_state=886,
                                  n_samples=len(df_majority),
                                  replace=True)
        df_resampled = pd.concat([df_minority_up, df_majority]).sample(frac=1, random_state = 0)
    elif resample_type == "Down":
        df_majority_down = resample(df_majority,
                                    random_state=886,
                                    n_samples=len(df_minority),
                                    replace=True)
        df_resampled = pd.concat([df_majority_down, df_minority]).sample(frac=1, random_state = 0)
    elif resample_type == "Balanced":
        df_majority_bal = resample(df_majority,
                                           random_state=886,
                                           n_samples= int(len(df)/2),
                                           replace=True)
        df_minority_bal = resample(df_minority,
                                               random_state=886,
                                               n_samples=int(len(df)/2),
                                               replace=True)
        df_resampled = pd.concat([df_majority_bal, df_minority_bal]).sample(frac=1, random_state = 0) 
    else:
        df_resampled = df
    return df_resampled

        

In [7]:
print(obtain_rf_param(df_train, scoring_method = 'average_precision'))
print(obtain_rf_param(df_train, scoring_method = "roc_auc"))
print(obtain_rf_param(df_train, scoring_method = "accuracy"))

Fitting 5 folds for each of 420 candidates, totalling 2100 fits
['average_precision', 0.4525124866438704, {'max_depth': 3, 'max_features': 5, 'n_estimators': 300}]
Fitting 5 folds for each of 420 candidates, totalling 2100 fits
['roc_auc', 0.7986364700047434, {'max_depth': 4, 'max_features': 7, 'n_estimators': 10}]
Fitting 5 folds for each of 420 candidates, totalling 2100 fits
['accuracy', 0.8238749999999999, {'max_depth': 7, 'max_features': 3, 'n_estimators': 30}]


### Using above parameters, Testing resampling

In [8]:
num_of_splits = 5
avg_precision_cross_val = np.zeros(num_of_splits)
roc_auc_cross_val = np.zeros(num_of_splits) 
accuracy_cross_val = np.zeros(num_of_splits)  

kfold = StratifiedKFold(n_splits=num_of_splits, shuffle=True, random_state=77)

i = 0
for train_index, test_index in kfold.split(df_train[features], df_train[predictor_var]):
    df_train_train = resample_data(df_train.iloc[train_index], resample_type = "Up")
    df_train_val = df_train.iloc[test_index]
    
    rf_cla = RandomForestClassifier(n_estimators = 300,
                            max_features = 5,
                            max_depth=3, 
                            random_state = 808)
    rf_cla.fit(df_train_train[features], df_train_train[predictor_var])

    avg_precision_cross_val[i] = average_precision_score(df_train_val[predictor_var], 
                                                         rf_cla.predict_proba(df_train_val[features])[:,1])
    roc_auc_cross_val[i] = roc_auc_score(df_train_val[predictor_var], 
                                         rf_cla.predict_proba(df_train_val[features])[:,1])
    accuracy_cross_val[i] = accuracy_score(df_train_val[predictor_var], 
                                           rf_cla.predict(df_train_val[features]))
    i += 1

In [9]:
print(avg_precision_cross_val)
print(roc_auc_cross_val)
print(accuracy_cross_val)

[0.44804297 0.46041125 0.43609563 0.47155614 0.43829319]
[0.79931406 0.79793554 0.79344487 0.80687051 0.78651795]
[0.7375   0.761875 0.749375 0.75875  0.755   ]


In [10]:
def info(avg_precision_cross_val, roc_auc_cross_val, accuracy_cross_val):
    print("Average cross val AUCPR = " + str(np.average(avg_precision_cross_val)))
    print()
    print("Average cross val AUCROC = " + str(np.average(roc_auc_cross_val)))
    print()
    print("Average cross val Accuracy = " + str(np.average(accuracy_cross_val)))

### No Resample Case

In [11]:
avg_precision_cross_val, roc_auc_cross_val, accuracy_cross_val = obtain_rf_scores(df_train)
info(avg_precision_cross_val, roc_auc_cross_val, accuracy_cross_val)

Average cross val AUCPR = 0.45090075694613246

Average cross val AUCROC = 0.7960129162439329

Average cross val Accuracy = 0.8227500000000001


### Upsample Case

In [12]:
avg_precision_cross_val_up, roc_auc_cross_val_up, accuracy_cross_val_up = obtain_rf_scores(df_train, resampling = "Up")
info(avg_precision_cross_val_up, roc_auc_cross_val_up, accuracy_cross_val_up)

Average cross val AUCPR = 0.4480204608901244

Average cross val AUCROC = 0.7950765526451278

Average cross val Accuracy = 0.745125


### Downsample Case

In [13]:
avg_precision_cross_val_down, roc_auc_cross_val_down, accuracy_cross_val_down = obtain_rf_scores(df_train, resampling = "Down")
info(avg_precision_cross_val_down, roc_auc_cross_val_down, accuracy_cross_val_down)

Average cross val AUCPR = 0.440283879285963

Average cross val AUCROC = 0.7923455348164377

Average cross val Accuracy = 0.7322500000000001


### Balanced Sampling Case

In [14]:
avg_precision_cross_val_bal, roc_auc_cross_val_bal, accuracy_cross_val_bal = obtain_rf_scores(df_train, resampling = "Balanced")
info(avg_precision_cross_val_bal, roc_auc_cross_val_bal, accuracy_cross_val_bal)

Average cross val AUCPR = 0.4471759847829769

Average cross val AUCROC = 0.7940553079675395

Average cross val Accuracy = 0.73725
