# Fraud Model: Tuning Random Forest Regressor

Purpose
- Tune the hyperparameters of a random forest regressor that predicts consumer fraud probability, then fit and export the chosen model.

In [1]:
import itertools
import os
import pickle

import pandas as pd

from sklearn.model_selection import train_test_split

In [2]:
from sklearn.ensemble import RandomForestRegressor

In [3]:
# Create the directory that stores the tuning results.
output_relative_dirs = ['../data/tuning']

# exist_ok=True makes makedirs a no-op when the directory already exists, so
# the cell can be re-run safely and there is no gap between a separate
# existence check and the creation.
for output_relative_dir in output_relative_dirs:
    os.makedirs(output_relative_dir, exist_ok=True)

## Import Data

In [4]:
# Read the consumer fraud model-building data from the curated data folder.
fraud_data_path = '../data/curated/fraud/input/consumer_fraud_model_building_data.csv'
data = pd.read_csv(fraud_data_path)

## Train Test Split

In [7]:
# Split the data into roughly 70% train, 15% validation and 15% test.
# Both splits use the same fixed seed so the partition is reproducible.
split_seed = 19260817

# First carve off 70% for training ...
train, valtest = train_test_split(data, train_size=0.7, random_state=split_seed)
# ... then halve the remaining hold-out into validation and test.
val, test = train_test_split(valtest, train_size=0.5, random_state=split_seed)

## Select Variables

In [8]:
# Select the model inputs (X) and the target (y) for each split.
# The column lists are written once and reused for all three splits.
feature_columns = ['transact_amount_perOrder_ratio', 'transact_amount_perOrder',
    'transact_amount_perOrder_sd', 'transact_amount_perOrder_sd_ratio']
# Indexing with a list keeps each y as a single-column DataFrame.
target_columns = ['fraud_probability']

XTrain, yTrain = train[feature_columns], train[target_columns]
XVal, yVal = val[feature_columns], val[target_columns]
XTest, yTest = test[feature_columns], test[target_columns]

## Prepare combinations and try to tune

In [9]:
# Build the full hyperparameter grid: every combination of the values below.
n_estimators = [50, 100, 150]
max_depths = [6, 12, 18, 24]
max_samples = [0.25, 0.5, 0.75]
# Every entry must be a float: scikit-learn treats a float max_features as a
# fraction of the features but an int as an absolute count, so the int 1
# would mean "one feature per split" instead of "all features".
max_features = [0.25, 0.5, 0.75, 1.0]
ccp_alphas = [0, 0.001, 0.01, 0.1, 1, 10]

# Each combo is [n_estimators, max_depth, max_samples, max_features, ccp_alpha].
# itertools.product varies the last list fastest, i.e. the same order as
# nesting one for-loop per list.
combos = [
    list(combo)
    for combo in itertools.product(
        n_estimators, max_depths, max_samples, max_features, ccp_alphas)
]

In [None]:
# Tune hyperparameters: fit one forest per combination and record its scores
# on the train, validation and test splits.
# NOTE: RandomForestRegressor.score returns R^2, not a classification
# accuracy; the '*_accuracy' column names are kept so the CSV schema is unchanged.
# NOTE(review): test scores are logged for every combination; choose the final
# combination from the validation scores only so the test score stays unbiased.
tuning_records = []
tuning_results = pd.DataFrame()

for c in combos:
    RF = RandomForestRegressor(n_estimators = c[0],
                               max_depth = c[1],
                               max_samples = c[2],
                               max_features = c[3],
                               ccp_alpha = c[4], random_state = 19260817)
    print(RF)

    # ravel() passes a 1-D target, which avoids sklearn's DataConversionWarning
    # for column-vector y on every fit.
    RF.fit(XTrain, yTrain.values.ravel())
    train_accu = RF.score(XTrain, yTrain)
    val_accu = RF.score(XVal, yVal)
    test_accu = RF.score(XTest, yTest)

    tuning_records.append({'n_estimators': c[0],
                           'max_depth': c[1],
                           'max_samples': c[2],
                           'max_features': c[3],
                           'ccp_alpha': c[4],
                           'training_accuracy': train_accu,
                           'validation_accuracy': val_accu,
                           'testing_accuracy': test_accu})

    # DataFrame.append was removed in pandas 2.0, so the frame is rebuilt from
    # the accumulated records (rows get a running index). The CSV is rewritten
    # every iteration, so the results so far are on disk if the run is interrupted.
    tuning_results = pd.DataFrame(tuning_records)
    tuning_results.to_csv('../data/tuning/RFR_brute.csv')

## Fit and export the final model with the chosen hyperparameter combination

In [9]:
# Fit the final model with the chosen hyperparameter combination.
final_RF = RandomForestRegressor(n_estimators = 100,
                               max_depth = 24,
                               max_samples = 0.25,
                               max_features = 0.75,
                               ccp_alpha = 0.01, random_state = 19260817)

# Fit once; ravel() passes a 1-D target and avoids sklearn's
# DataConversionWarning for column-vector y.
final_RF.fit(XTrain, yTrain.values.ravel())

# R^2 of the final model on each split (RandomForestRegressor.score).
train_accu = final_RF.score(XTrain, yTrain)
val_accu = final_RF.score(XVal, yVal)
test_accu = final_RF.score(XTest, yTest)

# Show the three scores as the cell output.
train_accu, val_accu, test_accu


In [10]:
# Export the fitted final model as a pickle file.
model_path = '../models/consumer_fraud_rfr.pickle'

# open() raises FileNotFoundError when the parent directory is missing, so
# create it first (no-op if it already exists).
os.makedirs(os.path.dirname(model_path), exist_ok=True)

with open(model_path, 'wb') as f:
    pickle.dump(final_RF, f)