In [11]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from datetime import timedelta
from math import log10
import re
%matplotlib inline
# Record the wall-clock start so the final cell can report the total runtime.
# `pd.datetime` was only a deprecated alias of the stdlib class and no longer
# exists in pandas 2.0+, so use the `datetime` imported above directly.
start_time = datetime.now()

In [12]:
# Load the feature table from CSV into `df`; every later cell slices this frame.
df = pd.read_csv('final30var_Amy.csv')

In [13]:
# Display (n_rows, n_columns) of the loaded table.
df.shape

(96397, 33)

In [14]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,Cardnum_max_30,Cardnum_mean_0,Cardnum_total_1,Cardnum_total_14,Cardnum_total_30,Merchnum_total_1,amount/amount1Cardnum30,card_merch_max_3,card_merch_max_7,card_merch_median_3,...,card_zip_median_3,card_zip_total_1,card_zip_total_14,card_zip_total_30,card_zip_total_7,Cardnum_max_0,card_merch_mean_3,Recnum,Date,Fraud
0,-0.68042,-0.26269,-0.132609,-0.369571,-0.438583,-0.185258,-0.510944,-0.137732,-0.176881,-0.182334,...,-0.19251,-0.053167,-0.102434,-0.143212,-0.0791,-0.23624,-0.181973,1,2010-01-01,0
1,-0.68042,-0.26269,-0.132609,-0.369571,-0.438583,-0.185258,-0.510944,-0.137732,-0.176881,-0.182334,...,-0.19251,-0.053167,-0.102434,-0.143212,-0.0791,-0.23624,-0.181973,2,2010-01-01,0
2,-0.68042,-0.26269,-0.132609,-0.369571,-0.438583,-0.185258,-0.510944,-0.137732,-0.176881,-0.182334,...,-0.19251,-0.053167,-0.102434,-0.143212,-0.0791,-0.23624,-0.181973,3,2010-01-01,0
3,-0.68042,-0.26269,-0.132609,-0.369571,-0.438583,-0.184382,-0.510944,-0.137732,-0.176881,-0.182334,...,-0.19251,-0.053167,-0.102434,-0.143212,-0.0791,-0.23624,-0.181973,4,2010-01-01,0
4,-0.678667,-0.255398,-0.131937,-0.369174,-0.43833,-0.183506,4.194033,-0.132589,-0.172064,-0.172925,...,-0.182961,-0.05221,-0.101516,-0.142329,-0.078164,-0.231503,-0.172922,5,2010-01-01,0


In [15]:
# Columns that must never reach the model:
#   - 'Date' / 'Recnum' are only used for windowing / identification.
#   - 'Unnamed: *' columns appear when a CSV carries a saved index; in the preview
#     above 'Unnamed: 0' simply counts rows (0, 1, 2, ...), so leaving it in X
#     would hand the row position to the classifier as if it were a feature.
non_feature_cols = ['Date', 'Recnum'] + [c for c in df.columns if str(c).startswith('Unnamed')]

# Modelling window: strictly after 2010-01-14 and before 2010-11-01.
# Out-of-time (OOT) window: 2010-11-01 onwards.
# Date is compared as a string, which relies on the ISO 'YYYY-MM-DD' format.
in_model_window = (df.Date > '2010-01-14') & (df.Date < '2010-11-01')
in_oot_window = df.Date >= '2010-11-01'

# Boolean indexing plus drop() both return new frames, so `df` itself is untouched.
df_model = df[in_model_window].drop(columns=non_feature_cols)
df_oot = df[in_oot_window].drop(columns=non_feature_cols)

# Split features from the 'Fraud' label; each frame is masked by its own columns.
X = df_model.loc[:, df_model.columns != 'Fraud']
y = df_model['Fraud']
X_oot = df_oot.loc[:, df_oot.columns != 'Fraud']
y_oot = df_oot['Fraud']

In [16]:
# Fraud detection rate (FDR) at a given score cut-off
def calculateFDR(predict, fraudscore, y, percent=0.03):
    """Return the share of all positive labels found in the top-scored records.

    Records are ranked by ``fraudscore`` (highest first); the first
    ``int(n_records * percent)`` of them are kept, and the sum of ``y`` inside
    that slice is divided by the sum of ``y`` over all records.

    predict    -- per-record predictions; stored alongside the labels but not
                  used in the calculation.
    fraudscore -- per-record score used for the ranking.
    y          -- true labels; must provide ``.tolist()``.
    percent    -- fraction of the ranked records to keep (default 0.03).
    """
    scored = pd.DataFrame({'true': y.tolist(), 'predict': predict, 'score': fraudscore})
    ranked = scored.sort_values('score', ascending=False)
    cutoff = int(len(ranked) * percent)
    caught = np.sum(ranked['true'].iloc[:cutoff])
    total = np.sum(ranked['true'])
    return caught / total

## Random Forest

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [18]:
def randomforestFDR(X, X_oot, y, y_oot, iteration=10, n_estimators=50, max_features=10, max_depth=20, random_state=None):
    """Fit a random forest on repeated random splits and report FDR on train/test/OOT.

    Parameters
    ----------
    X, y : modelling features and labels. They are re-split on every iteration
        with ``train_test_split`` defaults (test_size 0.25, shuffled).
    X_oot, y_oot : out-of-time features and labels, scored by every fitted model.
    iteration : number of split-and-fit repetitions.
    n_estimators, max_features, max_depth : forwarded to ``RandomForestClassifier``.
    random_state : ``None`` (default) leaves splitting and fitting unseeded, as before.
        An int or ``np.random.RandomState`` makes the whole run reproducible; one
        generator is shared by all iterations, so each iteration still gets a
        different split and a different forest.

    Returns
    -------
    fdr_df : list with one ``[train, test, oot]`` FDR triple per iteration.
    result : ``[avg_train, avg_test, avg_oot]`` averaged over the iterations.
    """
    # One shared generator (rather than re-using an int seed) keeps iterations distinct.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    rf = RandomForestClassifier(n_estimators=n_estimators, max_features=max_features,
                                max_depth=max_depth, random_state=rng)

    fdr_df = []
    fdr_total = [0, 0, 0]
    for _ in range(iteration):
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
        rf.fit(X_train, y_train)  # fit() rebuilds the forest from scratch each time

        datasets = [(X_train, y_train), (X_test, y_test), (X_oot, y_oot)]
        fdr_list = []
        for pos, (features, labels) in enumerate(datasets):
            prob = rf.predict_proba(features)
            # The forest's predicted class is the one with the highest mean probability,
            # so derive the labels from `prob` instead of scoring the data a second time.
            predict = rf.classes_.take(np.argmax(prob, axis=1), axis=0)
            # Column 1 is used as the fraud score, i.e. the second entry of rf.classes_.
            fraudscore = prob[:, 1]
            fdr_each = calculateFDR(predict, fraudscore, labels)
            fdr_total[pos] += fdr_each
            fdr_list.append(fdr_each)

        fdr_df.append(fdr_list)

    result = [total / iteration for total in fdr_total]

    return fdr_df, result

In [19]:
# Hyperparameter grid search for the random forest
tree = [400]           # n_estimators; Towards Data Science guide suggests 200~2000
features = [10,15,20]  # max_features; must be less than the number of variables (original choice: [5,10,15,20])
depth = [10,50]        # max_depth; Towards Data Science guide suggests 10~110 (original choice: [10,20,50,100])

maximum = 0   # best average test FDR seen so far
params = []   # every [n_estimators, max_features, max_depth] combination tried
fdr_all = []  # per-iteration [train, test, oot] FDRs for each combination
fdr_mean = [] # averaged [train, test, oot] FDRs for each combination

grid = [(n_trees, n_feat, max_d) for n_trees in tree for n_feat in features for max_d in depth]
for n_trees, n_feat, max_d in grid:
    per_iteration, averages = randomforestFDR(
        X, X_oot, y, y_oot,
        iteration=10, n_estimators=n_trees, max_features=n_feat, max_depth=max_d,
    )
    params.append([n_trees, n_feat, max_d])
    fdr_mean.append(averages)
    fdr_all.append(per_iteration)

    # The winner is chosen on the average *test* FDR (index 1), not on OOT.
    test_fdr = averages[1]
    if test_fdr > maximum:
        maximum = test_fdr
        maxfdr = averages
        maxparams = [n_trees, n_feat, max_d]

print(f'n_estimators: {maxparams[0]}, max_features: {maxparams[1]}, max_depth: {maxparams[2]} with fdr: {maximum} on the test data')

n_estimators: 400, max_features: 20, max_depth: 50 with fdr: 0.8770039259282214 on the test data


In [28]:
# Summary table: one row per parameter combination with its averaged [train, test, oot] FDR.
output30 = pd.DataFrame({'params': params, 'fdr': fdr_mean})

# Per-iteration table: three columns per parameter combination, placed side by side.
# Only the first column of each triple carries the parameter label.
per_param_frames = [
    pd.DataFrame(iteration_fdrs, columns=[str(combo) + ': train', 'test', 'oot'])
    for combo, iteration_fdrs in zip(params, fdr_all)
]
output30_alliter = pd.concat(per_param_frames, axis=1)

output30.to_csv('output30.csv')
output30_alliter.to_csv('output30_alliter.csv')

In [21]:
# Load the variable ranking table (columns 'ranking' and 'variable') and show up to 30 rows.
var_ranking = pd.read_csv('var_ranking_Amy.csv')
var_ranking.head(30)

Unnamed: 0,ranking,variable
0,1,Cardnum_max_30
1,1,Cardnum_mean_0
2,1,Cardnum_total_1
3,1,Cardnum_total_14
4,1,Cardnum_total_30
5,1,Merchnum_total_1
6,1,amount/amount1Cardnum30
7,1,card_merch_max_3
8,1,card_merch_max_7
9,1,card_merch_median_3


In [22]:
# Rebuild the modelling / out-of-time sets using only the first 27 ranked variables
var27 = var_ranking['variable'].head(27).tolist() + ['Fraud']

# Same date windows as the earlier split: modelling is (2010-01-14, 2010-11-01), OOT is the rest.
model_rows = (df.Date > '2010-01-14') & (df.Date < '2010-11-01')
oot_rows = df.Date >= '2010-11-01'
df_model = df.loc[model_rows, var27].copy()
df_oot = df.loc[oot_rows, var27].copy()

# Everything except the 'Fraud' label is a feature.
feature_mask = df_model.columns != 'Fraud'
X, y = df_model.loc[:, feature_mask], df_model['Fraud']
X_oot, y_oot = df_oot.loc[:, feature_mask], df_oot['Fraud']

In [23]:
# Hyperparameter grid search for the random forest (re-run on the current X / y)
tree = [400]           # n_estimators; Towards Data Science guide suggests 200~2000
features = [10,15,20]  # max_features; must be less than the number of variables (original choice: [5,10,15,20])
depth = [10,50]        # max_depth; Towards Data Science guide suggests 10~110 (original choice: [10,20,50,100])

maximum = 0   # best average test FDR seen so far
params = []   # every [n_estimators, max_features, max_depth] combination tried
fdr_all = []  # per-iteration [train, test, oot] FDRs for each combination
fdr_mean = [] # averaged [train, test, oot] FDRs for each combination

grid = [(n_trees, n_feat, max_d) for n_trees in tree for n_feat in features for max_d in depth]
for n_trees, n_feat, max_d in grid:
    per_iteration, averages = randomforestFDR(
        X, X_oot, y, y_oot,
        iteration=10, n_estimators=n_trees, max_features=n_feat, max_depth=max_d,
    )
    params.append([n_trees, n_feat, max_d])
    fdr_mean.append(averages)
    fdr_all.append(per_iteration)

    # The winner is chosen on the average *test* FDR (index 1), not on OOT.
    test_fdr = averages[1]
    if test_fdr > maximum:
        maximum = test_fdr
        maxfdr = averages
        maxparams = [n_trees, n_feat, max_d]

print(f'n_estimators: {maxparams[0]}, max_features: {maxparams[1]}, max_depth: {maxparams[2]} with fdr: {maximum} on the test data')

n_estimators: 400, max_features: 10, max_depth: 50 with fdr: 0.860380125856787 on the test data


In [24]:
# Summary table: one row per parameter combination with its averaged [train, test, oot] FDR.
output27 = pd.DataFrame({'params': params, 'fdr': fdr_mean})

# Per-iteration table: three columns per parameter combination, placed side by side.
# Only the first column of each triple carries the parameter label.
per_param_frames = [
    pd.DataFrame(iteration_fdrs, columns=[str(combo) + ': train', 'test', 'oot'])
    for combo, iteration_fdrs in zip(params, fdr_all)
]
output27_alliter = pd.concat(per_param_frames, axis=1)

output27.to_csv('output27.csv')
output27_alliter.to_csv('output27_alliter.csv')

In [27]:
# Averaged [train, test, oot] FDR per parameter combination, from whichever grid-search cell ran last.
fdr_mean

[[0.8568243952394253, 0.8265377470985582, 0.5681564245810055],
 [0.9949284449155729, 0.860380125856787, 0.5642458100558658],
 [0.8476963408674489, 0.8115422224363791, 0.553072625698324],
 [0.9963124669145677, 0.8550837336882331, 0.5418994413407822],
 [0.8405847438742657, 0.8128461884799887, 0.5452513966480448],
 [0.9964621509249463, 0.8500510988847398, 0.5234636871508381]]

In [29]:
# Report the total wall-clock runtime since `start_time` was recorded.
# `pd.datetime` was only a deprecated alias of the stdlib class and no longer
# exists in pandas 2.0+, so use the `datetime` imported in the first cell.
end_time = datetime.now()
print(end_time - start_time)

0:04:33.075687


  """Entry point for launching an IPython kernel.
