In [1]:
%load_ext autoreload
%autoreload 2

## Requirements
* The dataset can be downloaded from [this Kaggle competition](https://www.kaggle.com/c/ieee-fraud-detection).
* In addition to the [Anaconda](https://www.anaconda.com) libraries, you need to install `category_encoders`, `selenium`, `geckodriver` and `scikit-learn` version 0.24 or higher.
* You also need to set up an AWS account and install `awscli` and `sagemaker-python-sdk`.

In [2]:
import bokeh
import os
import warnings
import boto3
import sagemaker
import numpy as np
import pandas as pd
from bokeh.io import export_png, export_svgs, output_notebook
from bokeh.layouts import gridplot
from bokeh.models import Band, ColumnDataSource, HoverTool, NumeralTickFormatter
from bokeh.plotting import figure, show
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import (accuracy_score, confusion_matrix, f1_score, precision_score, recall_score, 
                             average_precision_score, precision_recall_curve, roc_auc_score, roc_curve)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder
from category_encoders import TargetEncoder
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.tuner import CategoricalParameter, ContinuousParameter, IntegerParameter, HyperparameterTuner
from utils.measuring_performance import *
warnings.filterwarnings(action='ignore')
output_notebook()

In [3]:
def get_prediction(score, thr=0.5):
    return np.where(score >= thr, 1, 0)


def is_number(x):
    try:
        float(x)
        return 1
    except:
        return 0 
    

def str_to_int(x):
    return x if pd.isnull(x) else str(int(x))

#### Data Loading from Local Directory
The Kaggle dataset was saved in the local directory `~/Data/ieee-fraud-detection` in advance.

In [4]:
RAW_DATA_PATH = '../../data/ieee-fraud-detection'

In [5]:
train_identity = pd.read_csv(os.path.join(RAW_DATA_PATH, 'train_identity.csv'))
train_transaction = pd.read_csv(os.path.join(RAW_DATA_PATH, 'train_transaction.csv'))
df_train = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')

In [6]:
cat_features = pd.Index(
    ['ProductCD', 'addr1', 'addr2', 'P_emaildomain', 'R_emaildomain', 'DeviceType', 'DeviceInfo'] + [
        f'card{i}' for i in range(1, 7)] + [f'M{i}' for i in range(1, 10)] + [f'id_{i}' for i in range(12, 39)])
num_features = df_train.columns.difference(pd.Index(['TransactionID', 'TransactionDT', 'isFraud']) | cat_features)
all_features = cat_features | num_features

int_cat_features =  df_train[cat_features].select_dtypes('number').columns
df_train[int_cat_features] = df_train[int_cat_features].applymap(str_to_int)
df_train[cat_features] = df_train[cat_features].astype('str')

# Data Splitting and Preprocessing

In [7]:
df_X_train, df_X_test, df_y_train, df_y_test = train_test_split(
    df_train[all_features], df_train['isFraud'], test_size=0.1, random_state=42, stratify=df_train['isFraud'])

df_X_train, df_X_valid, df_y_train, df_y_valid = train_test_split(
    df_X_train, df_y_train, test_size=0.15, random_state=42, stratify=df_y_train)

In [8]:
cat_pipeline = make_pipeline(SimpleImputer(strategy='constant', fill_value='<unknown>'), 
                             OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), 
                             TargetEncoder(min_samples_leaf=1, smoothing=1.0))
num_pipeline = SimpleImputer(strategy='median')
processor = make_column_transformer((cat_pipeline, cat_features), (num_pipeline, num_features))

X_train = processor.fit_transform(df_X_train, df_y_train)
X_valid = processor.transform(df_X_valid)
X_test = processor.transform(df_X_test)

dtrain = np.concatenate((df_y_train.values.reshape(-1, 1), X_train), axis=1)
dvalid = np.concatenate((df_y_valid.values.reshape(-1, 1), X_valid), axis=1)
dtest = X_test

In [9]:
proc_data_path = './proc_data'
dir_names = ['train', 'valid', 'test']
file_names = ['dtrain', 'dvalid', 'dtest']

for dir_name in dir_names:
    os.makedirs(os.path.join(proc_data_path, dir_name), exist_ok=True)

for dir_name, file_name, dataset in zip(dir_names, file_names, [dtrain, dvalid, dtest]):
    np.savetxt(os.path.join(proc_data_path, dir_name, file_name) + '.csv', dataset, delimiter=',', fmt='%i')

# Hyperparameter Tuning
#### Uploading Datasets to S3 Bucket

In [10]:
sagemaker_session = sagemaker.session.Session()
BUCKET = sagemaker_session.default_bucket()
BASE_JOB_PREFIX = 'ieee-fraud-detection'

region = boto3.Session().region_name
role = 'AmazonSageMaker-ExecutionRole-20210114T163887' # sagemaker.get_execution_role()

In [11]:
%%time
s3_client = boto3.client('s3')

for dir_name, file_name in zip(dir_names, file_names):
    s3_client.upload_file(
        os.path.join(proc_data_path, dir_name, file_name) + '.csv', BUCKET, BASE_JOB_PREFIX + '/' + dir_name + '/' + file_name + '.csv')

CPU times: user 4.82 s, sys: 4.5 s, total: 9.32 s
Wall time: 3min 11s


## Defining Built-in Algorithm XGBoost Estimator

In [12]:
image_uri = sagemaker.image_uris.retrieve(
    framework='xgboost',
    region=region,
    version='1.2-1',
    py_version='py3',
    instance_type='ml.m5.2xlarge'
)
model_output_uri = f's3://{BUCKET}/{BASE_JOB_PREFIX}/models'

estimator = Estimator(
    image_uri=image_uri,
    role=role,
    instance_count=1,
    instance_type='ml.m5.2xlarge',
    output_path=model_output_uri,
    use_spot_instances=False,
    max_wait=None
)

scale_pos_weight = float(df_y_train.shape[0] / df_y_train.sum() - 1.0)
params = {
    'booster': 'gbtree',
    'verbosity': 0,
    'objective': 'binary:logistic',
    'scale_pos_weight': scale_pos_weight,
    'seed': 42,
    'eval_metric': 'auc',
    'num_round': 1000,
    'early_stopping_rounds': 10
}
estimator.set_hyperparameters(**params)

## Defining and Fitting HyperparameterTuner

In [13]:
hyperparameter_ranges = {
    'max_depth': IntegerParameter(1, 30, scaling_type='Auto'),
    'eta': ContinuousParameter(0.01, 1.0, scaling_type='Auto'),
    'gamma': ContinuousParameter(0.0, 1.0, scaling_type='Auto'),
    'min_child_weight': ContinuousParameter(1e-06, 1.0, scaling_type='Auto'),
    'subsample': ContinuousParameter(0.1, 1.0, scaling_type='Auto'),
    'colsample_bytree': ContinuousParameter(0.1, 1.0, scaling_type='Auto')
}

tuner = HyperparameterTuner(
    estimator,
    'validation:auc',
    hyperparameter_ranges,
    objective_type='Maximize',
    max_jobs=30,
    max_parallel_jobs=3,
    base_tuning_job_name=f'{BASE_JOB_PREFIX}-xgb-hpo',
    early_stopping_type='Auto'
)

In [14]:
%%time
train_input = TrainingInput(
    s3_data=f's3://{BUCKET}/{BASE_JOB_PREFIX}/train/', 
    content_type='text/csv'
)
valid_input = TrainingInput(
    s3_data=f's3://{BUCKET}/{BASE_JOB_PREFIX}/valid/', 
    content_type='text/csv'
)

tuner.fit(
    {
        'train': train_input, 
        'validation': valid_input
    }
)

best_estimator = tuner.best_estimator()
best_params = best_estimator.hyperparameters()
tuning_job_name = tuner.latest_tuning_job.name

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

## Analyzing Hyperparameter Tuning Results

In [15]:
tuning_job_analytics = sagemaker.HyperparameterTuningJobAnalytics(tuning_job_name)
df_viz = tuning_job_analytics.dataframe()

image_path = './img'
os.makedirs(image_path, exist_ok=True)

In [16]:
df_viz.sort_values('FinalObjectiveValue', ascending=False)[:10]

Unnamed: 0,colsample_bytree,eta,gamma,max_depth,min_child_weight,subsample,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
8,0.183316,0.033077,0.221075,22.0,0.052186,0.928623,ieee-fraud-detection-210527-1511-022-59bd86f7,Completed,0.97055,2021-05-27 18:23:13+09:00,2021-05-27 18:54:55+09:00,1902.0
12,0.311646,0.014117,0.180659,23.0,0.019902,0.85235,ieee-fraud-detection-210527-1511-018-e7f1d290,Completed,0.97025,2021-05-27 17:02:29+09:00,2021-05-27 17:54:48+09:00,3139.0
0,0.671915,0.068256,0.689642,29.0,3.4e-05,0.956271,ieee-fraud-detection-210527-1511-030-946280ee,Completed,0.96931,2021-05-27 19:16:55+09:00,2021-05-27 19:49:09+09:00,1934.0
14,0.908726,0.01049,0.958521,30.0,2e-06,0.816013,ieee-fraud-detection-210527-1511-016-7108eb5c,Completed,0.96921,2021-05-27 16:11:25+09:00,2021-05-27 18:39:15+09:00,8870.0
3,0.739224,0.015058,0.455938,26.0,0.000336,1.0,ieee-fraud-detection-210527-1511-027-8b1f9ad9,Completed,0.96914,2021-05-27 18:57:52+09:00,2021-05-27 20:41:12+09:00,6200.0
17,0.253656,0.080459,0.781733,26.0,6.2e-05,1.0,ieee-fraud-detection-210527-1511-013-f891cf48,Completed,0.96896,2021-05-27 15:49:49+09:00,2021-05-27 16:02:00+09:00,731.0
4,0.267526,0.13605,0.48495,25.0,0.007164,0.850675,ieee-fraud-detection-210527-1511-026-590ec93a,Completed,0.96775,2021-05-27 18:47:41+09:00,2021-05-27 19:00:38+09:00,777.0
13,0.934571,0.030495,0.879614,30.0,7.1e-05,0.995392,ieee-fraud-detection-210527-1511-017-8b849779,Completed,0.96706,2021-05-27 16:11:32+09:00,2021-05-27 18:09:39+09:00,7087.0
16,0.181833,0.171879,0.474696,30.0,0.001084,0.960098,ieee-fraud-detection-210527-1511-014-82dbb1c8,Completed,0.96666,2021-05-27 15:58:42+09:00,2021-05-27 16:08:24+09:00,582.0
18,0.916571,0.033437,0.866355,30.0,7.1e-05,0.995392,ieee-fraud-detection-210527-1511-012-6c787ae2,Completed,0.96602,2021-05-27 15:42:38+09:00,2021-05-27 16:55:08+09:00,4350.0


In [17]:
class HoverHelper():
    def __init__(self, tuning_job_analytics):
        self.tuning_job_analytics = tuning_job_analytics

    def hovertool(self):
        tooltips = [
            ('TrainingJobName', '@TrainingJobName'),
            ('FinalObjectiveValue', '@FinalObjectiveValue')
        ]
    
        for key in self.tuning_job_analytics.tuning_ranges.keys():
            tooltips.append((key, '@{%s}' % key) )

        hover_tool = HoverTool(tooltips=tooltips)
        return hover_tool

    def tools(self, standard_tools='pan, crosshair, wheel_zoom, zoom_in, zoom_out, undo, reset'):
        return [self.hovertool(), standard_tools]
    
    
def make_grid(figures, n_cols):
    rows = []
    for i, figure in enumerate(figures):
        if i % n_cols == 0:
            cols = []
        elif (i % n_cols == n_cols - 1) or (i == len(figures) - 1):
            rows.append(cols)
        cols.append(figure)
    return rows

In [18]:
hover_helper = HoverHelper(tuning_job_analytics)

p = figure(plot_width=800, plot_height=400, tools=hover_helper.tools(), 
           title='Convergence Plot', x_axis_type='datetime', x_axis_label='Training Start Time', y_axis_label='AUROC')
_ = p.line(x='TrainingStartTime', y='FinalObjectiveValue', color='coral', line_width=1.5, source=df_viz)
_ = p.circle(x='TrainingStartTime', y='FinalObjectiveValue', line_color='coral', line_width=1.5, fill_color='white', source=df_viz)

p.xgrid.grid_line_color = None
p.yaxis.formatter = NumeralTickFormatter(format='0.0%')
p.title.align = 'center'
p.title.text_font_size = '12pt'

show(p)

p.output_backend = 'svg'
_ = export_svgs(p, filename=f'{image_path}/convergence_plot.svg')

In [19]:
df_viz = df_viz.reset_index()
df_viz['index'] = (df_viz['index'] + df_viz['index'].min()) / (df_viz['index'].max() - df_viz['index'].min())

figures = []
for param_name, param_range in tuning_job_analytics.tuning_ranges.items():
    categorical_args = dict()
    if param_range.get('Values'):          
        values = param_range['Values']
        if sum([is_number(x) for x in values]) == len(values):
            print('Hyperparameter %s is tuned as categorical, but all values are numeric.' % param_name)
        else:
            categorical_args['x_range'] = values

    plot = figure(plot_width=400, plot_height=400, tools=hover_helper.tools(), 
                  x_axis_label=param_name, y_axis_label='AUROC', **categorical_args)
    plot.circle(source=df_viz, x=param_name, y='FinalObjectiveValue', color='black', alpha='index')
    plot.xgrid.grid_line_color = None
    plot.yaxis.formatter = NumeralTickFormatter(format='0.0%')
    figures.append(plot)

p = gridplot(make_grid(figures, 3), toolbar_location='right')

show(p)

_ = export_png(p, filename=f'{image_path}/partial_dependence_plot.png')

# Model Evaluation
## Defining Transformer and Prediction

In [20]:
%%time
transformer = best_estimator.transformer(
    instance_count=1, 
    instance_type='ml.m5.2xlarge', 
    output_path=f's3://{BUCKET}/{BASE_JOB_PREFIX}/pred'
)

_ = transformer.transform(
    data=f's3://{BUCKET}/{BASE_JOB_PREFIX}/test/',
    content_type='text/csv', 
    split_type='Line'
)

.........................[34m[2021-05-27:11:49:31:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2021-05-27:11:49:31:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2021-05-27:11:49:31:INFO] nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;
[0m
[34mworker_rlimit_nofile 4096;
[0m
[34mevents {
  worker_connections 2048;[0m
[34m}
[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;

  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }

  server {
    listen 8080 deferred;
    client_max_body_size 0;

    keepalive_timeout 3;

    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
      proxy_set_header Host $http_host;
      proxy_redirect off;
      proxy_read_timeout 60s;
      proxy_pass http://gunicorn;
    }

    loca

#### Downloading Prediction Scores to Local Directory

In [21]:
s3_client.download_file(BUCKET, f'{BASE_JOB_PREFIX}/pred/dtest.csv.out', os.path.join(proc_data_path, 'test', 'dtest.csv.out'))
scores = pd.read_csv(os.path.join(proc_data_path, 'test', 'dtest.csv.out'), header=None).values

## Measuring Predictive Performance

In [22]:
predictions = get_prediction(scores)

plot_confusion_matrix(
    confusion_matrix(df_y_test, predictions), 
    file_name=os.path.join(image_path, 'conf_mat.svg'))

print('ACCURACY: {0:.2%}, PRECISION: {1:.2%}, RECALL: {2:.2%}, F1: {3:.2%}'.format(
    accuracy_score(df_y_test, predictions), precision_score(df_y_test, predictions), recall_score(df_y_test, predictions), f1_score(df_y_test, predictions)))

ACCURACY: 98.62%, PRECISION: 92.58%, RECALL: 65.83%, F1: 76.94%


In [23]:
plot_roc_curve(
    roc_curve(df_y_test, scores), roc_auc_score(df_y_test, scores), 
    file_name=os.path.join(image_path, 'roc_curve.svg'))

In [24]:
plot_pr_curve(
    precision_recall_curve(df_y_test, scores), average_precision_score(df_y_test, scores), 
    file_name=os.path.join(image_path, 'pr_curve.svg'))

# Model Retraining
### Data Loading, Splitting, Preprocessing and Uploading

In [25]:
test_identity = pd.read_csv(os.path.join(RAW_DATA_PATH, 'test_identity.csv')) 
test_transaction = pd.read_csv(os.path.join(RAW_DATA_PATH, 'test_transaction.csv'))
df_test = pd.merge(test_transaction, test_identity, on='TransactionID', how='left')
df_test = df_test.rename(columns={'id-{:02d}'.format(i): 'id_{:02d}'.format(i) for i in range(1, 39)})

df_test[int_cat_features] = df_test[int_cat_features].applymap(str_to_int)
df_test[cat_features] = df_test[cat_features].astype('str')

In [26]:
df_X_retrain, df_X_revalid, df_y_retrain, df_y_revalid = train_test_split(
    df_train[all_features], df_train['isFraud'], test_size=0.15, random_state=42, stratify=df_train['isFraud'])

X_retrain = processor.fit_transform(df_X_retrain, df_y_retrain)
X_revalid = processor.transform(df_X_revalid)
X_retest = processor.transform(df_test[all_features])

dtrain = np.concatenate((df_y_retrain.values.reshape(-1, 1), X_retrain), axis=1)
dvalid = np.concatenate((df_y_revalid.values.reshape(-1, 1), X_revalid), axis=1)
dtest = X_retest

In [27]:
dir_names = ['retrain', 'revalid', 'retest']

for dir_name in dir_names:
    os.makedirs(os.path.join(proc_data_path, dir_name), exist_ok=True)

for dir_name, file_name, dataset in zip(dir_names, file_names, [dtrain, dvalid, dtest]):
    np.savetxt(os.path.join(proc_data_path, dir_name, file_name) + '.csv', dataset, delimiter=',', fmt='%i')

In [28]:
%%time
for dir_name, file_name in zip(dir_names, file_names):
    s3_client.upload_file(
        os.path.join(proc_data_path, dir_name, file_name) + '.csv', BUCKET, BASE_JOB_PREFIX + '/' + dir_name + '/' + file_name + '.csv')

CPU times: user 9.42 s, sys: 9.26 s, total: 18.7 s
Wall time: 5min 1s


## Defining and Fitting Estimator

In [29]:
full_estimator = Estimator(
    image_uri=image_uri,
    role=role,
    instance_count=1,
    instance_type='ml.m5.2xlarge',
    output_path=model_output_uri,
    use_spot_instances=False,
    max_wait=None
)

best_params = best_estimator.hyperparameters()
_ = best_params.pop('_tuning_objective_metric')
params.update(best_params)
full_estimator.set_hyperparameters(**params)

In [30]:
%%time
retrain_input = TrainingInput(
    s3_data=f's3://{BUCKET}/{BASE_JOB_PREFIX}/retrain/', 
    content_type='text/csv'
)
revalid_input = TrainingInput(
    s3_data=f's3://{BUCKET}/{BASE_JOB_PREFIX}/revalid/', 
    content_type='text/csv'
)

full_estimator.fit(
    {
    'train': retrain_input,
    'validation': revalid_input
    }
)

2021-05-27 11:58:46 Starting - Starting the training job...
2021-05-27 11:59:09 Starting - Launching requested ML instancesProfilerReport-1622116724: InProgress
......
2021-05-27 12:00:10 Starting - Preparing the instances for training...
2021-05-27 12:00:50 Downloading - Downloading input data...
2021-05-27 12:01:30 Training - Downloading the training image...
2021-05-27 12:01:52 Training - Training image download completed. Training in progress.[34m[2021-05-27 12:01:54.760 ip-10-2-114-208.ec2.internal:1 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter eval_metric value auc to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter booster value gbtree to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective va

## Defining Transformer and Prediction

In [31]:
%%time
full_transformer = full_estimator.transformer(
    instance_count=1, 
    instance_type='ml.m5.2xlarge', 
    output_path=f's3://{BUCKET}/{BASE_JOB_PREFIX}/repred'
)

_ = full_transformer.transform(
    data=f's3://{BUCKET}/{BASE_JOB_PREFIX}/retest/',
    content_type='text/csv', 
    split_type='Line'
)

.........................[34m[2021-05-27:12:32:49:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2021-05-27:12:32:49:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2021-05-27:12:32:49:INFO] nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;
[0m
[34mworker_rlimit_nofile 4096;
[0m
[34mevents {
  worker_connections 2048;[0m
[34m}
[0m
[35m[2021-05-27:12:32:49:INFO] No GPUs detected (normal if no gpus installed)[0m
[35m[2021-05-27:12:32:49:INFO] No GPUs detected (normal if no gpus installed)[0m
[35m[2021-05-27:12:32:49:INFO] nginx config: [0m
[35mworker_processes auto;[0m
[35mdaemon off;[0m
[35mpid /tmp/nginx.pid;[0m
[35merror_log  /dev/stderr;
[0m
[35mworker_rlimit_nofile 4096;
[0m
[35mevents {
  worker_connections 2048;[0m
[35m}
[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;


#### Downloading Prediction Scores to Local Directory

In [32]:
s3_client.download_file(BUCKET, f'{BASE_JOB_PREFIX}/repred/dtest.csv.out', os.path.join(proc_data_path, 'retest', 'dtest.csv.out'))
scores = pd.read_csv(os.path.join(proc_data_path, 'retest', 'dtest.csv.out'), header=None).values

In [33]:
submission = pd.DataFrame({'TransactionID': df_test['TransactionID'].values, 'isFraud': scores.flatten()})
submission.to_csv('./submission.csv', index=False)