In [1]:
%load_ext autoreload
%autoreload 2

## Requirement
* The dataset can be downloaded from [this Kaggle competition](https://www.kaggle.com/c/ieee-fraud-detection).
* In addition to the [Anaconda](https://www.anaconda.com) libraries, you need to install `category_encoders`, `selenium`, `geckodriver` and `scikit-learn` version 0.24 or higher.
* You also need to set up an AWS account and install `awscli` and `sagemaker-python-sdk`.

In [2]:
import bokeh
import os
import warnings
import boto3
import sagemaker
import numpy as np
import pandas as pd
from bokeh.io import export_png, export_svgs
from bokeh.layouts import gridplot
from bokeh.models import Band, ColumnDataSource, HoverTool, NumeralTickFormatter
from bokeh.plotting import figure, show
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import (accuracy_score, f1_score, precision_score, recall_score, 
                             average_precision_score, precision_recall_curve, roc_auc_score, roc_curve)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder
from category_encoders import TargetEncoder
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.tuner import CategoricalParameter, ContinuousParameter, IntegerParameter, HyperparameterTuner
warnings.filterwarnings(action='ignore')
bokeh.io.output_notebook()

In [None]:
def get_pred(score, thr=0.5):
    return np.where(score >= thr, 1, 0)


def is_number(x):
    try:
        float(x)
        return 1
    except:
        return 0 


def make_dirs(path):
    if not os.path.exists(path):
        os.makedirs(path)
    

def str_to_int(x):
    return x if pd.isnull(x) else str(int(x))

#### Data Loading from Local Directory
The Kaggle dataset was saved in the local directory `~/Data/ieee-fraud-detection` in advance.

In [None]:
RAW_DATA_PATH = '../../Data/ieee-fraud-detection'

In [None]:
train_identity = pd.read_csv(os.path.join(RAW_DATA_PATH, 'train_identity.csv'))
train_transaction = pd.read_csv(os.path.join(RAW_DATA_PATH, 'train_transaction.csv'))
df_train = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')

In [None]:
cat_features = pd.Index(
    ['ProductCD', 'addr1', 'addr2', 'P_emaildomain', 'R_emaildomain', 'DeviceType', 'DeviceInfo'] + [
        f'card{i}' for i in range(1, 7)] + [f'M{i}' for i in range(1, 10)] + [f'id_{i}' for i in range(12, 39)])
num_features = df_train.columns.difference(pd.Index(['TransactionID', 'TransactionDT', 'isFraud']) | cat_features)
all_features = cat_features | num_features

int_cat_features =  df_train[cat_features].select_dtypes('number').columns
df_train[int_cat_features] = df_train[int_cat_features].applymap(str_to_int)
df_train[cat_features] = df_train[cat_features].astype('str')

# Data Splitting and Preprocessing

In [None]:
df_X_train, df_X_test, df_y_train, df_y_test = train_test_split(
    df_train[all_features], df_train['isFraud'], test_size=0.1, random_state=42, stratify=df_train['isFraud'])

df_X_train, df_X_valid, df_y_train, df_y_valid = train_test_split(
    df_X_train, df_y_train, test_size=0.15, random_state=42, stratify=df_y_train)

In [None]:
cat_pipeline = make_pipeline(SimpleImputer(strategy='constant', fill_value='<unknown>'), 
                             OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), 
                             TargetEncoder(min_samples_leaf=1, smoothing=1.0))
num_pipeline = SimpleImputer(strategy='median')
processor = make_column_transformer((cat_pipeline, cat_features), (num_pipeline, num_features))

X_train = processor.fit_transform(df_X_train, df_y_train)
X_valid = processor.transform(df_X_valid)
X_test = processor.transform(df_X_test)

dtrain = np.concatenate((df_y_train.values.reshape(-1, 1), X_train), axis=1)
dvalid = np.concatenate((df_y_valid.values.reshape(-1, 1), X_valid), axis=1)
dtest = X_test

In [None]:
prc_data_path = './prc_data'
dir_names = ['train', 'valid', 'test']
file_names = ['dtrain', 'dvalid', 'dtest']

for dir_name in dir_names:
    make_dirs(os.path.join(prc_data_path, dir_name))

for dir_name, file_name, dataset in zip(dir_names, file_names, [dtrain, dvalid, dtest]):
    np.savetxt(os.path.join(prc_data_path, dir_name, file_name) + '.csv', dataset, delimiter=',', fmt='%i')

# Hyperparameter Tuning
#### Uploading Datasets to S3 Bucket

In [None]:
sagemaker_session = sagemaker.session.Session()
BUCKET = sagemaker_session.default_bucket()
BASE_JOB_PREFIX = 'ieee-fraud-detection'

region = boto3.Session().region_name
role = sagemaker.get_execution_role()

In [None]:
%%time
s3_client = boto3.client('s3')

for dir_name, file_name in zip(dir_names, file_names):
    s3_client.upload_file(
        os.path.join(prc_data_path, dir_name, file_name) + '.csv', BUCKET, BASE_JOB_PREFIX + '/' + dir_name + '/' + file_name + '.csv')

## Defining Built-in Algorithm XGBoost Estimator

In [None]:
image_uri = sagemaker.image_uris.retrieve(
    framework='xgboost',
    region=region,
    version='1.2-1',
    py_version='py3',
    instance_type='ml.m5.2xlarge'
)
model_output_uri = f's3://{BUCKET}/{BASE_JOB_PREFIX}/models'

estimator = Estimator(
    image_uri=image_uri,
    role=role,
    instance_count=1,
    instance_type='ml.m5.2xlarge',
    output_path=model_output_uri,
    use_spot_instances=False,
    max_wait=None
)

scale_pos_weight = float(df_y_train.shape[0] / df_y_train.sum() - 1.0)
params = {
    'booster': 'gbtree',
    'verbosity': 0
    'objective': 'binary:logistic',
    'scale_pos_weight': scale_pos_weight,
    'seed': 42,
    'eval_metric': 'auc',
    'num_round': 1000,
    'early_stopping_rounds': 10
}
estimator.set_hyperparameters(**params)

## Defining and Fitting HyperparameterTuner

In [4]:
hyperparameter_ranges = {
    'max_depth': IntegerParameter(1, 30, scailing_type='Auto'),
    'eta': ContinuousParameter(0.01, 1.0, scailing_type='Auto'),
    'gamma': ContinuousParameter(0.0, 1.0, scailing_type='Auto'),
    'min_child_weight': ContinuousParameter(1e-06, 1.0, scailing_type='Auto'),
    'subsample': ContinuousParameter(0.1, 1.0, scailing_type='Auto'),
    'colsample_bytree': ContinuousParameter(0.1, 1.0, scailing_type='Auto')
}

tuner = HyperparameterTuner(
    estimator,
    'validation:auc',
    hyperparameter_ranges,
    objective_type='Maximize',
    max_jobs=30,
    max_parallel_jobs=3,
    base_tuning_job_name=f'{BASE_JOB_PREFIX}-xgb-hpo',
    early_stopping_type='Auto'
)

[autoreload of numpy.core.overrides failed: Traceback (most recent call last):
  File "/opt/anaconda3/envs/sagemaker-pipelines/lib/python3.8/site-packages/IPython/extensions/autoreload.py", line 245, in check
    superreload(m, reload, self.old_objects)
  File "/opt/anaconda3/envs/sagemaker-pipelines/lib/python3.8/site-packages/IPython/extensions/autoreload.py", line 394, in superreload
    module = reload(module)
  File "/opt/anaconda3/envs/sagemaker-pipelines/lib/python3.8/imp.py", line 314, in reload
    return importlib.reload(module)
  File "/opt/anaconda3/envs/sagemaker-pipelines/lib/python3.8/importlib/__init__.py", line 169, in reload
    _bootstrap._exec(spec, module)
  File "<frozen importlib._bootstrap>", line 604, in _exec
  File "<frozen importlib._bootstrap_external>", line 783, in exec_module
  File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
  File "/opt/anaconda3/envs/sagemaker-pipelines/lib/python3.8/site-packages/numpy/core/overrides.py", 

TypeError: __init__() got an unexpected keyword argument 'scailing_type'

In [None]:
%%time
train_input = TrainingInput(
    s3_data=f's3://{BUCKET}/{BASE_JOB_PREFIX}/train/', 
    content_type='text/csv'
)
valid_input = TrainingInput(
    s3_data=f's3://{BUCKET}/{BASE_JOB_PREFIX}/valid/', 
    content_type='text/csv'
)

tuner.fit(
    {
        'train': train_input, 
        'validation': valid_input
    }
)

best_estimator = tuner.best_estimator()
best_params = best_estimator.hyperparameters()
tuning_job_name = tuner.latest_tuning_job.name

## Analyzing Hyperparameter Tuning Results

In [None]:
tuning_job_analytics = sagemaker.HyperparameterTuningJobAnalytics(tuning_job_name)
df_viz = tuning_job_analytics.dataframe()

image_path = './img'
make_dirs(image_path)

In [None]:
df_viz.sort_values('FinalObjectiveValue', ascending=False)[:10]

In [None]:
class HoverHelper():
    def __init__(self, tuning_job_analytics):
        self.tuning_job_analytics = tuning_job_analytics

    def hovertool(self):
        tooltips = [
            ('TrainingJobName', '@TrainingJobName'),
            ('FinalObjectiveValue', '@FinalObjectiveValue')
        ]
    
        for key in self.tuning_job_analytics.tuning_ranges.keys():
            tooltips.append((key, '@{%s}' % key) )

        hover_tool = HoverTool(tooltips=tooltips)
        return hover_tool

    def tools(self, standard_tools='pan, crosshair, wheel_zoom, zoom_in, zoom_out, undo, reset'):
        return [self.hovertool(), standard_tools]
    
    
def make_grid(figures, n_cols):
    rows = []
    for i, figure in enumerate(figures):
        if i % n_cols == 0:
            cols = []
        elif (i % n_cols == n_cols - 1) or (i == len(figures) - 1):
            rows.append(cols)
        cols.append(figure)
    return rows

In [None]:
hover_helper = HoverHelper(tuning_job_analytics)

p = figure(plot_width=800, plot_height=400, tools=hover_helper.tools(), 
           title='Convergence Plot', x_axis_type='datetime', x_axis_label='Training Start Time', y_axis_label='AUROC')
_ = p.line(source=df_viz, x='TrainingStartTime', y='FinalObjectiveValue', color='coral', line_width=1.5)
_ = p.circle(source=df_viz, x='TrainingStartTime', y='FinalObjectiveValue', line_color='coral', line_width=1.5, fill_color='white')

p.title.align = 'center'
p.title.text_font_size = '11pt'
p.xgrid.grid_line_color = None
p.yaxis.formatter = NumeralTickFormatter(format='0.0%')

show(p)

p.output_backend = 'svg'
_ = export_svgs(p, filename=f'{image_path}/convergence_plot.svg')

In [None]:
df_viz = df_viz.reset_index()
df_viz['index'] = (df_viz['index'] + df_viz['index'].min()) / (df_viz['index'].max() - df_viz['index'].min())

figures = []
for param_name, param_range in tuning_job_analytics.tuning_ranges.items():
    categorical_args = dict()
    if param_range.get('Values'):          
        values = param_range['Values']
        if sum([is_number(x) for x in values]) == len(values):
            print('Hyperparameter %s is tuned as categorical, but all values are numeric.' % param_name)
        else:
            categorical_args['x_range'] = values

    plot = figure(plot_width=400, plot_height=400, tools=hover_helper.tools(), 
                  x_axis_label=param_name, y_axis_label='AUROC', **categorical_args)
    plot.circle(source=df_viz, x=param_name, y='FinalObjectiveValue', color='black', alpha='index')
    plot.xgrid.grid_line_color = None
    plot.yaxis.formatter = NumeralTickFormatter(format='0.0%')
    figures.append(plot)

p = gridplot(make_grid(figures, 3), toolbar_location='right')

show(p)

_ = export_png(p, filename=f'{image_path}/partial_dependence_plot.png')

# Model Evaluation
## Defining Transformer and Prediction

In [None]:
%%time
transformer = best_estimator.transformer(
    instance_count=1, 
    instance_type='ml.m5.2xlarge', 
    output_path=f's3://{BUCKET}/{BASE_JOB_PREFIX}/pred'
)

_ = transformer.transform(
    data=f's3://{BUCKET}/{BASE_JOB_PREFIX}/test/',
    content_type='text/csv', 
    split_type='Line'
)

#### Downloading Prediction Scores to Local Directory

In [None]:
s3_client.download_file(BUCKET, f'{BASE_JOB_PREFIX}/pred/dtest.csv.out', os.path.join(prc_data_path, 'test', 'dtest.csv.out'))
scores = pd.read_csv(os.path.join(prc_data_path, 'test', 'dtest.csv.out'), header=None).values

## Measuring Predictive Performance

In [None]:
preds = get_pred(scores)

print('ACCURACY: {0:.2%}, PRECISION: {1:.2%}, RECALL: {2:.2%}, F1: {3:.2%}'.format(
    accuracy_score(df_y_test, preds), precision_score(df_y_test, preds), recall_score(df_y_test, preds), f1_score(df_y_test, preds)))

In [None]:
source = pd.DataFrame(roc_curve(df_y_test, scores), index=['fpr', 'tpr', 'thr']).T

p = figure(plot_height=400, title='ROC Curve (AUROC {:.2%})'.format(roc_auc_score(df_y_test, scores)), 
           x_axis_label='False Positive Rate', y_axis_label='True Positive Rate')
_ = p.line(source=source, x='fpr', y='tpr', color='coral', line_width=1.5)
_ = p.line(source=source, x='fpr', y='fpr', color='black', line_dash='dashed')

p.title.align = 'center'
p.title.text_font_size = '11pt'
p.xgrid.grid_line_color = None
p.xaxis.formatter = NumeralTickFormatter(format='0.0%')
p.yaxis.formatter = NumeralTickFormatter(format='0.0%')

show(p)

p.output_backend = 'svg'
_ = export_svgs(p, filename=f'{image_path}/roc_curve.svg')

In [None]:
source = pd.DataFrame(precision_recall_curve(df_y_test, scores), index=['recall', 'precision', 'thr']).T

p = figure(plot_height=400, title='Precision - Recall Curve (AUPRC {:.2%})'.format(average_precision_score(df_y_test, scores)), 
           x_axis_label='Recall', y_axis_label='Precision')
_ = p.line(source=source, x='recall', y='precision', color='coral', line_width=1.0)
band = Band(source=ColumnDataSource(data=dict(recall=source['recall'], precision=source['precision'])), 
            base='recall', upper='precision', level='underlay', fill_color='coral', fill_alpha=0.2)
p.add_layout(band)

p.title.align = 'center'
p.title.text_font_size = '11pt'
p.xgrid.grid_line_color = None
p.xaxis.formatter = NumeralTickFormatter(format='0.0%')
p.yaxis.formatter = NumeralTickFormatter(format='0.0%')

show(p)

p.output_backend = 'svg'
_ = export_svgs(p, filename=f'{image_path}/pr_curve.svg')

# Model Retraining
### Data Loading, Splitting, Preprocessing and Uploading

In [None]:
test_identity = pd.read_csv(os.path.join(RAW_DATA_PATH, 'test_identity.csv')) 
test_transaction = pd.read_csv(os.path.join(RAW_DATA_PATH, 'test_transaction.csv'))
df_test = pd.merge(test_transaction, test_identity, on='TransactionID', how='left')
df_test = df_test.rename(columns={'id-{:02d}'.format(i): 'id_{:02d}'.format(i) for i in range(1, 39)})

df_test[int_cat_features] = df_test[int_cat_features].applymap(str_to_int)
df_test[cat_features] = df_test[cat_features].astype('str')

In [None]:
df_X_retrain, df_X_revalid, df_y_retrain, df_y_revalid = train_test_split(
    df_train[all_features], df_train['isFraud'], test_size=0.15, random_state=42, stratify=df_train['isFraud'])

X_retrain = processor.fit_transform(df_X_retrain, df_y_retrain)
X_revalid = processor.transform(df_X_revalid)
X_retest = processor.transform(df_test[all_features])

dtrain = np.concatenate((df_y_retrain.values.reshape(-1, 1), X_retrain), axis=1)
dvalid = np.concatenate((df_y_revalid.values.reshape(-1, 1), X_revalid), axis=1)
dtest = X_retest

In [None]:
dir_names = ['retrain', 'revalid', 'retest']

for dir_name in dir_names:
    make_dirs(os.path.join(prc_data_path, dir_name))

for dir_name, file_name, dataset in zip(dir_names, file_names, [dtrain, dvalid, dtest]):
    np.savetxt(os.path.join(prc_data_path, dir_name, file_name) + '.csv', dataset, delimiter=',', fmt='%i')

In [None]:
%%time
for dir_name, file_name in zip(dir_names, file_names):
    s3_client.upload_file(
        os.path.join(prc_data_path, dir_name, file_name) + '.csv', BUCKET, BASE_JOB_PREFIX + '/' + dir_name + '/' + file_name + '.csv')

## Defining and Fitting Estimator

In [None]:
full_estimator = Estimator(
    image_uri=image_uri,
    role=role,
    instance_count=1,
    instance_type='ml.m5.2xlarge',
    output_path=model_output_uri,
    use_spot_instances=False,
    max_wait=None
)

best_params = best_estimator.hyperparameters()
_ = best_params.pop('_tuning_objective_metric')
params.update(best_params)
full_estimator.set_hyperparameters(**params)

In [None]:
%%time
retrain_input = TrainingInput(
    s3_data=f's3://{BUCKET}/{BASE_JOB_PREFIX}/retrain/', 
    content_type='text/csv'
)
revalid_input = TrainingInput(
    s3_data=f's3://{BUCKET}/{BASE_JOB_PREFIX}/revalid/', 
    content_type='text/csv'
)

full_estimator.fit(
    {
    'train': retrain_input,
    'validation': revalid_input
    }
)

## Defining Transformer and Prediction

In [None]:
%%time
full_transformer = full_estimator.transformer(
    instance_count=1, 
    instance_type='ml.m5.2xlarge', 
    output_path=f's3://{BUCKET}/{BASE_JOB_PREFIX}/repred'
)

_ = full_transformer.transform(
    data=f's3://{BUCKET}/{BASE_JOB_PREFIX}/retest/',
    content_type='text/csv', 
    split_type='Line'
)

#### Downloading Prediction Scores to Local Directory

In [None]:
s3_client.download_file(BUCKET, f'{BASE_JOB_PREFIX}/repred/dtest.csv.out', os.path.join(prc_data_path, 'retest', 'dtest.csv.out'))
scores = pd.read_csv(os.path.join(prc_data_path, 'retest', 'dtest.csv.out'), header=None).values

In [None]:
submission = pd.DataFrame({'TransactionID': df_test['TransactionID'].values, 'isFraud': scores.flatten()})
submission.to_csv('./submission.csv', index=False)