In [1]:
%load_ext autoreload
%autoreload 2

## Requirements
* The dataset can be downloaded from [this Kaggle competition](https://www.kaggle.com/c/ieee-fraud-detection).
* In addition to the [Anaconda](https://www.anaconda.com) libraries, you need to install `category_encoders`, `selenium`, `geckodriver` and `scikit-learn` version 0.24 or higher.
* You also need to set up an AWS account and install `awscli` and `sagemaker-python-sdk`.

In [2]:
import os
import warnings
import bokeh
import boto3
import sagemaker
import numpy as np
import pandas as pd
from bokeh.io import export_png, export_svgs, output_notebook
from bokeh.layouts import gridplot
from bokeh.models import Band, ColumnDataSource, HoverTool, NumeralTickFormatter
from bokeh.plotting import figure, show
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    average_precision_score,
    precision_recall_curve,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder
from category_encoders import TargetEncoder
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.tuner import (
    CategoricalParameter,
    ContinuousParameter,
    IntegerParameter,
    HyperparameterTuner,
)
from utils.measuring_performance import (
    get_prediction,
    plot_confusion_matrix,
    plot_pr_curve,
    plot_roc_curve,
)

warnings.filterwarnings(action="ignore")
output_notebook()

In [3]:
def is_number(x):
    try:
        float(x)
        return 1
    except ValueError:
        return 0


def str_to_int(x):
    return x if pd.isnull(x) else str(int(x))

#### Data Loading from Local Directory
The Kaggle dataset was saved in the local directory `~/Data/ieee-fraud-detection` in advance.

In [4]:
RAW_DATA_PATH = "../../data/ieee-fraud-detection"

In [5]:
train_identity = pd.read_csv(os.path.join(RAW_DATA_PATH, "train_identity.csv"))
train_transaction = pd.read_csv(os.path.join(RAW_DATA_PATH, "train_transaction.csv"))
df_train = pd.merge(train_transaction, train_identity, on="TransactionID", how="left")

In [6]:
cat_features = pd.Index(
    [
        "ProductCD",
        "addr1",
        "addr2",
        "P_emaildomain",
        "R_emaildomain",
        "DeviceType",
        "DeviceInfo",
    ]
    + [f"card{i}" for i in range(1, 7)]
    + [f"M{i}" for i in range(1, 10)]
    + [f"id_{i}" for i in range(12, 39)]
)
num_features = df_train.columns.difference(
    pd.Index(["TransactionID", "TransactionDT", "isFraud"]) | cat_features
)
all_features = cat_features | num_features

int_cat_features = df_train[cat_features].select_dtypes("number").columns
df_train[int_cat_features] = df_train[int_cat_features].applymap(str_to_int)
df_train[cat_features] = df_train[cat_features].astype("str")

# Data Splitting and Preprocessing

In [7]:
df_X_train, df_X_test, df_y_train, df_y_test = train_test_split(
    df_train[all_features],
    df_train["isFraud"],
    test_size=0.1,
    random_state=42,
    stratify=df_train["isFraud"],
)

df_X_train, df_X_valid, df_y_train, df_y_valid = train_test_split(
    df_X_train, df_y_train, test_size=0.15, random_state=42, stratify=df_y_train
)

In [8]:
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="<unknown>"),
    OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
    TargetEncoder(min_samples_leaf=1, smoothing=1.0),
)
num_pipeline = SimpleImputer(strategy="median")
processor = make_column_transformer(
    (cat_pipeline, cat_features), (num_pipeline, num_features)
)

X_train = processor.fit_transform(df_X_train, df_y_train)
X_valid = processor.transform(df_X_valid)
X_test = processor.transform(df_X_test)

arr_train = np.concatenate((df_y_train.values.reshape(-1, 1), X_train), axis=1)
arr_valid = np.concatenate((df_y_valid.values.reshape(-1, 1), X_valid), axis=1)
arr_test = X_test

In [9]:
proc_data_path = "./proc_data"
dir_names = ["train", "valid", "test"]
file_names = ["arr_train", "arr_valid", "arr_test"]

for dir_name in dir_names:
    os.makedirs(os.path.join(proc_data_path, dir_name), exist_ok=True)

for dir_name, file_name, dataset in zip(dir_names, file_names, [arr_train, arr_valid, arr_test]):
    np.savetxt(
        os.path.join(proc_data_path, dir_name, file_name) + ".csv",
        dataset,
        delimiter=",",
        fmt="%i",
    )

# Hyperparameter Tuning
#### Uploading Datasets to S3 Bucket

In [10]:
sagemaker_session = sagemaker.session.Session()
BUCKET = sagemaker_session.default_bucket()
BASE_JOB_PREFIX = "ieee-fraud-detection"

region = boto3.Session().region_name
role = sagemaker.get_execution_role()

In [11]:
%%time
s3_client = boto3.client("s3")

for dir_name, file_name in zip(dir_names, file_names):
    s3_client.upload_file(
        os.path.join(proc_data_path, dir_name, file_name) + ".csv",
        BUCKET,
        BASE_JOB_PREFIX + "/" + dir_name + "/" + file_name + ".csv",
    )

CPU times: user 3.59 s, sys: 2.64 s, total: 6.22 s
Wall time: 6min 46s


## Defining Built-in Algorithm XGBoost Estimator

In [12]:
image_uri = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=region,
    version="1.2-1",
    py_version="py3",
    instance_type="ml.m5.2xlarge",
)
model_output_uri = f"s3://{BUCKET}/{BASE_JOB_PREFIX}/models"

estimator = Estimator(
    image_uri=image_uri,
    role=role,
    instance_count=1,
    instance_type="ml.m5.2xlarge",
    output_path=model_output_uri,
    use_spot_instances=False,
    max_wait=None,
)

scale_pos_weight = float(df_y_train.shape[0] / df_y_train.sum() - 1.0)
params = {
    "booster": "gbtree",
    "verbosity": 0,
    "objective": "binary:logistic",
    "scale_pos_weight": scale_pos_weight,
    "seed": 42,
    "eval_metric": "auc",
    "num_round": 1000,
    "early_stopping_rounds": 10,
}
estimator.set_hyperparameters(**params)

## Defining and Fitting HyperparameterTuner

In [13]:
hyperparameter_ranges = {
    "max_depth": IntegerParameter(1, 30, scaling_type="Auto"),
    "eta": ContinuousParameter(0.01, 1.0, scaling_type="Auto"),
    "gamma": ContinuousParameter(0.0, 1.0, scaling_type="Auto"),
    "min_child_weight": ContinuousParameter(1e-06, 1.0, scaling_type="Auto"),
    "subsample": ContinuousParameter(0.1, 1.0, scaling_type="Auto"),
    "colsample_bytree": ContinuousParameter(0.1, 1.0, scaling_type="Auto"),
}

tuner = HyperparameterTuner(
    estimator,
    "validation:auc",
    hyperparameter_ranges,
    objective_type="Maximize",
    max_jobs=30,
    max_parallel_jobs=3,
    base_tuning_job_name=f"{BASE_JOB_PREFIX}-xgb-hpo",
    early_stopping_type="Auto",
)

In [14]:
%%time
train_input = TrainingInput(
    s3_data=f"s3://{BUCKET}/{BASE_JOB_PREFIX}/train/", content_type="text/csv"
)
valid_input = TrainingInput(
    s3_data=f"s3://{BUCKET}/{BASE_JOB_PREFIX}/valid/", content_type="text/csv"
)

tuner.fit({"train": train_input, "validation": valid_input})

best_estimator = tuner.best_estimator()
best_params = best_estimator.hyperparameters()
tuning_job_name = tuner.latest_tuning_job.name

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

## Analyzing Hyperparameter Tuning Results

In [15]:
tuning_job_analytics = sagemaker.HyperparameterTuningJobAnalytics(tuning_job_name)
df_viz = tuning_job_analytics.dataframe()

image_path = "./img"
os.makedirs(image_path, exist_ok=True)

In [16]:
df_viz.sort_values("FinalObjectiveValue", ascending=False)[:10]

Unnamed: 0,colsample_bytree,eta,gamma,max_depth,min_child_weight,subsample,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
27,0.510828,0.025352,0.454371,20.0,0.05943,0.936084,ieee-fraud-detection-210606-1021-003-0a151ed4,Completed,0.9713,2021-06-06 10:24:23+09:00,2021-06-06 11:09:14+09:00,2691.0
0,0.646409,0.074753,0.031868,24.0,0.073002,0.853271,ieee-fraud-detection-210606-1021-030-6955a5db,Completed,0.97129,2021-06-06 13:34:28+09:00,2021-06-06 13:58:21+09:00,1433.0
12,0.552531,0.039479,0.510708,20.0,0.015405,0.970752,ieee-fraud-detection-210606-1021-018-61a69865,Completed,0.97108,2021-06-06 12:01:11+09:00,2021-06-06 12:42:51+09:00,2500.0
16,0.664811,0.015201,0.364448,27.0,0.20893,0.895661,ieee-fraud-detection-210606-1021-014-d01f2ca8,Completed,0.97082,2021-06-06 11:33:22+09:00,2021-06-06 13:05:06+09:00,5504.0
2,0.457142,0.034882,0.169552,30.0,0.000651,0.905411,ieee-fraud-detection-210606-1021-028-ea2cdd21,Completed,0.97068,2021-06-06 13:25:38+09:00,2021-06-06 14:05:48+09:00,2410.0
20,0.307865,0.044103,0.174385,30.0,0.350633,0.941418,ieee-fraud-detection-210606-1021-010-ae7d6d89,Completed,0.97061,2021-06-06 11:05:29+09:00,2021-06-06 11:30:49+09:00,1520.0
6,0.378469,0.054898,0.262703,21.0,0.465471,0.903021,ieee-fraud-detection-210606-1021-024-104d3880,Completed,0.97056,2021-06-06 12:55:47+09:00,2021-06-06 13:13:36+09:00,1069.0
1,0.638645,0.074753,0.011868,24.0,0.073002,0.853271,ieee-fraud-detection-210606-1021-029-1f4922cb,Completed,0.97052,2021-06-06 13:30:13+09:00,2021-06-06 13:52:48+09:00,1355.0
5,0.410165,0.038688,0.381782,30.0,0.775933,0.863543,ieee-fraud-detection-210606-1021-025-1c7a8831,Completed,0.97015,2021-06-06 13:01:30+09:00,2021-06-06 13:31:40+09:00,1810.0
15,0.999976,0.142617,0.568092,20.0,0.256983,0.848831,ieee-fraud-detection-210606-1021-015-94cf11eb,Completed,0.9685,2021-06-06 11:33:51+09:00,2021-06-06 11:51:16+09:00,1045.0


In [17]:
class HoverHelper:
    def __init__(self, tuning_job_analytics):
        self.tuning_job_analytics = tuning_job_analytics

    def hovertool(self):
        tooltips = [
            ("TrainingJobName", "@TrainingJobName"),
            ("FinalObjectiveValue", "@FinalObjectiveValue"),
        ]

        for key in self.tuning_job_analytics.tuning_ranges.keys():
            tooltips.append((key, f"@{key}"))

        hover_tool = HoverTool(tooltips=tooltips)
        return hover_tool

    def tools(
        self,
        standard_tools="pan, crosshair, wheel_zoom, zoom_in, zoom_out, undo, reset",
    ):
        return [self.hovertool(), standard_tools]


def make_grid(figures, n_cols):
    rows = []
    for i, figure in enumerate(figures):
        if i % n_cols == 0:
            cols = []
        elif (i % n_cols == n_cols - 1) or (i == len(figures) - 1):
            rows.append(cols)
        cols.append(figure)
    return rows

In [18]:
hover_helper = HoverHelper(tuning_job_analytics)

p = figure(
    plot_width=800,
    plot_height=400,
    tools=hover_helper.tools(),
    title="Convergence Plot",
    x_axis_type="datetime",
    x_axis_label="Training Start Time",
    y_axis_label="AUROC",
)
_ = p.line(
    x="TrainingStartTime",
    y="FinalObjectiveValue",
    color="coral",
    line_width=1.5,
    source=df_viz,
)
_ = p.circle(
    x="TrainingStartTime",
    y="FinalObjectiveValue",
    line_color="coral",
    line_width=1.5,
    fill_color="white",
    source=df_viz,
)

p.xgrid.grid_line_color = None
p.yaxis.formatter = NumeralTickFormatter(format="0.0%")
p.title.align = "center"
p.title.text_font_size = "12pt"

show(p)

p.output_backend = "svg"
_ = export_svgs(p, filename=f"{image_path}/convergence_plot.svg")

In [19]:
df_viz = df_viz.reset_index()
df_viz["index"] = (df_viz["index"] + df_viz["index"].min()) / (
    df_viz["index"].max() - df_viz["index"].min()
)

figures = []
for param_name, param_range in tuning_job_analytics.tuning_ranges.items():
    categorical_args = dict()
    if param_range.get("Values"):
        values = param_range["Values"]
        if sum([is_number(x) for x in values]) == len(values):
            print(
                f"Hyperparameter {param_name} is tuned as categorical, but all values are numeric."
            )
        else:
            categorical_args["x_range"] = values

    p = figure(
        plot_width=400,
        plot_height=400,
        tools=hover_helper.tools(),
        x_axis_label=param_name,
        y_axis_label="AUROC",
        **categorical_args,
    )
    p.circle(
        source=df_viz,
        x=param_name,
        y="FinalObjectiveValue",
        color="black",
        alpha="index",
    )
    p.xgrid.grid_line_color = None
    p.yaxis.formatter = NumeralTickFormatter(format="0.0%")
    figures.append(p)

grid_plot = gridplot(make_grid(figures, 3), toolbar_location="right")

show(grid_plot)

_ = export_png(grid_plot, filename=f"{image_path}/partial_dependence_plot.png")

# Model Evaluation
## Defining Transformer and Prediction

In [20]:
%%time
transformer = best_estimator.transformer(
    instance_count=1,
    instance_type="ml.m5.2xlarge",
    output_path=f"s3://{BUCKET}/{BASE_JOB_PREFIX}/pred",
)

_ = transformer.transform(
    data=f"s3://{BUCKET}/{BASE_JOB_PREFIX}/test/",
    content_type="text/csv",
    split_type="Line",
)

...........................[34m[2021-06-06:05:13:05:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2021-06-06:05:13:05:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2021-06-06:05:13:05:INFO] nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;
[0m
[34mworker_rlimit_nofile 4096;
[0m
[34mevents {
  worker_connections 2048;[0m
[34m}
[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;

  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }

  server {
    listen 8080 deferred;
    client_max_body_size 0;

    keepalive_timeout 3;

    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
      proxy_set_header Host $http_host;
      proxy_redirect off;
      proxy_read_timeout 60s;
      proxy_pass http://gunicorn;
    }

    lo

#### Downloading Prediction Scores to Local Directory

In [21]:
s3_client.download_file(
    BUCKET,
    f"{BASE_JOB_PREFIX}/pred/arr_test.csv.out",
    os.path.join(proc_data_path, "test", "arr_test.csv.out"),
)
scores = pd.read_csv(
    os.path.join(proc_data_path, "test", "arr_test.csv.out"), header=None
).values

## Measuring Predictive Performance

In [22]:
predictions = get_prediction(scores)

plot_confusion_matrix(
    confusion_matrix(df_y_test, predictions),
    file_name=os.path.join(image_path, "conf_mat.svg"),
)

print(
    f"ACCURACY: {accuracy_score(df_y_test, predictions):.2%}, \
    PRECISION: {precision_score(df_y_test, predictions):.2%}, \
    RECALL: {recall_score(df_y_test, predictions):.2%}, \
    F1: {f1_score(df_y_test, predictions):.2%}"
)

ACCURACY: 98.68%,     PRECISION: 91.48%,     RECALL: 68.64%,     F1: 78.43%


In [23]:
plot_roc_curve(
    roc_curve(df_y_test, scores),
    roc_auc_score(df_y_test, scores),
    file_name=os.path.join(image_path, "roc_curve.svg"),
)

In [24]:
plot_pr_curve(
    precision_recall_curve(df_y_test, scores),
    average_precision_score(df_y_test, scores),
    file_name=os.path.join(image_path, "pr_curve.svg"),
)

# Model Re-training
### Data Loading, Splitting, Preprocessing and Uploading

In [25]:
test_identity = pd.read_csv(os.path.join(RAW_DATA_PATH, "test_identity.csv"))
test_transaction = pd.read_csv(os.path.join(RAW_DATA_PATH, "test_transaction.csv"))
df_test = pd.merge(test_transaction, test_identity, on="TransactionID", how="left")
df_test = df_test.rename(
    columns={"id-{:02d}".format(i): "id_{:02d}".format(i) for i in range(1, 39)}
)

df_test[int_cat_features] = df_test[int_cat_features].applymap(str_to_int)
df_test[cat_features] = df_test[cat_features].astype("str")

In [26]:
df_X_re_train, df_X_re_valid, df_y_re_train, df_y_re_valid = train_test_split(
    df_train[all_features],
    df_train["isFraud"],
    test_size=0.15,
    random_state=42,
    stratify=df_train["isFraud"],
)

X_re_train = processor.fit_transform(df_X_re_train, df_y_re_train)
X_re_valid = processor.transform(df_X_re_valid)
X_re_test = processor.transform(df_test[all_features])

arr_train = np.concatenate((df_y_re_train.values.reshape(-1, 1), X_re_train), axis=1)
arr_valid = np.concatenate((df_y_re_valid.values.reshape(-1, 1), X_re_valid), axis=1)
arr_test = X_re_test

In [27]:
dir_names = ["re_train", "re_valid", "re_test"]

for dir_name in dir_names:
    os.makedirs(os.path.join(proc_data_path, dir_name), exist_ok=True)

for dir_name, file_name, dataset in zip(dir_names, file_names, [arr_train, arr_valid, arr_test]):
    np.savetxt(
        os.path.join(proc_data_path, dir_name, file_name) + ".csv",
        dataset,
        delimiter=",",
        fmt="%i",
    )

In [28]:
%%time
for dir_name, file_name in zip(dir_names, file_names):
    s3_client.upload_file(
        os.path.join(proc_data_path, dir_name, file_name) + ".csv",
        BUCKET,
        BASE_JOB_PREFIX + "/" + dir_name + "/" + file_name + ".csv",
    )

CPU times: user 6.05 s, sys: 4.57 s, total: 10.6 s
Wall time: 8min 11s


## Defining and Fitting Estimator

In [29]:
full_estimator = Estimator(
    image_uri=image_uri,
    role=role,
    instance_count=1,
    instance_type="ml.m5.2xlarge",
    output_path=model_output_uri,
    use_spot_instances=False,
    max_wait=None,
)

best_params = best_estimator.hyperparameters()
_ = best_params.pop("_tuning_objective_metric")
params.update(best_params)
full_estimator.set_hyperparameters(**params)

In [30]:
re_train_input = TrainingInput(
    s3_data=f"s3://{BUCKET}/{BASE_JOB_PREFIX}/re_train/", content_type="text/csv"
)
re_valid_input = TrainingInput(
    s3_data=f"s3://{BUCKET}/{BASE_JOB_PREFIX}/re_valid/", content_type="text/csv"
)

full_estimator.fit({"train": re_train_input, "validation": re_valid_input})

2021-06-06 05:25:42 Starting - Starting the training job...
2021-06-06 05:26:10 Starting - Launching requested ML instancesProfilerReport-1622957139: InProgress
......
2021-06-06 05:27:16 Starting - Preparing the instances for training......
2021-06-06 05:28:14 Downloading - Downloading input data...
2021-06-06 05:28:58 Training - Training image download completed. Training in progress..[34m[2021-06-06 05:29:00.664 ip-10-0-224-89.ec2.internal:1 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter eval_metric value auc to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter booster value gbtree to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itse

## Defining Transformer and Prediction

In [31]:
%%time
full_transformer = full_estimator.transformer(
    instance_count=1,
    instance_type="ml.m5.2xlarge",
    output_path=f"s3://{BUCKET}/{BASE_JOB_PREFIX}/re_pred",
)

_ = full_transformer.transform(
    data=f"s3://{BUCKET}/{BASE_JOB_PREFIX}/re_test/",
    content_type="text/csv",
    split_type="Line",
)

..........................[34m[2021-06-06:06:26:08:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2021-06-06:06:26:08:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2021-06-06:06:26:08:INFO] nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;
[0m
[34mworker_rlimit_nofile 4096;
[0m
[34mevents {
  worker_connections 2048;[0m
[34m}
[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;

  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }

  server {
    listen 8080 deferred;
    client_max_body_size 0;

    keepalive_timeout 3;

    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
      proxy_set_header Host $http_host;
      proxy_redirect off;
      proxy_read_timeout 60s;
      proxy_pass http://gunicorn;
    }

    loc

#### Downloading Prediction Scores to Local Directory

In [32]:
s3_client.download_file(
    BUCKET,
    f"{BASE_JOB_PREFIX}/re_pred/arr_test.csv.out",
    os.path.join(proc_data_path, "re_test", "arr_test.csv.out"),
)
scores = pd.read_csv(
    os.path.join(proc_data_path, "re_test", "arr_test.csv.out"), header=None
).values

In [33]:
submission = pd.DataFrame(
    {"TransactionID": df_test["TransactionID"].values, "isFraud": scores.flatten()}
)
submission.to_csv("./submission.csv", index=False)