In [1]:
%load_ext autoreload
%autoreload 2

## Requirements
* The dataset can be downloaded from [this Kaggle competition](https://www.kaggle.com/c/ieee-fraud-detection).
* In addition to the [Anaconda](https://www.anaconda.com) libraries, you need to install `category_encoders`, `selenium`, `geckodriver` and `scikit-learn` version 0.24 or higher.
* You also need to set up an AWS account and install `awscli` and `sagemaker-python-sdk`.

In [2]:
import os
import warnings
import sklearn
import boto3
import sagemaker
import numpy as np
import pandas as pd
from bokeh.io import export_png, export_svgs, output_notebook
from bokeh.layouts import gridplot
from bokeh.models import HoverTool, NumeralTickFormatter
from bokeh.plotting import figure, show
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    average_precision_score,
    precision_recall_curve,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder
from category_encoders import TargetEncoder
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.tuner import (
    ContinuousParameter,
    IntegerParameter,
    HyperparameterTuner,
)
from utils.measuring_performance import (
    get_prediction,
    plot_confusion_matrix,
    plot_pr_curve,
    plot_roc_curve,
)

warnings.filterwarnings(action="ignore")
output_notebook()
print(f"<VERSION>\nsklearn: {sklearn.__version__}, sagemaker: {sagemaker.__version__}")

<VERSION>
sklearn: 0.24.1, sagemaker: 2.48.1


In [3]:
def is_number(x):
    try:
        float(x)
        return 1
    except ValueError:
        return 0


def str_to_int(x):
    return x if pd.isnull(x) else str(int(x))

#### Data Loading from Local Directory
The Kaggle dataset was saved in the local directory `~/data/ieee-fraud-detection` in advance.

In [4]:
RAW_DATA_PATH = "../../data/ieee-fraud-detection"
PROC_DATA_PATH = "proc_data"

In [5]:
train_identity = pd.read_csv(os.path.join(RAW_DATA_PATH, "train_identity.csv"))
train_transaction = pd.read_csv(os.path.join(RAW_DATA_PATH, "train_transaction.csv"))
df_train = pd.merge(train_transaction, train_identity, on="TransactionID", how="left")

In [6]:
cat_features = pd.Index(
    [
        "ProductCD",
        "addr1",
        "addr2",
        "P_emaildomain",
        "R_emaildomain",
        "DeviceType",
        "DeviceInfo",
    ]
    + [f"card{i}" for i in range(1, 7)]
    + [f"M{i}" for i in range(1, 10)]
    + [f"id_{i}" for i in range(12, 39)]
)
num_features = df_train.columns.difference(
    pd.Index(["TransactionID", "TransactionDT", "isFraud"]) | cat_features
)
all_features = cat_features | num_features

int_cat_features = df_train[cat_features].select_dtypes("number").columns
df_train[int_cat_features] = df_train[int_cat_features].applymap(str_to_int)
df_train[cat_features] = df_train[cat_features].astype("str")

# Data Splitting and Preprocessing

In [7]:
df_X_train, df_X_test, df_y_train, df_y_test = train_test_split(
    df_train[all_features],
    df_train["isFraud"],
    test_size=0.1,
    random_state=42,
    stratify=df_train["isFraud"],
)

df_X_train, df_X_valid, df_y_train, df_y_valid = train_test_split(
    df_X_train, df_y_train, test_size=0.15, random_state=42, stratify=df_y_train
)

In [8]:
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="<unknown>"),
    OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
    TargetEncoder(drop_invariant=True, min_samples_leaf=1, smoothing=1.0),
)
num_pipeline = SimpleImputer(strategy="median")
processor = make_column_transformer(
    (cat_pipeline, cat_features), (num_pipeline, num_features)
)

X_train = processor.fit_transform(df_X_train, df_y_train)
X_valid = processor.transform(df_X_valid)
X_test = processor.transform(df_X_test)

arr_train = np.concatenate((df_y_train.values.reshape(-1, 1), X_train), axis=1)
arr_valid = np.concatenate((df_y_valid.values.reshape(-1, 1), X_valid), axis=1)
arr_test = X_test

In [9]:
dir_names = ["train", "valid", "test"]
file_names = ["arr_train", "arr_valid", "arr_test"]

for dir_name in dir_names:
    os.makedirs(os.path.join(PROC_DATA_PATH, dir_name), exist_ok=True)

for dir_name, file_name, dataset in zip(dir_names, file_names, [arr_train, arr_valid, arr_test]):
    np.savetxt(
        os.path.join(PROC_DATA_PATH, dir_name, file_name) + ".csv",
        dataset,
        delimiter=",",
        fmt="%i",
    )

# Hyperparameter Tuning
#### Uploading Datasets to S3 Bucket

In [10]:
sagemaker_session = sagemaker.session.Session()
BUCKET = sagemaker_session.default_bucket()
BASE_JOB_PREFIX = "ieee-fraud-detection"

region = boto3.Session().region_name
role = sagemaker.get_execution_role()

In [11]:
%%time
s3_client = boto3.client("s3")

for dir_name, file_name in zip(dir_names, file_names):
    s3_client.upload_file(
        os.path.join(PROC_DATA_PATH, dir_name, file_name) + ".csv",
        BUCKET,
        BASE_JOB_PREFIX + "/" + dir_name + "/" + file_name + ".csv",
    )

CPU times: user 3.79 s, sys: 2.81 s, total: 6.6 s
Wall time: 2min 55s


## Defining XGBoost Estimator

In [12]:
training_image_uri = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=region,
    version="1.2-1",
    py_version="py3",
    instance_type="ml.m5.2xlarge",
)
model_output_uri = f"s3://{BUCKET}/{BASE_JOB_PREFIX}/models"

estimator = Estimator(
    image_uri=training_image_uri,
    role=role,
    instance_count=1,
    instance_type="ml.m5.2xlarge",
    output_path=model_output_uri,
    use_spot_instances=False,
    max_wait=None,
)

scale_pos_weight = float(df_y_train.shape[0] / df_y_train.sum() - 1.0)
params = {
    "booster": "gbtree",
    "verbosity": 0,
    "objective": "binary:logistic",
    "scale_pos_weight": scale_pos_weight,
    "seed": 42,
    "eval_metric": "auc",
    "num_round": 1000,
    "early_stopping_rounds": 10,
}
estimator.set_hyperparameters(**params)

## Defining and Fitting HyperparameterTuner

In [13]:
hyperparameter_ranges = {
    "max_depth": IntegerParameter(1, 30, scaling_type="Auto"),
    "eta": ContinuousParameter(0.01, 1.0, scaling_type="Auto"),
    "gamma": ContinuousParameter(0.0, 1.0, scaling_type="Auto"),
    "min_child_weight": ContinuousParameter(1e-06, 1.0, scaling_type="Auto"),
    "subsample": ContinuousParameter(0.1, 1.0, scaling_type="Auto"),
    "colsample_bytree": ContinuousParameter(0.1, 1.0, scaling_type="Auto"),
}

tuner = HyperparameterTuner(
    estimator,
    "validation:auc",
    hyperparameter_ranges,
    objective_type="Maximize",
    max_jobs=30,
    max_parallel_jobs=3,
    base_tuning_job_name=f"{BASE_JOB_PREFIX}-xgb-hpo",
    early_stopping_type="Auto",
)

In [14]:
%%time
train_input = TrainingInput(
    s3_data=f"s3://{BUCKET}/{BASE_JOB_PREFIX}/train/", content_type="text/csv"
)
valid_input = TrainingInput(
    s3_data=f"s3://{BUCKET}/{BASE_JOB_PREFIX}/valid/", content_type="text/csv"
)

tuner.fit({"train": train_input, "validation": valid_input})

best_estimator = tuner.best_estimator()
best_params = best_estimator.hyperparameters()
tuning_job_name = tuner.latest_tuning_job.name

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

## Analyzing Hyperparameter Tuning Results

In [15]:
tuning_job_analytics = sagemaker.HyperparameterTuningJobAnalytics(tuning_job_name)
df_viz = tuning_job_analytics.dataframe()
df_viz = df_viz.sort_values("TrainingStartTime")
df_viz.index = range(df_viz.shape[0])

IMAGE_PATH = "img"
os.makedirs(IMAGE_PATH, exist_ok=True)

In [16]:
df_viz.sort_values("FinalObjectiveValue", ascending=False)[:10]

Unnamed: 0,colsample_bytree,eta,gamma,max_depth,min_child_weight,subsample,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
16,0.794138,0.020849,0.184939,24.0,0.00041,0.932148,ieee-fraud-detection-210826-1115-017-37235e82,Completed,0.97135,2021-08-26 14:20:09+09:00,2021-08-26 15:48:09+09:00,5280.0
26,0.812769,0.018806,0.411635,26.0,0.007369,0.955027,ieee-fraud-detection-210826-1115-026-5817e2f9,Completed,0.97017,2021-08-26 17:28:18+09:00,2021-08-26 19:04:38+09:00,5780.0
20,0.376465,0.01374,0.238253,27.0,0.011434,0.995512,ieee-fraud-detection-210826-1115-021-adcb71f2,Completed,0.97012,2021-08-26 15:20:13+09:00,2021-08-26 16:27:26+09:00,4033.0
8,0.826245,0.018179,0.661329,27.0,0.001476,0.784794,ieee-fraud-detection-210826-1115-009-911e79f7,Completed,0.97011,2021-08-26 11:39:17+09:00,2021-08-26 13:02:05+09:00,4968.0
21,0.790567,0.050631,0.198306,27.0,0.037815,0.833326,ieee-fraud-detection-210826-1115-022-0dd824b2,Completed,0.97,2021-08-26 15:55:11+09:00,2021-08-26 16:36:52+09:00,2501.0
28,0.812769,0.017151,0.391635,26.0,0.00559,0.955027,ieee-fraud-detection-210826-1115-029-c4918e9a,Completed,0.96998,2021-08-26 17:32:58+09:00,2021-08-26 19:27:40+09:00,6882.0
13,0.763056,0.020676,0.754733,29.0,0.000351,0.899179,ieee-fraud-detection-210826-1115-014-f6095358,Completed,0.96989,2021-08-26 13:28:22+09:00,2021-08-26 15:11:54+09:00,6212.0
7,0.8728,0.011234,0.015506,24.0,0.001894,0.82624,ieee-fraud-detection-210826-1115-008-f4976045,Completed,0.96978,2021-08-26 11:38:35+09:00,2021-08-26 13:37:53+09:00,7158.0
11,0.68606,0.040841,0.912326,29.0,1e-06,0.879734,ieee-fraud-detection-210826-1115-012-e63c90be,Completed,0.96969,2021-08-26 12:31:53+09:00,2021-08-26 13:23:50+09:00,3117.0
22,0.516376,0.019519,0.337172,29.0,8e-06,0.745084,ieee-fraud-detection-210826-1115-023-53b6d10f,Completed,0.96945,2021-08-26 16:31:16+09:00,2021-08-26 17:30:14+09:00,3538.0


In [17]:
class HoverHelper:
    def __init__(self, tuning_job_analytics):
        self.tuning_job_analytics = tuning_job_analytics

    def hovertool(self):
        tooltips = [
            ("TrainingJobName", "@TrainingJobName"),
            ("FinalObjectiveValue", "@FinalObjectiveValue"),
        ]

        for key in self.tuning_job_analytics.tuning_ranges.keys():
            tooltips.append((key, f"@{key}"))

        hover_tool = HoverTool(tooltips=tooltips)
        return hover_tool

    def tools(
        self,
        standard_tools="pan, crosshair, wheel_zoom, zoom_in, zoom_out, undo, reset",
    ):
        return [self.hovertool(), standard_tools]


def make_grid(figures, n_cols):
    rows = []
    for i, figure in enumerate(figures):
        if i % n_cols == 0:
            cols = []
        if (i % n_cols == n_cols - 1) or (i == len(figures) - 1):
            rows.append(cols)
        cols.append(figure)
    return rows

In [18]:
hover_helper = HoverHelper(tuning_job_analytics)

p = figure(
    plot_width=800,
    plot_height=400,
    tools=hover_helper.tools(),
    title="Convergence Plot",
    x_axis_type="datetime",
    x_axis_label="Training Start Time",
    y_axis_label="AUROC",
)
_ = p.line(
    x="TrainingStartTime",
    y="FinalObjectiveValue",
    color="coral",
    line_width=1.5,
    source=df_viz,
)
_ = p.circle(
    x="TrainingStartTime",
    y="FinalObjectiveValue",
    line_color="coral",
    line_width=1.5,
    fill_color="white",
    source=df_viz,
)

p.xgrid.grid_line_color = None
p.yaxis.formatter = NumeralTickFormatter(format="0.0%")
p.title.align = "center"
p.title.text_font_size = "12pt"

show(p)

p.output_backend = "svg"
_ = export_svgs(p, filename=f"{IMAGE_PATH}/convergence_plot.svg")

In [19]:
df_viz = df_viz.reset_index()
df_viz["index"] = (df_viz["index"] + df_viz["index"].min()) / (
    df_viz["index"].max() - df_viz["index"].min()
)

figures = []
for param_name, param_range in tuning_job_analytics.tuning_ranges.items():
    categorical_args = dict()
    if param_range.get("Values"):
        values = param_range["Values"]
        if sum([is_number(x) for x in values]) == len(values):
            print(
                f"Hyperparameter {param_name} is tuned as categorical, but all values are numeric."
            )
        else:
            categorical_args["x_range"] = values

    p = figure(
        plot_width=400,
        plot_height=400,
        tools=hover_helper.tools(),
        x_axis_label=param_name,
        y_axis_label="AUROC",
        **categorical_args,
    )
    p.circle(
        source=df_viz,
        x=param_name,
        y="FinalObjectiveValue",
        color="black",
        alpha="index",
    )
    p.xgrid.grid_line_color = None
    p.yaxis.formatter = NumeralTickFormatter(format="0.0%")
    figures.append(p)

grid_plot = gridplot(make_grid(figures, 3), toolbar_location="right")

show(grid_plot)

_ = export_png(grid_plot, filename=f"{IMAGE_PATH}/partial_dependence_plot.png")

# Model Evaluation
## Defining Transformer and Prediction

In [20]:
%%time
transformer = best_estimator.transformer(
    instance_count=1,
    instance_type="ml.m5.2xlarge",
    output_path=f"s3://{BUCKET}/{BASE_JOB_PREFIX}/pred",
)

_ = transformer.transform(
    data=f"s3://{BUCKET}/{BASE_JOB_PREFIX}/test/",
    content_type="text/csv",
    split_type="Line",
)

...........................[34m[2021-08-26:10:41:35:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2021-08-26:10:41:35:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2021-08-26:10:41:35:INFO] nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;
[0m
[34mworker_rlimit_nofile 4096;
[0m
[34mevents {
  worker_connections 2048;[0m
[34m}
[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;

  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }

  server {
    listen 8080 deferred;
    client_max_body_size 0;

    keepalive_timeout 3;

    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
      proxy_set_header Host $http_host;
      proxy_redirect off;
      proxy_read_timeout 60s;
      proxy_pass http://gunicorn;
    }

    lo

#### Downloading Prediction Scores to Local Directory

In [21]:
s3_client.download_file(
    BUCKET,
    f"{BASE_JOB_PREFIX}/pred/arr_test.csv.out",
    os.path.join(PROC_DATA_PATH, "test", "arr_test.csv.out"),
)
scores = pd.read_csv(
    os.path.join(PROC_DATA_PATH, "test", "arr_test.csv.out"), header=None
).values

## Measuring Predictive Performance

In [22]:
predictions = get_prediction(scores)

plot_confusion_matrix(
    confusion_matrix(df_y_test, predictions),
    file_name=os.path.join(IMAGE_PATH, "conf_mat.svg"),
)

print(
    f"ACCURACY: {accuracy_score(df_y_test, predictions):.2%}, \
    PRECISION: {precision_score(df_y_test, predictions):.2%}, \
    RECALL: {recall_score(df_y_test, predictions):.2%}, \
    F1: {f1_score(df_y_test, predictions):.2%}"
)

ACCURACY: 98.64%,     PRECISION: 91.40%,     RECALL: 67.38%,     F1: 77.57%


In [23]:
plot_roc_curve(
    roc_curve(df_y_test, scores),
    roc_auc_score(df_y_test, scores),
    file_name=os.path.join(IMAGE_PATH, "roc_curve.svg"),
)

In [24]:
plot_pr_curve(
    precision_recall_curve(df_y_test, scores),
    average_precision_score(df_y_test, scores),
    file_name=os.path.join(IMAGE_PATH, "pr_curve.svg"),
)

# Model Re-training
### Data Loading, Splitting, Preprocessing and Uploading

In [25]:
test_identity = pd.read_csv(os.path.join(RAW_DATA_PATH, "test_identity.csv"))
test_transaction = pd.read_csv(os.path.join(RAW_DATA_PATH, "test_transaction.csv"))
df_test = pd.merge(test_transaction, test_identity, on="TransactionID", how="left")
df_test = df_test.rename(
    columns={"id-{:02d}".format(i): "id_{:02d}".format(i) for i in range(1, 39)}
)

df_test[int_cat_features] = df_test[int_cat_features].applymap(str_to_int)
df_test[cat_features] = df_test[cat_features].astype("str")

In [26]:
df_X_re_train, df_X_re_valid, df_y_re_train, df_y_re_valid = train_test_split(
    df_train[all_features],
    df_train["isFraud"],
    test_size=0.15,
    random_state=42,
    stratify=df_train["isFraud"],
)

X_re_train = processor.fit_transform(df_X_re_train, df_y_re_train)
X_re_valid = processor.transform(df_X_re_valid)
X_re_test = processor.transform(df_test[all_features])

arr_train = np.concatenate((df_y_re_train.values.reshape(-1, 1), X_re_train), axis=1)
arr_valid = np.concatenate((df_y_re_valid.values.reshape(-1, 1), X_re_valid), axis=1)
arr_test = X_re_test

In [27]:
dir_names = ["re_train", "re_valid", "re_test"]

for dir_name in dir_names:
    os.makedirs(os.path.join(PROC_DATA_PATH, dir_name), exist_ok=True)

for dir_name, file_name, dataset in zip(dir_names, file_names, [arr_train, arr_valid, arr_test]):
    np.savetxt(
        os.path.join(PROC_DATA_PATH, dir_name, file_name) + ".csv",
        dataset,
        delimiter=",",
        fmt="%i",
    )

In [28]:
%%time
for dir_name, file_name in zip(dir_names, file_names):
    s3_client.upload_file(
        os.path.join(PROC_DATA_PATH, dir_name, file_name) + ".csv",
        BUCKET,
        BASE_JOB_PREFIX + "/" + dir_name + "/" + file_name + ".csv",
    )

CPU times: user 6.21 s, sys: 4.85 s, total: 11.1 s
Wall time: 4min 44s


## Defining and Fitting Estimator

In [29]:
full_estimator = Estimator(
    image_uri=training_image_uri,
    role=role,
    instance_count=1,
    instance_type="ml.m5.2xlarge",
    output_path=model_output_uri,
    use_spot_instances=False,
    max_wait=None,
)

best_params = best_estimator.hyperparameters()
_ = best_params.pop("_tuning_objective_metric")
params.update(best_params)
full_estimator.set_hyperparameters(**params)

In [30]:
re_train_input = TrainingInput(
    s3_data=f"s3://{BUCKET}/{BASE_JOB_PREFIX}/re_train/", content_type="text/csv"
)
re_valid_input = TrainingInput(
    s3_data=f"s3://{BUCKET}/{BASE_JOB_PREFIX}/re_valid/", content_type="text/csv"
)

full_estimator.fit({"train": re_train_input, "validation": re_valid_input})

2021-08-26 10:50:55 Starting - Starting the training job...
2021-08-26 10:51:28 Starting - Launching requested ML instancesProfilerReport-1629975052: InProgress
...
2021-08-26 10:51:53 Starting - Preparing the instances for training.........
2021-08-26 10:53:41 Downloading - Downloading input data
2021-08-26 10:53:41 Training - Downloading the training image...
2021-08-26 10:54:01 Training - Training image download completed. Training in progress.[34m[2021-08-26 10:53:59.365 ip-10-0-152-112.ec2.internal:1 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter eval_metric value auc to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter booster value gbtree to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective va

## Defining Transformer and Prediction

In [31]:
%%time
full_transformer = full_estimator.transformer(
    instance_count=1,
    instance_type="ml.m5.2xlarge",
    output_path=f"s3://{BUCKET}/{BASE_JOB_PREFIX}/re_pred",
)

_ = full_transformer.transform(
    data=f"s3://{BUCKET}/{BASE_JOB_PREFIX}/re_test/",
    content_type="text/csv",
    split_type="Line",
)

...........................[34m[2021-08-26:12:42:31:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2021-08-26:12:42:31:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2021-08-26:12:42:31:INFO] nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;
[0m
[34mworker_rlimit_nofile 4096;
[0m
[34mevents {
  worker_connections 2048;[0m
[34m}
[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;

  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }

  server {
    listen 8080 deferred;
    client_max_body_size 0;

    keepalive_timeout 3;

    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
      proxy_set_header Host $http_host;
      proxy_redirect off;
      proxy_read_timeout 60s;
      proxy_pass http://gunicorn;
    }

    lo

#### Downloading Prediction Scores to Local Directory

In [32]:
s3_client.download_file(
    BUCKET,
    f"{BASE_JOB_PREFIX}/re_pred/arr_test.csv.out",
    os.path.join(PROC_DATA_PATH, "re_test", "arr_test.csv.out"),
)
scores = pd.read_csv(
    os.path.join(PROC_DATA_PATH, "re_test", "arr_test.csv.out"), header=None
).values

In [33]:
submission = pd.DataFrame(
    {"TransactionID": df_test["TransactionID"].values, "isFraud": scores.flatten()}
)
submission.to_csv("./submission.csv", index=False)