In [1]:
%load_ext autoreload
%autoreload 2

## Requirements
* The dataset can be downloaded from [this Kaggle competition](https://www.kaggle.com/c/ieee-fraud-detection).
* In addition to the [Anaconda](https://www.anaconda.com) libraries, you need to install `category_encoders`, `selenium`, `geckodriver` and `scikit-learn` version 0.24 or higher.
* You also need to set up an AWS account and install `awscli` and `sagemaker-python-sdk`.

In [2]:
import configparser
import os
import sys
import warnings
import sklearn
import boto3
import category_encoders
import sagemaker
import numpy as np
import pandas as pd
from bokeh.io import export_png, export_svgs, output_notebook
from bokeh.layouts import gridplot
from bokeh.models import HoverTool, NumeralTickFormatter
from bokeh.plotting import figure, show
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    average_precision_score,
    precision_recall_curve,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder
from category_encoders import TargetEncoder
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.tuner import (
    ContinuousParameter,
    IntegerParameter,
    HyperparameterTuner,
)
sys.path.append("..")

from utils.common import (
    dump_pickle,
    load_pickle,
)
from utils.measuring_performance import (
    get_prediction,
    plot_confusion_matrix,
    plot_pr_curve,
    plot_roc_curve,
)

warnings.filterwarnings(action="ignore")
output_notebook()

In [3]:
def is_number(x):
    try:
        float(x)
        return 1
    except ValueError:
        return 0


def str_to_int(x):
    return x if pd.isnull(x) else str(int(x))

In [4]:
config = configparser.ConfigParser()
_ = config.read(os.path.join("..", "conf", "config.ini"))

region = config["proj"]["region"]
default_bucket = config["proj"]["s3_default_bucket"]
base_job_prefix = config["proj"]["s3_base_job_prefix"]
role = config["proj"]["iam_role"]
tuning_max_jobs = eval(config["model"]["tuning_max_jobs"])
valid_size = eval(config["model"]["valid_size"])
test_size = eval(config["model"]["test_size"])

boto_session = boto3.Session()
sagemaker_session = sagemaker.Session()
account_id = boto_session.client("sts").get_caller_identity().get("Account")
role = f"arn:aws:iam::{account_id}:role/service-role/{role}"

if len(region) == 0:
    region = boto_session.region_name
if len(default_bucket) == 0:
    default_bucket = sagemaker_session.default_bucket()

#### Data Loading from Local Directory
The Kaggle dataset was saved in the local directory `~/data/ieee-fraud-detection` in advance.

In [5]:
RAW_DATA_PATH = os.path.join("..", "..", "..", "data", "ieee-fraud-detection")
PROC_DATA_PATH = os.path.join("..", "proc_data")
MODEL_PATH = os.path.join("..", "models")
IMG_PATH = os.path.join("..", "img")
ARE_PARAMS_ALREADY_TUNED = False

In [6]:
train_identity = pd.read_csv(os.path.join(RAW_DATA_PATH, "train_identity.csv"))
train_transaction = pd.read_csv(os.path.join(RAW_DATA_PATH, "train_transaction.csv"))
df_train = pd.merge(train_transaction, train_identity, on="TransactionID", how="left")

In [7]:
cat_features = pd.Index(
    [
        "ProductCD",
        "addr1",
        "addr2",
        "P_emaildomain",
        "R_emaildomain",
        "DeviceType",
        "DeviceInfo",
    ]
    + [f"card{i}" for i in range(1, 7)]
    + [f"M{i}" for i in range(1, 10)]
    + [f"id_{i}" for i in range(12, 39)]
)
num_features = df_train.columns.difference(
    pd.Index(["TransactionID", "TransactionDT", "isFraud"]) | cat_features
)
all_features = (cat_features | num_features).sort_values()

int_cat_features = df_train[cat_features].select_dtypes("number").columns
df_train[int_cat_features] = df_train[int_cat_features].applymap(str_to_int)
df_train[cat_features] = df_train[cat_features].astype("str")

# Data Splitting and Preprocessing

In [8]:
df_X_train, df_X_test, df_y_train, df_y_test = train_test_split(
    df_train[all_features],
    df_train["isFraud"],
    test_size=test_size,
    random_state=42,
    stratify=df_train["isFraud"],
)

df_X_sub_train, df_X_valid, df_y_sub_train, df_y_valid = train_test_split(
    df_X_train,
    df_y_train,
    test_size=valid_size / (1.0 - test_size),
    random_state=42,
    stratify=df_y_train,
)

In [9]:
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="<unknown>"),
    OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
    TargetEncoder(drop_invariant=True, min_samples_leaf=1, smoothing=1.0),
)
num_pipeline = SimpleImputer(strategy="median")
processor = make_column_transformer(
    (cat_pipeline, cat_features), (num_pipeline, num_features)
)

X_sub_train = processor.fit_transform(df_X_sub_train, df_y_sub_train)
X_valid = processor.transform(df_X_valid)
X_train = processor.fit_transform(df_X_train, df_y_train)
X_test = processor.transform(df_X_test)

arr_sub_train = np.concatenate(
    (df_y_sub_train.values.reshape(-1, 1), X_sub_train), axis=1
)
arr_valid = np.concatenate((df_y_valid.values.reshape(-1, 1), X_valid), axis=1)
arr_train = np.concatenate((df_y_train.values.reshape(-1, 1), X_train), axis=1)
arr_test = X_test.copy()

In [10]:
dir_names = ["sub_train", "valid", "train", "test"]
file_names = ["arr_sub_train", "arr_valid", "arr_train", "arr_test"]

for dir_name in dir_names:
    os.makedirs(os.path.join(PROC_DATA_PATH, dir_name), exist_ok=True)

for dir_name, file_name, dataset in zip(
    dir_names, file_names, [arr_sub_train, arr_valid, arr_train, arr_test]
):
    np.savetxt(
        os.path.join(PROC_DATA_PATH, dir_name, file_name + ".csv"),
        dataset,
        delimiter=",",
        fmt="%i",
    )

# Hyperparameter Tuning
#### Uploading Datasets to S3 Bucket

In [11]:
%%time
s3_client = boto_session.client("s3")

for dir_name, file_name in zip(dir_names, file_names):
    s3_client.upload_file(
        os.path.join(PROC_DATA_PATH, dir_name, file_name + ".csv"),
        default_bucket,
        base_job_prefix + "/" + dir_name + "/" + file_name + ".csv",
    )

CPU times: user 7.07 s, sys: 5.55 s, total: 12.6 s
Wall time: 5min 28s


## Defining XGBoost Estimator

In [12]:
training_image_uri = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=region,
    version="1.3-1",
    py_version="py3",
    instance_type="ml.m5.2xlarge",
)
model_output_uri = f"s3://{default_bucket}/{base_job_prefix}/models"

estimator = Estimator(
    image_uri=training_image_uri,
    role=role,
    instance_count=1,
    instance_type="ml.m5.2xlarge",
    output_path=model_output_uri,
    use_spot_instances=False,
    max_wait=None,
)

scale_pos_weight = float(df_y_train.shape[0] / df_y_train.sum() - 1.0)
params = {
    "booster": "gbtree",
    "verbosity": 0,
    "objective": "binary:logistic",
    "scale_pos_weight": scale_pos_weight,
    "seed": 42,
    "eval_metric": "auc",
    "num_round": 1000,
    "early_stopping_rounds": 10,
}
estimator.set_hyperparameters(**params)

## Defining and Fitting HyperparameterTuner

In [13]:
hyperparameter_ranges = {
    "max_depth": IntegerParameter(1, 30, scaling_type="Auto"),
    "eta": ContinuousParameter(0.01, 1.0, scaling_type="Auto"),
    "gamma": ContinuousParameter(0.0, 1.0, scaling_type="Auto"),
    "min_child_weight": ContinuousParameter(1e-06, 1.0, scaling_type="Auto"),
    "subsample": ContinuousParameter(0.1, 1.0, scaling_type="Auto"),
    "colsample_bytree": ContinuousParameter(0.1, 1.0, scaling_type="Auto"),
}

tuner = HyperparameterTuner(
    estimator,
    "validation:auc",
    hyperparameter_ranges,
    objective_type="Maximize",
    max_jobs=tuning_max_jobs,
    max_parallel_jobs=3,
    base_tuning_job_name=f"{base_job_prefix}-param-tuning",
    early_stopping_type="Auto",
)

In [14]:
%%time
os.makedirs(MODEL_PATH, exist_ok=True)

if ARE_PARAMS_ALREADY_TUNED:
    best_params = load_pickle(os.path.join(MODEL_PATH, "besk_params.pkl"))
    
else:
    sub_train_input = TrainingInput(
        s3_data=f"s3://{default_bucket}/{base_job_prefix}/sub_train/", content_type="text/csv"
    )
    valid_input = TrainingInput(
        s3_data=f"s3://{default_bucket}/{base_job_prefix}/valid/", content_type="text/csv"
    )

    tuner.fit({"train": sub_train_input, "validation": valid_input})

    best_estimator = tuner.best_estimator()
    best_params = best_estimator.hyperparameters()
    tuning_job_name = tuner.latest_tuning_job.name
    
    _ = best_params.pop("_tuning_objective_metric")
    dump_pickle(os.path.join(MODEL_PATH, "besk_params.pkl"), best_params)

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

## Analyzing Hyperparameter Tuning Results

In [15]:
if not ARE_PARAMS_ALREADY_TUNED:
    tuning_job_analytics = sagemaker.HyperparameterTuningJobAnalytics(tuning_job_name)
    df_viz = tuning_job_analytics.dataframe()
    df_viz = df_viz.sort_values("TrainingStartTime")
    df_viz.index = range(df_viz.shape[0])

    os.makedirs(IMG_PATH, exist_ok=True)
    display(df_viz.sort_values("FinalObjectiveValue", ascending=False)[:10])

Unnamed: 0,colsample_bytree,eta,gamma,max_depth,min_child_weight,subsample,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
23,0.888999,0.048964,0.116486,22.0,0.000198,0.934456,ieee-fraud-detection-220201-0851-024-ce71bdb4,Completed,0.97132,2022-02-01 11:31:11+09:00,2022-02-01 12:18:41+09:00,2850.0
16,0.558118,0.036888,0.27647,26.0,0.047425,0.887737,ieee-fraud-detection-220201-0851-017-01f8a865,Completed,0.97061,2022-02-01 10:19:16+09:00,2022-02-01 10:57:13+09:00,2277.0
18,0.87153,0.037743,0.600964,24.0,1e-06,0.877708,ieee-fraud-detection-220201-0851-019-0b14b5e3,Completed,0.97054,2022-02-01 10:32:10+09:00,2022-02-01 11:26:23+09:00,3253.0
29,0.722858,0.015886,0.859783,29.0,0.034503,0.797733,ieee-fraud-detection-220201-0851-030-8d3a1d80,Completed,0.96943,2022-02-01 12:17:26+09:00,2022-02-01 13:42:50+09:00,5124.0
27,0.837738,0.02025,0.928936,28.0,4e-06,0.923003,ieee-fraud-detection-220201-0851-028-a3b7fbac,Completed,0.96911,2022-02-01 12:01:05+09:00,2022-02-01 13:50:09+09:00,6544.0
12,0.816754,0.08486,0.624859,25.0,1.3e-05,0.978484,ieee-fraud-detection-220201-0851-013-92adb6ce,Completed,0.96859,2022-02-01 09:51:43+09:00,2022-02-01 10:15:35+09:00,1432.0
21,0.43216,0.06569,0.178875,29.0,0.127276,0.777599,ieee-fraud-detection-220201-0851-022-5581b53c,Completed,0.96857,2022-02-01 11:20:22+09:00,2022-02-01 11:39:02+09:00,1120.0
28,0.551691,0.170068,0.092938,21.0,0.517752,0.925255,ieee-fraud-detection-220201-0851-029-9127925e,Completed,0.96786,2022-02-01 12:04:40+09:00,2022-02-01 12:14:24+09:00,584.0
10,0.878699,0.023822,0.994,30.0,0.173732,0.958726,ieee-fraud-detection-220201-0851-011-ad633159,Completed,0.96756,2022-02-01 09:24:47+09:00,2022-02-01 11:04:41+09:00,5994.0
24,0.730329,0.172185,0.056092,29.0,4e-06,0.964586,ieee-fraud-detection-220201-0851-025-00163760,Completed,0.96689,2022-02-01 11:43:37+09:00,2022-02-01 12:01:42+09:00,1085.0


In [16]:
class HoverHelper:
    def __init__(self, tuning_job_analytics):
        self.tuning_job_analytics = tuning_job_analytics

    def hovertool(self):
        tooltips = [
            ("TrainingJobName", "@TrainingJobName"),
            ("FinalObjectiveValue", "@FinalObjectiveValue"),
        ]

        for key in self.tuning_job_analytics.tuning_ranges.keys():
            tooltips.append((key, f"@{key}"))

        hover_tool = HoverTool(tooltips=tooltips)
        return hover_tool

    def tools(
        self,
        standard_tools="pan, crosshair, wheel_zoom, zoom_in, zoom_out, undo, reset",
    ):
        return [self.hovertool(), standard_tools]


def make_grid(figures, n_cols):
    rows = []
    for i, figure in enumerate(figures):
        if i % n_cols == 0:
            cols = []
        if (i % n_cols == n_cols - 1) or (i == len(figures) - 1):
            rows.append(cols)
        cols.append(figure)
    return rows

In [17]:
if not ARE_PARAMS_ALREADY_TUNED:
    hover_helper = HoverHelper(tuning_job_analytics)

    p = figure(
        plot_width=800,
        plot_height=400,
        tools=hover_helper.tools(),
        title="Convergence Plot",
        x_axis_type="datetime",
        x_axis_label="Training Start Time",
        y_axis_label="AUROC",
    )
    _ = p.line(
        x="TrainingStartTime",
        y="FinalObjectiveValue",
        color="coral",
        line_width=1.5,
        source=df_viz,
    )
    _ = p.circle(
        x="TrainingStartTime",
        y="FinalObjectiveValue",
        line_color="coral",
        line_width=1.5,
        fill_color="white",
        source=df_viz,
    )

    p.xgrid.grid_line_color = None
    p.yaxis.formatter = NumeralTickFormatter(format="0.0%")
    p.title.align = "center"
    p.title.text_font_size = "12pt"

    show(p)

    p.output_backend = "svg"
    _ = export_svgs(p, filename=os.path.join(IMG_PATH, "convergence_plot.svg"))

In [18]:
if not ARE_PARAMS_ALREADY_TUNED:
    df_viz = df_viz.reset_index()
    df_viz["index"] = (df_viz["index"] + df_viz["index"].min()) / (
        df_viz["index"].max() - df_viz["index"].min()
    )

    figures = []
    for param_name, param_range in tuning_job_analytics.tuning_ranges.items():
        categorical_args = dict()
        if param_range.get("Values"):
            values = param_range["Values"]
            if sum([is_number(x) for x in values]) == len(values):
                print(
                    f"Hyperparameter {param_name} is tuned as categorical, but all values are numeric."
                )
            else:
                categorical_args["x_range"] = values

        p = figure(
            plot_width=400,
            plot_height=400,
            tools=hover_helper.tools(),
            x_axis_label=param_name,
            y_axis_label="AUROC",
            **categorical_args,
        )
        p.circle(
            source=df_viz,
            x=param_name,
            y="FinalObjectiveValue",
            color="black",
            alpha="index",
        )
        p.xgrid.grid_line_color = None
        p.yaxis.formatter = NumeralTickFormatter(format="0.0%")
        figures.append(p)

    grid_plot = gridplot(make_grid(figures, 3), toolbar_location="right")

    show(grid_plot)

    _ = export_png(
        grid_plot, filename=os.path.join(IMG_PATH, "partial_dependence_plot.png")
    )

# Model Evaluation
## Defining and Fitting Estimator

In [19]:
%%time
%%capture
params.update(best_params)
estimator.set_hyperparameters(**params)

train_input = TrainingInput(
    s3_data=f"s3://{default_bucket}/{base_job_prefix}/train/", content_type="text/csv"
)
estimator.fit({"train": train_input})

CPU times: user 17 s, sys: 1.66 s, total: 18.6 s
Wall time: 1h 55min 56s


## Defining Transformer and Prediction

In [20]:
%%time
%%capture
transformer = estimator.transformer(
    instance_count=1,
    instance_type="ml.m5.2xlarge",
    output_path=f"s3://{default_bucket}/{base_job_prefix}/pred",
)

_ = transformer.transform(
    data=f"s3://{default_bucket}/{base_job_prefix}/test/",
    content_type="text/csv",
    split_type="Line",
)

CPU times: user 749 ms, sys: 101 ms, total: 850 ms
Wall time: 5min 24s


#### Downloading Prediction Scores to Local Directory

In [21]:
s3_client.download_file(
    default_bucket,
    f"{base_job_prefix}/pred/arr_test.csv.out",
    os.path.join(PROC_DATA_PATH, "test", "arr_test.csv.out"),
)
scores = pd.read_csv(
    os.path.join(PROC_DATA_PATH, "test", "arr_test.csv.out"), header=None
).values

## Measuring Predictive Performance

In [22]:
predictions = get_prediction(scores)

plot_confusion_matrix(
    confusion_matrix(df_y_test, predictions),
    file_name=os.path.join(IMG_PATH, "conf_mat.svg"),
)

print(
    f"ACCURACY: {accuracy_score(df_y_test, predictions):.2%}, \
    PRECISION: {precision_score(df_y_test, predictions):.2%}, \
    RECALL: {recall_score(df_y_test, predictions):.2%}, \
    F1: {f1_score(df_y_test, predictions):.2%}"
)

ACCURACY: 98.86%,     PRECISION: 93.80%,     RECALL: 72.25%,     F1: 81.63%


In [23]:
plot_roc_curve(
    roc_curve(df_y_test, scores),
    roc_auc_score(df_y_test, scores),
    file_name=os.path.join(IMG_PATH, "roc_curve.svg"),
)

In [24]:
plot_pr_curve(
    precision_recall_curve(df_y_test, scores),
    average_precision_score(df_y_test, scores),
    file_name=os.path.join(IMG_PATH, "pr_curve.svg"),
)

# Model Re-training
### Data Loading, Splitting, Preprocessing and Uploading

In [25]:
test_identity = pd.read_csv(os.path.join(RAW_DATA_PATH, "test_identity.csv"))
test_transaction = pd.read_csv(os.path.join(RAW_DATA_PATH, "test_transaction.csv"))
df_test = pd.merge(test_transaction, test_identity, on="TransactionID", how="left")
df_test = df_test.rename(
    columns={"id-{:02d}".format(i): "id_{:02d}".format(i) for i in range(1, 39)}
)

df_test[int_cat_features] = df_test[int_cat_features].applymap(str_to_int)
df_test[cat_features] = df_test[cat_features].astype("str")

In [26]:
df_X_re_train, df_X_re_valid, df_y_re_train, df_y_re_valid = train_test_split(
    df_train[all_features],
    df_train["isFraud"],
    test_size=test_size,
    random_state=42,
    stratify=df_train["isFraud"],
)

X_re_train = processor.fit_transform(df_X_re_train, df_y_re_train)
X_re_valid = processor.transform(df_X_re_valid)
X_re_test = processor.transform(df_test[all_features])

arr_re_train = np.concatenate((df_y_re_train.values.reshape(-1, 1), X_re_train), axis=1)
arr_re_valid = np.concatenate((df_y_re_valid.values.reshape(-1, 1), X_re_valid), axis=1)
arr_re_test = X_re_test.copy()

In [27]:
dir_names = ["re_train", "re_valid", "re_test"]
file_names = ["arr_re_train", "arr_re_valid", "arr_re_test"]

for dir_name in dir_names:
    os.makedirs(os.path.join(PROC_DATA_PATH, dir_name), exist_ok=True)

for dir_name, file_name, dataset in zip(
    dir_names, file_names, [arr_re_train, arr_re_valid, arr_re_test]
):
    np.savetxt(
        os.path.join(PROC_DATA_PATH, dir_name, file_name) + ".csv",
        dataset,
        delimiter=",",
        fmt="%i",
    )

In [28]:
%%time
for dir_name, file_name in zip(dir_names, file_names):
    s3_client.upload_file(
        os.path.join(PROC_DATA_PATH, dir_name, file_name + ".csv"),
        default_bucket,
        base_job_prefix + "/" + dir_name + "/" + file_name + ".csv",
    )

CPU times: user 7.41 s, sys: 5.97 s, total: 13.4 s
Wall time: 5min 51s


## Defining and Fitting Estimator

In [29]:
%%time
%%capture
params.update(best_params)
estimator.set_hyperparameters(**params)

re_train_input = TrainingInput(
    s3_data=f"s3://{default_bucket}/{base_job_prefix}/re_train/", content_type="text/csv"
)
re_valid_input = TrainingInput(
    s3_data=f"s3://{default_bucket}/{base_job_prefix}/re_valid/", content_type="text/csv"
)

estimator.fit({"train": re_train_input, "validation": re_valid_input})

CPU times: user 7.07 s, sys: 726 ms, total: 7.8 s
Wall time: 49min 33s


## Defining Transformer and Prediction

In [30]:
%%time
%%capture
transformer = estimator.transformer(
    instance_count=1,
    instance_type="ml.m5.2xlarge",
    output_path=f"s3://{default_bucket}/{base_job_prefix}/re_pred",
)

_ = transformer.transform(
    data=f"s3://{default_bucket}/{base_job_prefix}/re_test/",
    content_type="text/csv",
    split_type="Line",
)

CPU times: user 852 ms, sys: 107 ms, total: 959 ms
Wall time: 6min 7s


#### Downloading Prediction Scores to Local Directory

In [31]:
s3_client.download_file(
    default_bucket,
    f"{base_job_prefix}/re_pred/arr_re_test.csv.out",
    os.path.join(PROC_DATA_PATH, "re_test", "arr_re_test.csv.out"),
)
scores = pd.read_csv(
    os.path.join(PROC_DATA_PATH, "re_test", "arr_re_test.csv.out"), header=None
).values

submission = pd.DataFrame(
    {"TransactionID": df_test["TransactionID"].values, "isFraud": scores.flatten()}
)
submission.to_csv(os.path.join(PROC_DATA_PATH, "submission.csv"), index=False)

In [32]:
columns = ["isFraud"] + all_features.tolist()
model_name = transformer.model_name
baseline = [arr_re_train[:, 1:].mean(axis=0).tolist()]

dump_pickle(
    os.path.join(MODEL_PATH, "clarify.pkl"), [columns, model_name, baseline]
)