In [1]:
import os
from typing import Literal, Union

import numpy as np
import optuna
import pandas as pd
import torch
import torch.nn.functional as F
import wandb
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier

from deepod.models.icl import ICLNet
from omegaconf import OmegaConf
from pytorch_lightning import LightningModule, Trainer, seed_everything
from pytorch_lightning.callbacks import (
    EarlyStopping,
    ModelCheckpoint,
    ModelSummary,
    RichProgressBar,
)
from pytorch_lightning.loggers import WandbLogger
from pytorch_tabular.config import DataConfig
from pytorch_tabular.tabular_datamodule import TabularDatamodule
from torchmetrics.classification import BinaryAUROC, BinaryF1Score


In [2]:
BATCH_SIZE = 2048
SEED = 42

if os.environ.get("USER") is not None and os.environ.get("SLURM_JOB_ID") is not None:
    CKPT_DIR = f"/checkpoint/{os.environ['USER']}/{os.environ['SLURM_JOB_ID']}"
    DATA_DIR = "/ssd003/projects/aieng/public/ssl_bootcamp_resources/datasets/bank_account_fraud_dataset/"
else:  # not on Vector's cluster
    CKPT_DIR = "checkpoint"
    DATA_DIR = "data"  # NOTE: download the data from https://www.kaggle.com/datasets/sgpjesus/bank-account-fraud-dataset-neurips-2022

seed_everything(SEED)


[rank: 0] Global seed set to 42


42

## Data Preprocessing and Loading

In [3]:
df = pd.read_csv(os.path.join(DATA_DIR, "Base.csv"))


In [4]:
continuous_cols = [
    "income",
    "name_email_similarity",
    "prev_address_months_count",
    "current_address_months_count",
    "customer_age",
    "days_since_request",
    "intended_balcon_amount",
    "zip_count_4w",
    "velocity_6h",
    "velocity_24h",
    "velocity_4w",
    "bank_branch_count_8w",
    "date_of_birth_distinct_emails_4w",
    "credit_risk_score",
    "bank_months_count",
    "proposed_credit_limit",
    "session_length_in_minutes",
    "device_distinct_emails_8w",
    "device_fraud_count",
]

categorical_cols = [
    "payment_type",
    "employment_status",
    "housing_status",
    "device_os",
    "email_is_free",
    "phone_home_valid",
    "phone_mobile_valid",
    "has_other_cards",
    "foreign_request",
    "keep_alive_session",
    "source",
]

target_col = ["fraud_bool"]


In [5]:
# one-hot encode categorical columns in for loop
df_concat_lst = []
for col in categorical_cols:
    one_hot = pd.get_dummies(df[col], prefix=col)
    df = df.drop(col, axis=1)
    df_concat_lst.append(one_hot)

df = pd.concat([df] + df_concat_lst, axis=1)

df.tail()  # noqa: B018


Unnamed: 0,fraud_bool,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,zip_count_4w,velocity_6h,...,phone_mobile_valid_0,phone_mobile_valid_1,has_other_cards_0,has_other_cards_1,foreign_request_0,foreign_request_1,keep_alive_session_0,keep_alive_session_1,source_INTERNET,source_TELEAPP
999995,0,0.6,0.192631,-1,104,40,0.030592,-1.044454,804,7905.711839,...,False,True,True,False,True,False,False,True,True,False
999996,0,0.8,0.322989,148,9,50,1.628119,-1.409803,3306,5391.470463,...,False,True,True,False,True,False,True,False,True,False
999997,0,0.8,0.879403,-1,30,20,0.018563,34.69276,1522,8063.102636,...,False,True,True,False,True,False,True,False,True,False
999998,0,0.9,0.762112,-1,189,20,0.015352,94.661055,1418,8092.641762,...,True,False,True,False,True,False,False,True,True,False
999999,0,0.2,0.697452,-1,321,20,2.655916,9.908499,951,6169.630036,...,False,True,True,False,True,False,True,False,True,False


In [6]:
# update the categorical columns
categorical_cols = [
    col for col in df.columns if col.startswith(tuple(categorical_cols))
]
print(categorical_cols)

['payment_type_AA', 'payment_type_AB', 'payment_type_AC', 'payment_type_AD', 'payment_type_AE', 'employment_status_CA', 'employment_status_CB', 'employment_status_CC', 'employment_status_CD', 'employment_status_CE', 'employment_status_CF', 'employment_status_CG', 'housing_status_BA', 'housing_status_BB', 'housing_status_BC', 'housing_status_BD', 'housing_status_BE', 'housing_status_BF', 'housing_status_BG', 'device_os_linux', 'device_os_macintosh', 'device_os_other', 'device_os_windows', 'device_os_x11', 'email_is_free_0', 'email_is_free_1', 'phone_home_valid_0', 'phone_home_valid_1', 'phone_mobile_valid_0', 'phone_mobile_valid_1', 'has_other_cards_0', 'has_other_cards_1', 'foreign_request_0', 'foreign_request_1', 'keep_alive_session_0', 'keep_alive_session_1', 'source_INTERNET', 'source_TELEAPP']


In [7]:
# from sklearn.model_selection import train_test_split
# train_df, test_df = train_test_split(
#     df, test_size=0.2, random_state=SEED, stratify=df[target_col]
# )

# drop the month column, but keep it for later
month = df["month"].values
df = df.drop(columns=["month"])

# select the first 6 months of data for training and the last 2 months for testing
train_df = df[month < 6]
test_df = df[month >= 6]


In [8]:
data_cfg = DataConfig(
    target=target_col,
    continuous_cols=continuous_cols,
    categorical_cols=categorical_cols,
    continuous_feature_transform="quantile_normal",
    normalize_continuous_features=True,
    validation_split=0.3,
)
data_cfg = OmegaConf.structured(data_cfg)

cfg = OmegaConf.merge(
    OmegaConf.to_container(data_cfg),
    OmegaConf.create(
        {
            "batch_size": BATCH_SIZE,
            "task": "classification",
            "num_workers": 4,
            "pin_memory": True,
        }
    ),
)

data = TabularDatamodule(train=train_df, test=test_df, config=cfg, seed=SEED)


## Model

In [9]:
class ICLModule(LightningModule):
    """Lightning module for ICL model.

    Parameters
    ----------
    n_features : int
        The number of features in the data.
    rep_dim : int, optional, default=128
        The dimension of the representation.
    hidden_dims : tuple, optional, default=(128, 64)
        The number of hidden units in each layer of the encoders.
    lr : float, optional, default=1e-3
        The learning rate.
    act : str, optional, default="LeakyReLU"
        The activation function.
    bias : bool, optional, default=False
        Whether to use bias in the encoders.
    kernel_size : int or str, optional, default="auto"
        The kernel size of the sliding window.
    temperature : float, optional, default=0.01
        The temperature parameter for the softmax function.
    max_negatives : int, optional, default=1000
        The maximum number of negative samples.
    contamination : float, optional, default=0.01
        The contamination rate.
    train_method : str, optional, default="loe_hard"
        The training method. Choose from "blind", "loe_hard", "loe_soft", "refine", or "gt".
        - "blind": train the model without any labels.
        - "loe_hard": train the model with the hard labels from the LOE.
        - "loe_soft": train the model with the soft labels from the LOE.
        - "refine": use the loss on the normal samples to rank the samples and
            select the top (1 - contamination) as anomalies.
        - "gt": use the ground truth labels to train the model.

    """

    def __init__(
        self,
        n_features: int,
        rep_dim: int = 128,
        hidden_dims: tuple = (128, 64),
        lr: float = 1e-3,
        act: str = "LeakyReLU",
        bias: bool = False,
        kernel_size: Union[int, str] = "auto",
        temperature: float = 0.01,
        max_negatives: int = 1000,
        contamination: float = 0.01,
        train_method: Literal[
            "blind", "loe_hard", "loe_soft", "refine", "gt"
        ] = "loe_hard",
    ) -> None:
        """Initialize the ICL lightning module."""
        super().__init__()

        self.save_hyperparameters()
        assert train_method in [
            "blind",
            "loe_hard",
            "loe_soft",
            "refine",
            "gt",
        ], "Unknown training method. Please choose from 'blind', 'loe_hard', 'loe_soft', 'refine', or 'gt'."

        if n_features < 3:
            raise ValueError(
                "ICL model cannot handle the data that have less than three features."
            )

        if kernel_size == "auto":
            if n_features < 40:
                kernel_size = 2
            elif 40 <= n_features <= 160:
                kernel_size = 10
            elif 160 < n_features <= 240:
                kernel_size = n_features - 150
            elif 240 < n_features <= 480:
                kernel_size = n_features - 200
            else:
                kernel_size = n_features - 400

        print("Number of features:", n_features)
        print("Kernel size:", kernel_size)

        self.model = ICLNet(
            n_features=n_features,
            kernel_size=kernel_size,
            hidden_dims=hidden_dims,
            rep_dim=rep_dim,
            activation=act,
            bias=bias,
        )

        self.contamination = contamination
        self.train_method = train_method
        self.max_negatives = max_negatives
        self.tau = temperature

        self.auroc = BinaryAUROC(thresholds=10)
        self.f1_score = BinaryF1Score(threshold=0.5)

    def _cal_logit(self, query: torch.Tensor, pos: torch.Tensor) -> torch.Tensor:
        """Calculate the logits for the query and positive samples."""
        n_pos = query.shape[1]
        batch_size = query.shape[0]

        # get negatives
        negative_index = np.random.choice(
            np.arange(n_pos), min(self.max_negatives, n_pos), replace=False
        )
        negative = pos.permute(0, 2, 1)[:, :, negative_index]

        pos_multiplication = (query * pos).sum(dim=2).unsqueeze(2)

        neg_multiplication = torch.matmul(query, negative)  # [batch_size, n_neg, n_neg]

        # Removal of the diagonals
        identity_matrix = torch.eye(n_pos).unsqueeze(0).to(self.device)
        identity_matrix = identity_matrix.repeat(batch_size, 1, 1)
        identity_matrix = identity_matrix[:, :, negative_index]

        neg_multiplication.masked_fill_(identity_matrix == 1, -float("inf"))

        logit = torch.cat((pos_multiplication, neg_multiplication), dim=2)
        logit = torch.div(logit, self.tau)

        return logit

    def forward(self, batch: dict, loss_reduction: str = "mean") -> torch.Tensor:
        """Run forward pass."""
        # combine batch['continuous'] and batch['categorical'] into x
        cont_x = batch["continuous"]
        cat_x = batch["categorical"]
        x = torch.cat((cont_x, cat_x), dim=1)

        # positives are sub-vectors, query are their complements
        # shape: [batch_size, n_features - 1, rep_dim]
        positives, query = self.model(x.float())

        logit = self._cal_logit(query, positives)
        logit = logit.permute(0, 2, 1)  # [batch_size, n_pos, n_pos - 1]

        correct_class = torch.zeros(
            (logit.shape[0], logit.shape[2]), dtype=torch.long
        ).to(self.device)

        loss_n = F.cross_entropy(logit.float(), correct_class, reduction=loss_reduction)

        complement_of_logit = torch.where(logit == float("-inf"), logit, 1 - logit)
        loss_a = F.cross_entropy(
            complement_of_logit, correct_class, reduction=loss_reduction
        )

        return loss_n.float().mean(dim=1), loss_a.float().mean(dim=1)

    def _compute_mean_loss(
        self, loss_n: torch.Tensor, loss_a: torch.Tensor, labels: torch.Tensor = None
    ) -> torch.Tensor:
        """Compute the loss based on the training method.

        Parameters
        ----------
        loss_n : torch.Tensor
            The loss on the normal samples.
        loss_a : torch.Tensor
            The loss on the anomaly samples.
        labels : torch.Tensor, optional, default=None
            The ground truth labels.

        Returns
        -------
        loss : torch.Tensor
            The loss.
        """
        score = loss_n - loss_a

        if self.train_method == "blind":
            loss = loss_n.mean()
        elif self.train_method == "loe_hard":
            _, idx_n = torch.topk(
                score,
                int(score.shape[0] * (1 - self.contamination)),
                largest=False,
                sorted=False,
            )
            _, idx_a = torch.topk(
                score,
                int(score.shape[0] * self.contamination),
                largest=True,
                sorted=False,
            )
            loss = torch.cat([loss_n[idx_n], loss_a[idx_a]], 0).mean()
        elif self.train_method == "loe_soft":
            _, idx_n = torch.topk(
                score,
                int(score.shape[0] * (1 - self.contamination)),
                largest=False,
                sorted=False,
            )
            _, idx_a = torch.topk(
                score,
                int(score.shape[0] * self.contamination),
                largest=True,
                sorted=False,
            )
            loss = torch.cat(
                [loss_n[idx_n], 0.5 * loss_n[idx_a] + 0.5 * loss_a[idx_a]], 0
            ).mean()
        elif self.train_method == "refine":
            _, idx_n = torch.topk(
                loss_n,
                int(loss_n.shape[0] * (1 - self.contamination)),
                largest=False,
                sorted=False,
            )
            loss = loss_n[idx_n].mean()
        elif self.train_method == "gt" and labels is not None:
            loss = torch.cat([loss_n[labels == 0], loss_a[labels == 1]], 0).mean()
        else:
            raise ValueError(
                f"Unknown training method: {self.train_method}. "
                "Please choose from 'blind', 'loe_hard', 'loe_soft', 'refine', "
                "or 'gt'."
            )

        return loss

    def training_step(self, batch: dict, batch_idx: int) -> torch.Tensor:
        """Run training step."""
        loss_n, loss_a = self(batch, loss_reduction="none")
        loss = self._compute_mean_loss(loss_n, loss_a, labels=batch["target"].squeeze())

        self.log("train/loss", loss, prog_bar=True, sync_dist=True)
        return loss

    def validation_step(self, batch: dict, batch_idx: int) -> None:
        """Run validation step."""
        loss_n, loss_a = self(batch, loss_reduction="none")
        loss = self._compute_mean_loss(loss_n, loss_a, labels=batch["target"].squeeze())

        self.log("val/loss", loss, prog_bar=True, sync_dist=True)

        targets = batch["target"].squeeze()

        # get the predictions
        if self.contamination is None:
            self.contamination = targets.float().mean().item()

        scores = loss_n - loss_a
        threshold = torch.quantile(scores, q=(1 - self.contamination)).to(self.device)
        preds = (scores > threshold).long().ravel()

        # compute the metrics
        self.auroc(preds, targets)
        self.log("val/auroc", self.auroc, prog_bar=True, sync_dist=True)

        self.f1_score(preds, targets)
        self.log("val/f1_score", self.f1_score, prog_bar=True, sync_dist=True)

    def test_step(self, batch: dict, batch_idx: int) -> None:
        """Run test step."""
        targets = batch["target"].squeeze()

        loss_n, loss_a = self(batch, loss_reduction="none")
        loss = self._compute_mean_loss(loss_n, loss_a, labels=targets)

        self.log("test/loss", loss, prog_bar=True, sync_dist=True)

        # get the predictions
        if self.contamination is None:
            self.contamination = targets.float().mean().item()

        scores = loss_n - loss_a
        threshold = torch.quantile(scores, q=(1 - self.contamination)).to(self.device)
        preds = (scores > threshold).long().ravel()

        # compute the metrics
        self.auroc(preds, targets)
        self.log("test/auroc", self.auroc, prog_bar=True, sync_dist=True)

        self.f1_score(preds, targets)
        self.log("test/f1_score", self.f1_score, prog_bar=True, sync_dist=True)

    def configure_optimizers(self) -> torch.optim.Optimizer:
        """Configure the optimizer."""
        return torch.optim.Adam(
            self.model.parameters(), lr=self.hparams.lr, weight_decay=1e-5
        )

## Training

### ICL + LOE

In [None]:
%env "WANDB_NOTEBOOK_NAME" "ICL_LOE_BAF.ipynb"
%env WANDB_SILENT=True

In [None]:
for training_method in ["blind", "refine", "loe_hard", "loe_soft", "gt"]:
    model = ICLModule(
        n_features=(len(continuous_cols) + len(categorical_cols)),
        train_method=training_method,
        hidden_dims=(192, 96),
        rep_dim=128,
        bias=True,
        lr=1e-2,
    )

    trainer = Trainer(
        accelerator="auto",
        precision="16",
        benchmark=True,
        max_epochs=-1,
        log_every_n_steps=10,
        default_root_dir=os.path.join(CKPT_DIR, training_method),
        callbacks=[
            RichProgressBar(),
            ModelSummary(max_depth=5),
            EarlyStopping(monitor="val/loss", patience=25, stopping_threshold=1e-2),
            ModelCheckpoint(
                dirpath=CKPT_DIR,
                filename="ICL-BAF-{training_method}-{epoch:02d}",
                monitor="val/loss",
                save_top_k=3,
                mode="min",
            ),
        ],
        logger=WandbLogger(
            project="icl-baf",
            name=f"icl-baf-{training_method}",
            entity="vector-ssl-bootcamp",
            save_dir=CKPT_DIR,
        ),
    )

    trainer.fit(model, datamodule=data)
    trainer.test(model, dataloaders=data.test_dataloader())
    wandb.finish()


In [None]:
# trainer.fit(model, datamodule=data)
# wandb.finish()


### Baseline - XGBoost

In [None]:
def get_indices(adf: pd.DataFrame, month: np.ndarray):
    """Yield indices of the training and validation sets (split by month)."""
    for i in range(6):
        train_idx = adf[(month != i) & (month < 6)].index.values
        val_idx = adf[month == i].index.values
        yield train_idx, val_idx


In [None]:
for t_idx, v_idx in get_indices(df, month):
    print(len(t_idx), len(v_idx))


In [10]:
pos_prop = df[target_col].mean().values[0]


def objective(trial: optuna.Trial) -> float:
    params = {
        "max_depth": trial.suggest_int("max_depth", 1, 11),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000, 100),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 50),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 0.7),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 1e-8, 1e-2),
        "lambda": trial.suggest_float("lambda", 1, 4),
        "alpha": trial.suggest_float("alpha", 1e-4, 10),
    }

    params.update(
        {
            "objective": "binary:logistic",
            "eval_metric": "auc",
            "scale_pos_weight": (1 - pos_prop) / pos_prop,
        }
    )
    model = XGBClassifier(
        **params,
        random_state=SEED,
        early_stopping_rounds=20,
    )
    model.fit(
        train_df[continuous_cols + categorical_cols],
        train_df[target_col],
        eval_set=[(test_df[continuous_cols + categorical_cols], test_df[target_col])],
        verbose=False,
    )
    pred = model.predict_proba(test_df[continuous_cols + categorical_cols])[:, 1]
    auc = roc_auc_score(test_df[target_col], pred)
    return auc

In [11]:
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=SEED),
    load_if_exists=True,
)
study.optimize(objective, callbacks=[optuna.study.MaxTrialsCallback(25)])


[I 2023-09-21 14:19:13,020] A new study created in memory with name: no-name-511eaa94-fbdf-4b24-af75-92fdd0ad2908
[I 2023-09-21 14:20:22,573] Trial 0 finished with value: 0.893791366913425 and parameters: {'max_depth': 5, 'n_estimators': 1000, 'min_child_weight': 37, 'subsample': 0.7993292420985183, 'learning_rate': 0.10922148812330114, 'colsample_bylevel': 0.5779972601681014, 'colsample_bytree': 0.5290418060840998, 'gamma': 0.008661762795987894, 'lambda': 2.8033450352296265, 'alpha': 7.080754970702675}. Best is trial 0 with value: 0.893791366913425.
[I 2023-09-21 14:21:55,628] Trial 1 finished with value: 0.894775287060374 and parameters: {'max_depth': 1, 'n_estimators': 1000, 'min_child_weight': 42, 'subsample': 0.6061695553391381, 'learning_rate': 0.12728565879529838, 'colsample_bylevel': 0.5917022549267169, 'colsample_bytree': 0.6521211214797689, 'gamma': 0.005247569068758062, 'lambda': 2.2958350559263474, 'alpha': 2.9123622790663997}. Best is trial 1 with value: 0.894775287060374.

In [12]:
best_params = study.best_params
best_params.update(
    {
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "scale_pos_weight": (1 - pos_prop) / pos_prop,
    }
)
best_params

{'max_depth': 3,
 'n_estimators': 500,
 'min_child_weight': 27,
 'subsample': 0.8708441534127347,
 'learning_rate': 0.11869637821439707,
 'colsample_bylevel': 0.659143125062404,
 'colsample_bytree': 0.677399475072992,
 'gamma': 0.0032616943531588046,
 'lambda': 2.231820237292188,
 'alpha': 2.8417706167222225,
 'objective': 'binary:logistic',
 'eval_metric': 'auc',
 'scale_pos_weight': 89.67005168192946}

In [13]:
base_model = XGBClassifier(**best_params, random_state=SEED, early_stopping_rounds=20)

base_model.fit(
    train_df[continuous_cols + categorical_cols],
    train_df[target_col],
    eval_set=[(test_df[continuous_cols + categorical_cols], test_df[target_col])],
    verbose=False,
)


In [14]:
prediction = base_model.predict_proba(test_df[continuous_cols + categorical_cols])[:, 1]
target = test_df[target_col]
auc_score = {"AUC": roc_auc_score(target, prediction)}
print(auc_score)


{'AUC': 0.8960114321002781}
