In [None]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import KBinsDiscretizer
from copy import deepcopy
# Lift pandas' display limits so DataFrames are rendered with every column and row.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

import os
import sys

# Put the parent of the working directory on the import path (once) so that the
# local `dataset` package imported below can be resolved.
module_path = os.path.abspath('..')
if module_path not in sys.path:
    sys.path.append(module_path)

from dataset.kdd import KDDDataset
from dataset.vocab import Vocabulary

In [2]:
# Row cap forwarded to `pd.read_csv` for both CSV files; None reads every row.
nrows = None
# Set to "_timedelta" by the timestamp cell when ts_representation is
# "as int and delta"; no other visible cell reads it.
timedelta_suffix = ""

In [3]:
# NOTE(review): `config` is not read by any other visible cell — presumably a
# label for the target model setup; confirm whether it is still needed.
config = "TabBERT"

# Setup

In [4]:
# How the timestamp is turned into a feature. The cells below recognise
# "as int" and "as int and delta"; any value containing "delta" also switches
# on the per-account timedelta column.
ts_representation = "as int"
# When True, the non-categorical columns of `ordered_cols` (except the raw
# timestamp and "timedelta") are bucketed with KBinsDiscretizer.
quantize_num_cols = True
# Number of quantile bins handed to that discretizer.
n_bins = 50
# NOTE(review): not read by any visible cell — presumably consumed downstream; confirm.
add_step_sep_token = False

In [5]:
# Input CSVs, read with `pd.read_csv` and also passed to KDDDataset as `data_root`.
load_path_pretrain = '../data/kdd/pkdd99_pretraining_post-encoding.csv'
load_path_finetune = '../data/kdd/pkdd99_finetuning_post-encoding.csv'

# Initial value only: a later cell replaces it with the longest per-account history.
seq_len = 150 # NB: this is `t_max` in the UniTTab paper.
# Relative-frequency threshold for the rare-category replacement; that step is
# skipped unless this is > 0.
min_frequency = 0.
# Column whose values define one sequence (one group per distinct value).
grouping_col = "account_id"
# Name of the datetime column that the timestamp cell builds from Year/Month/Day.
ts_col = "timestamp"
# Feature columns, in the order they appear in every sample. The timestamp cell
# may append one more entry to this list (and to the indicator list below).
ordered_cols = ["amount_trans", "balance", "k_symbol", "operation", "type_trans", "Year", "Month", "Day", "weekday"]
# Parallel to `ordered_cols` (the two lists are zipped together): True marks a
# column that is already categorical, False one that is still numeric.
init_categorical_indicator = [False, False, True, True, True, True, True, True, True]
# NOTE(review): `delta_features` is not read by any visible cell — confirm it is still used.
delta_features = [
    {
        "name": "delta_days",
        "unit": "timedelta64[D]",
    },
]
# Column providing the fine-tuning target.
label_col = "status"

# Load data

In [None]:
def _read_split(csv_path, pretraining_flag):
    """Read one CSV split, tag every row with `pretraining_flag`, and print its shape."""
    frame = pd.read_csv(csv_path, nrows=nrows)
    frame["pretraining"] = pretraining_flag
    print(frame.shape)
    return frame


# Pretraining rows are tagged 1, fine-tuning rows 0, so that the two splits can
# be separated again after the shared preprocessing below.
raw_df_pt = _read_split(load_path_pretrain, 1)
df_pt = raw_df_pt.copy()

raw_df_ft = _read_split(load_path_finetune, 0)
df_ft = raw_df_ft.copy()

# Stack both splits; pd.concat keeps each split's own row labels here.
raw_df = pd.concat([raw_df_pt, raw_df_ft])
print(raw_df.shape)
# Work on a copy so that `raw_df` stays untouched by the transformations below.
df = raw_df.copy()

Handle timestamp representation

In [7]:
# Rebuild a datetime column from the Year/Month/Day columns.
# NOTE(review): the "19" prefix assumes `Year` holds two-digit years — confirm against the CSV.
df[ts_col] = pd.to_datetime(
    "19" + df["Year"].astype(str) + "-" + df["Month"].astype(str) + "-" + df["Day"].astype(str),
    format="%Y-%m-%d",
)
# Day of week derived from the rebuilt date (Monday=0 ... Sunday=6).
df["weekday"] = df[ts_col].dt.dayofweek

if ts_representation in ("as int", "as int and delta"):
    ts_int_col = f"{ts_col}_int"

    # Integer view of the datetime, min-max scaled to [0, 1]. The dtype is spelled
    # "int64" explicitly because plain `int` is platform-dependent and can resolve
    # to a 32-bit type, which cannot hold the datetime's integer representation.
    ts_as_int = df[ts_col].astype("int64").to_numpy().reshape(-1, 1)
    min_max_scaler = MinMaxScaler()
    df[ts_int_col] = min_max_scaler.fit_transform(ts_as_int)

    if ts_representation == "as int and delta":
        timedelta_suffix = "_timedelta"
        # NOTE(review): these columns are dropped from `df` but remain listed in
        # `ordered_cols`, which later cells index `df` with — confirm this mode
        # is still expected to work.
        df = df.drop(columns=["Year", "Month", "Day"])

    # Register the scaled timestamp as a numeric feature. The guard keeps the two
    # parallel lists in sync when this cell is executed more than once.
    if ts_int_col not in ordered_cols:
        ordered_cols.append(ts_int_col)
        init_categorical_indicator.append(False)

# Categories

Quantizing

In [None]:
if quantize_num_cols:
    # Columns that are never bucketed, whatever their categorical flag says.
    not_quantized = (ts_col, "timedelta")
    for position, (feature, is_categorical) in enumerate(zip(ordered_cols, init_categorical_indicator)):
        if is_categorical or feature in not_quantized:
            continue
        print(feature)
        # Quantile binning into `n_bins` ordinal buckets, fitted on the full column.
        binner = KBinsDiscretizer(n_bins=n_bins, encode="ordinal", strategy="quantile", subsample=None)
        column_2d = df[feature].to_numpy().reshape(-1, 1)
        df[feature] = binner.fit_transform(column_2d)
        # The column now holds bucket ids, so flag it as categorical for later cells.
        init_categorical_indicator[position] = True

Min frequency

In [9]:
if min_frequency > 0:
    for col, cat in zip(ordered_cols, init_categorical_indicator):
        # Only categorical columns are considered; "timedelta" is always skipped.
        if not cat or col in ["timedelta"]:
            continue
        print(col)
        counts = df[col].value_counts()
        shares = counts / counts.sum()
        rare_values = counts[shares < min_frequency].index
        # Collapse every category rarer than `min_frequency` into the value -1.
        is_rare = df[col].isin(rare_values)
        df[col] = np.where(is_rare, -1, df[col])
        print(f"Nb of affected rows: {(df[col] == -1).sum()}")

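Ordinal encoding of the categories. Each categorical column is re-encoded and shifted into one common vocabulary: ids start at `N_SPECIAL_TOKENS`, and every column continues where the previous one ended (so the resulting ids are not local `1..n_category` ids).
An ordinal encoding had already been applied upstream (but starting at 0 instead of 1).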

In [None]:
# Ids below this offset are left unused by the encoding below.
# NOTE(review): presumably reserved for the Vocabulary's special tokens — confirm the count.
N_SPECIAL_TOKENS = 7
# Running offset: number of ids already handed out to previous columns.
vocab_start = 0
for col, cat in zip(ordered_cols, init_categorical_indicator):
    if not cat:
        continue
    print(col)
    ordinal_enc = OrdinalEncoder()
    # Values are encoded as strings, so the encoder orders categories by their
    # text form rather than numerically.
    as_text = df[col].to_numpy().astype(str).reshape(-1, 1)
    local_ids = ordinal_enc.fit_transform(as_text)
    # Shift the per-column ids into the shared (global) id space.
    df[col] = local_ids.ravel() + N_SPECIAL_TOKENS + vocab_start # Common vocab
    vocab_start += len(ordinal_enc.categories_[0])

In [None]:
# Preview the first ten encoded rows.
df.head(10)

In [None]:
# Average number of transactions per client (non-null `Day` entries per account).
df.groupby("account_id")["Day"].count().mean()

In [13]:
# Largest number of transactions for a single client (non-null `Day` entries per account).
max_len = df.groupby("account_id")["Day"].count().max()

# From here on every sequence may be as long as the longest history; this
# replaces the `seq_len` value set in the configuration cell.
seq_len = max_len

# Make sequential dataset

Vocab

In [14]:
file_name = '../data/kdd/vocab.nb'

vocab = Vocabulary()
vocab.filename = file_name

if "delta" in ts_representation:
    vocab.timedelta_colid = 6 # Last ordered col will be timedelta

# Every feature column except "timedelta" becomes a vocabulary field.
vocab_fields = [field for field in ordered_cols if field != "timedelta"]
vocab.set_field_keys(list(vocab_fields))
all_vocab = []
for field in vocab_fields:
    # Register each distinct (already encoded) value of the column under its field.
    for token in df[field].drop_duplicates().tolist():
        vocab.set_id(token, field, return_local=False)

vocab.save_vocab(file_name)

Pretraining dataset

In [15]:
# Whether samples get an extra "timedelta" column (days since the account's first row).
use_delta = "delta" in ts_representation

# Pretraining rows, ordered by timestamp, with at most `seq_len` rows kept per account.
seq_df_pt = (
    df[df["pretraining"] == 1]
    .sort_values(ts_col)
    .groupby(grouping_col)
    .head(seq_len)
)
# The raw timestamp is only carried along when the timedelta has to be derived from it.
seq_df_pt = seq_df_pt[ordered_cols + ([ts_col] if use_delta else []) + [grouping_col]]

# Columns of every sample. `ncols` is fixed here, once, instead of inside the
# loop, so it is defined even when there is no account to iterate over.
feature_cols = ordered_cols + (["timedelta"] if use_delta else [])
ncols = len(feature_cols)

dataset_pt = []
labels_pt = []
for account, group in seq_df_pt.groupby(grouping_col):
    if use_delta:
        # `assign` returns a new frame, so the groupby slice itself is never written to.
        group = group.assign(timedelta=(group[ts_col] - group[ts_col].iloc[0]).dt.days)
    dataset_pt.append(group[feature_cols].values)
    # Pretraining samples carry the placeholder target -1.
    labels_pt.append(-1)

In [None]:
# Per-column flag: True if the pretraining frame has at least one missing value in that column.
seq_df_pt.isnull().any(axis=0)

In [17]:
# Wrap the pretraining samples together with the shared vocabulary.
kdd_pt = KDDDataset(
    samples=dataset_pt,
    targets=labels_pt,
    vocab=vocab,
    ncols=ncols,
    seq_len=seq_len,
    data=seq_df_pt,
    data_root=load_path_pretrain,
)

Fine tuning dataset (all transactions -> Last tmax handled by data collator)

In [18]:
# Whether samples get an extra "timedelta" column (days since the account's first row).
use_delta = "delta" in ts_representation

# Fine-tuning rows, ordered by timestamp, with at most `seq_len` rows kept per account.
seq_df_ft = (
    df[df["pretraining"] == 0]
    .sort_values(ts_col)
    .groupby(grouping_col)
    .head(seq_len)
)
# The raw timestamp is only carried along when the timedelta has to be derived from it.
seq_df_ft = seq_df_ft[ordered_cols + ([ts_col] if use_delta else []) + [grouping_col, label_col]]

# Columns of every sample. `ncols` is fixed here, once, instead of inside the
# loop, so it is defined even when there is no account to iterate over.
feature_cols = ordered_cols + (["timedelta"] if use_delta else [])
ncols = len(feature_cols)

dataset_ft = []
labels_ft = []
for account, group in seq_df_ft.groupby(grouping_col):
    if use_delta:
        # `assign` returns a new frame, so the groupby slice itself is never written to.
        group = group.assign(timedelta=(group[ts_col] - group[ts_col].iloc[0]).dt.days)
    dataset_ft.append(group[feature_cols].values)
    # The target of a sequence is the label found on its last row.
    labels_ft.append(group[label_col].values[-1])

In [None]:
# Per-column flag: True if the fine-tuning frame has at least one missing value in that column.
seq_df_ft.isnull().any(axis=0)

In [20]:
# Wrap the full-history fine-tuning samples together with the shared vocabulary.
kdd_ft = KDDDataset(
    samples=dataset_ft,
    targets=labels_ft,
    vocab=vocab,
    ncols=ncols,
    seq_len=seq_len,
    data=seq_df_ft,
    data_root=load_path_finetune,
)

In [None]:
# Share of fine-tuning sequences whose target equals 1.
sum(1 for label in labels_ft if label == 1) / len(labels_ft) # ~11%

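Fine tuning dataset (consecutive windows of `tmax` transactions, cut backwards from the end of each account's history — deterministic, not randomly sampled) -> Increase dataset size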

In [22]:
# Window length (in transactions) and step between the ends of consecutive windows.
# With stride == tmax the windows are non-overlapping.
tmax = 10
stride = tmax

# Fine-tuning rows, ordered by timestamp, with at most `seq_len` rows kept per account.
seq_df_ft_rand = (
    df[df["pretraining"] == 0]
    .sort_values(ts_col)
    .groupby(grouping_col)
    .head(seq_len)
)
if "delta" in ts_representation:
    seq_df_ft_rand = seq_df_ft_rand[ordered_cols + [ts_col, grouping_col, label_col]]
else:
    seq_df_ft_rand = seq_df_ft_rand[ordered_cols + [grouping_col, label_col]]

# Samples here only ever contain `ordered_cols` (no "timedelta" column is added
# in this cell), so the width is known up front and never left stale or undefined.
ncols = len(ordered_cols)

dataset_ft_rand = []
labels_ft_rand = []
for account, group in seq_df_ft_rand.groupby(grouping_col):
    features = group[ordered_cols].values
    # Every window of an account inherits the label found on the account's last row.
    label = group[label_col].values[-1]
    n_rows = features.shape[0]

    if n_rows < tmax:
        # Too short to fill one window: keep the whole history as a single sample.
        dataset_ft_rand.append(features)
        labels_ft_rand.append(label)
        continue

    # Windows of exactly `tmax` rows, walking backwards from the most recent row
    # in steps of `stride`. For stride == tmax this yields n_rows // tmax windows.
    n_examples = (n_rows - tmax) // stride + 1
    for i in range(n_examples):
        window_end = n_rows - i * stride
        dataset_ft_rand.append(features[window_end - tmax:window_end])
        labels_ft_rand.append(label)
assert len(labels_ft_rand) == len(dataset_ft_rand)

In [23]:
# Wrap the windowed fine-tuning samples. `data` is this dataset's own source
# frame (`seq_df_ft_rand`), so the cell does not depend on `seq_df_ft` from the
# full-history fine-tuning cell.
# NOTE(review): `seq_len` is still the longest per-account history although the
# samples hold at most `tmax` rows — confirm this is what KDDDataset expects.
kdd_ft_rand = KDDDataset(
    samples=dataset_ft_rand,
    targets=labels_ft_rand,
    vocab=vocab,
    ncols=ncols,
    seq_len=seq_len,
    data=seq_df_ft_rand,
    data_root=load_path_finetune,
)

In [None]:
# Share of windowed fine-tuning samples whose target equals 1.
sum(1 for label in labels_ft_rand if label == 1) / len(labels_ft_rand) # ~10%

In [None]:
# Number of samples in the windowed fine-tuning set (there is one label per sample).
len(labels_ft_rand)

# Save 

In [26]:
# Persist the pretraining dataset object as a pickle.
save_path_pt = "../data/kdd/KDDDataset_pt.pkl"

with open(save_path_pt, mode="wb") as pickle_file:
    pickle.dump(kdd_pt, pickle_file)

In [27]:
# Persist the full-history fine-tuning dataset object as a pickle.
save_path_ft = "../data/kdd/KDDDataset_ft.pkl"

with open(save_path_ft, mode="wb") as pickle_file:
    pickle.dump(kdd_ft, pickle_file)

In [28]:
# Persist the windowed fine-tuning dataset under its own file name. Writing it
# to "KDDDataset_ft.pkl" would overwrite the full-history fine-tuning pickle
# saved by the previous cell.
# NOTE(review): any consumer that expected the windowed dataset inside
# "KDDDataset_ft.pkl" must now load this file instead.
save_path_ft_rand = "../data/kdd/KDDDataset_ft_rand.pkl"

with open(save_path_ft_rand, "wb") as f:
    pickle.dump(kdd_ft_rand, f)