In [2]:
import os

os.environ["OMP_NUM_THREADS"] = "16"

import pickle
import warnings
from functools import partial

import numpy as np 
import pandas as pd
import pytorch_lightning as pl
import torch

warnings.filterwarnings("ignore")

import lightgbm as ltb
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.data_load.iterable_processing.feature_filter import FeatureFilter
from ptls.data_load.iterable_processing.iterable_seq_len_limit import ISeqLenLimit
from ptls.data_load.iterable_processing.to_torch_tensor import ToTorch
from ptls.data_load.iterable_processing_dataset import IterableProcessingDataset
from ptls.data_load.utils import collate_feature_dict
from ptls.frames import PtlsDataModule
from ptls.frames.coles import ColesIterableDataset, CoLESModule
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.nn import RnnSeqEncoder, TrxEncoder
from ptls.preprocessing import PandasDataPreprocessor
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

# Part 1: preprocess the raw transactions and cache the result

In [None]:
# Read the raw transaction tables for both splits (train first, then test).
transactions_train, transactions_test = (
    pd.read_parquet(path) for path in ("trx_train.parquet", "trx_test.parquet")
)

In [None]:
# Load the preprocessor object from `transactions_preprocessor.pkl`.
# NOTE(review): `pickle.load` can execute arbitrary code from the file — only
# load artifacts that were produced by a trusted run of this project.
with open("transactions_preprocessor.pkl", "rb") as f:
    preprocessor = pickle.load(f)

In [None]:
# Run the loaded preprocessor's `transform` over both raw tables; the same
# preprocessor object is used for train and test.
processed_train = preprocessor.transform(transactions_train)
processed_test = preprocessor.transform(transactions_test)

IOStream.flush timed out


In [None]:
# Cache the transformed data on disk; later cells reload these files with
# `pd.read_pickle` instead of repeating the transform.
processed_train.to_pickle("processed_train.pkl")
processed_test.to_pickle("processed_test.pkl")

IOStream.flush timed out


# Part 2: train the CoLES transaction encoder

In [None]:
# Reload the cached, preprocessed train split from disk.
processed_train = pd.read_pickle("processed_train.pkl")

In [None]:
# Reload the cached, preprocessed test split from disk.
processed_test = pd.read_pickle("processed_test.pkl")

In [6]:
# Load the preprocessor object again; its category dictionary sizes are used
# below to size the embedding tables.
# NOTE(review): `pickle.load` can execute arbitrary code from the file — only
# load artifacts that were produced by a trusted run of this project.
with open("transactions_preprocessor.pkl", "rb") as f:
    preprocessor = pickle.load(f)

In [None]:
# Fixed seed for the training subsample so that a re-run trains on the same rows.
TRAIN_SAMPLE_SEED = 42
# Upper bound on the number of training records held in memory.
TRAIN_SAMPLE_SIZE = 500_000


def _coles_filters():
    """Return a fresh filter chain for one CoLES dataset.

    A new list of filter objects is built on every call so that the train and
    test datasets never share filter instances.
    """
    return [
        # The id and the target columns are removed from the records before training.
        FeatureFilter(
            drop_feature_names=[
                "client_id",
                "target_1",
                "target_2",
                "target_3",
                "target_4",
            ]
        ),
        SeqLenFilter(min_seq_len=128),
        ISeqLenLimit(max_seq_len=2896),
        ToTorch(),
    ]


train = MemoryMapDataset(
    # `min(...)` keeps the cell usable on frames smaller than the sample size
    # (DataFrame.sample raises when n exceeds the number of rows).
    data=processed_train.sample(
        n=min(TRAIN_SAMPLE_SIZE, len(processed_train)),
        random_state=TRAIN_SAMPLE_SEED,
    ).to_dict("records"),
    i_filters=_coles_filters(),
)

test = MemoryMapDataset(
    data=processed_test.to_dict("records"),
    i_filters=_coles_filters(),
)

In [None]:
# Wrap both in-memory datasets for CoLES; each gets its own SampleSlices
# splitter configured with the same settings.
train_ds, valid_ds = (
    ColesIterableDataset(
        data=split,
        splitter=SampleSlices(split_count=5, cnt_min=32, cnt_max=160),
    )
    for split in (train, test)
)

In [None]:
# Bundle the train and validation datasets into one PtlsDataModule.
# Despite the `_dl` suffix this is a data module, not a DataLoader; it is the
# object later handed to `trainer.fit`. Both splits use the same worker count
# and batch size.
train_dl = PtlsDataModule(
    train_data=train_ds,
    train_num_workers=8,
    train_batch_size=256,
    valid_data=valid_ds,
    valid_num_workers=8,
    valid_batch_size=256,
)

In [None]:
import pickle  # also imported in the first cell

# Snapshot the whole data module to disk so it can be restored with
# `pickle.load` instead of being rebuilt from the processed data.
with open('train_dl.pkl', 'wb') as f:
    pickle.dump(train_dl, f)

In [3]:
torch.cuda.is_available()

True

In [4]:
import pickle  # also imported in the first cell

# Restore the data module snapshot written to `train_dl.pkl`.
# NOTE(review): `pickle.load` can execute arbitrary code from the file — only
# load snapshots produced by a trusted run of this notebook.
with open('train_dl.pkl', 'rb') as f:
    train_dl = pickle.load(f)

In [18]:
# `train_dl` is deliberately NOT deleted at this point: `trainer.fit(model, train_dl)`
# further down still needs it, and deleting it here makes a top-to-bottom run
# fail with NameError. Free it only after training has finished, e.g. with
# `del train_dl` followed by `gc.collect()`.

In [46]:
import gc
gc.collect()

213

In [7]:
# Output width of the embedding table for each categorical feature.
# Insertion order is the order in which the embeddings are declared.
EMBEDDING_DIMS = {
    "event_type": 8,
    "event_subtype": 8,
    "src_type11": 16,
    "src_type12": 16,
    "dst_type11": 16,
    "dst_type12": 16,
    "src_type22": 16,
    "src_type31": 16,
    "src_type32": 16,
}

# Query the dictionary sizes once instead of once per feature.
category_sizes = preprocessor.get_category_dictionary_sizes()

trx_encoder_params = dict(
    embeddings_noise=0.003,
    numeric_values={"amount": "log"},
    # "in" is the dictionary size reported by the preprocessor, "out" the embedding width.
    embeddings={
        feature: {"in": category_sizes[feature], "out": width}
        for feature, width in EMBEDDING_DIMS.items()
    },
)

In [9]:
# GRU sequence encoder with hidden size 64, stacked on a transaction encoder
# built from `trx_encoder_params`.
seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(**trx_encoder_params),
    hidden_size=64,
    type="gru",
)

In [10]:
# CoLES module around the sequence encoder. The optimizer and scheduler are
# passed as partials so the module can instantiate them itself:
# Adam with lr=1e-3, and StepLR scaling the learning rate by 0.9025 every
# 3 scheduler steps.
model = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(
        torch.optim.lr_scheduler.StepLR, step_size=3, gamma=0.9025
    ),
)

In [11]:
# Lightning trainer for the CoLES run: at most 30 epochs, gradient clipping at
# 0.5, TensorBoard logs written under ./logdir/transaction_result.
trainer = pl.Trainer(
    max_epochs=30,
    limit_val_batches=5000,
    enable_progress_bar=True,
    gradient_clip_val=0.5,
    logger=pl.loggers.TensorBoardLogger(save_dir="./logdir", name="transaction_result"),
    callbacks=[
        pl.callbacks.LearningRateMonitor(logging_interval="step"),
        # Checkpoint every 5000 training steps.
        # NOTE(review): save_top_k=-1 is expected to keep every checkpoint — confirm
        # against the installed Lightning version and watch disk usage.
        pl.callbacks.ModelCheckpoint(every_n_train_steps=5000, save_top_k=-1),
        # Two independent stopping criteria: the metric logged as
        # "valid/recall_top_k" (maximised) and the metric logged as "loss" (minimised).
        pl.callbacks.EarlyStopping(monitor="valid/recall_top_k", mode="max", patience=5),
        pl.callbacks.EarlyStopping(monitor="loss", mode="min", patience=3),
    ],
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [12]:
torch.cuda.is_available()

True

In [13]:
# Train the CoLES model; `train_dl` is the PtlsDataModule holding both the
# training and the validation data.
trainer.fit(model, train_dl)

2024-06-16 09:25:08.594150: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type            | Params | Mode 
---------------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0      | train
1 | _seq_encoder       | RnnSeqEncoder   | 92.5 K | train
2 | _validation_metric | BatchRecallTopK | 0      | train
3 | _head              | Head            | 0      | train
---------------------------------------------------------------
92.5 K    Trainable params
0         Non-trainable params
92.5 K    Total params
0.370     Total estimated model params size (MB)


Epoch 0: 100%|██████████| 835/835 [02:14<00:00,  6.21it/s, v_num=7, seq_len=94.60]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/369 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/369 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 1/369 [00:00<00:53,  6.82it/s][A
Validation DataLoader 0:   1%|          | 2/369 [00:00<00:47,  7.76it/s][A
Validation DataLoader 0:   1%|          | 3/369 [00:00<00:55,  6.58it/s][A
Validation DataLoader 0:   1%|          | 4/369 [00:00<00:55,  6.60it/s][A
Validation DataLoader 0:   1%|▏         | 5/369 [00:00<00:56,  6.42it/s][A
Validation DataLoader 0:   2%|▏         | 6/369 [00:00<00:58,  6.22it/s][A
Validation DataLoader 0:   2%|▏         | 7/369 [00:01<00:55,  6.54it/s][A
Validation DataLoader 0:   2%|▏         | 8/369 [00:01<00:59,  6.11it/s][A
Validation DataLoader 0:   2%|▏         | 9/369 [00:01<01:01,  5.84it/s][A
Validation DataLoader 0:   3%|▎         | 10/369 [00:01<01:02

In [14]:
# Persist only the weights (state dict), not the module object; loading them
# later requires rebuilding a module with the same architecture first.
torch.save(model.state_dict(), "./transactions_emb64_model.pt")

# Part 3: export client embeddings with the trained encoder

In [None]:
# Reload the cached, preprocessed train split from disk.
# NOTE(review): `processed_test` is used further down in this part but is not
# reloaded here — it must already be in memory or be loaded separately.
processed_train = pd.read_pickle("processed_train.pkl")

In [None]:
# Target tables for the train and test splits; their `mon` column supplies the
# list of months used for the per-month embedding export below.
# (An earlier variant read a pickled `processed_target.pkl` instead.)
target_train = pd.read_parquet("train_target.parquet")
target_test = pd.read_parquet("test_target_b.parquet")

In [10]:
# Load the preprocessor object; its category dictionary sizes are needed to
# rebuild the encoder with the same embedding table shapes.
# NOTE(review): `pickle.load` can execute arbitrary code from the file — only
# load artifacts that were produced by a trusted run of this project.
with open("transactions_preprocessor.pkl", "rb") as f:
    preprocessor = pickle.load(f)

In [11]:
import gc
gc.collect()

162

In [12]:
# Output width of the embedding table for each categorical feature. These must
# match the configuration the saved weights were trained with, otherwise
# `load_state_dict` will reject the checkpoint.
EMBEDDING_DIMS = {
    "event_type": 8,
    "event_subtype": 8,
    "src_type11": 16,
    "src_type12": 16,
    "dst_type11": 16,
    "dst_type12": 16,
    "src_type22": 16,
    "src_type31": 16,
    "src_type32": 16,
}

# Query the dictionary sizes once instead of once per feature.
category_sizes = preprocessor.get_category_dictionary_sizes()

trx_encoder_params = dict(
    embeddings_noise=0.003,
    numeric_values={"amount": "log"},
    # "in" is the dictionary size reported by the preprocessor, "out" the embedding width.
    embeddings={
        feature: {"in": category_sizes[feature], "out": width}
        for feature, width in EMBEDDING_DIMS.items()
    },
)

In [13]:
# Rebuild the GRU sequence encoder (hidden size 64) on top of a transaction
# encoder created from `trx_encoder_params`, ready to receive saved weights.
seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(**trx_encoder_params),
    hidden_size=64,
    type="gru",
)

In [14]:
# Rebuild the CoLES module so the saved state dict can be loaded into it.
# The optimizer/scheduler partials are passed as in training; no training
# happens in this part.
model = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(
        torch.optim.lr_scheduler.StepLR, step_size=3, gamma=0.9025
    ),
)
# map_location="cpu" lets a checkpoint that was saved from a GPU be loaded on
# any machine; the module is moved to the inference device in a later cell.
state_dict = torch.load("./transactions_emb64_model.pt", map_location="cpu")
model.load_state_dict(state_dict)
# Switch to evaluation mode for inference (last expression, so the module repr is shown).
model.eval()

CoLESModule(
  (_loss): ContrastiveLoss()
  (_seq_encoder): RnnSeqEncoder(
    (trx_encoder): TrxEncoder(
      (embeddings): ModuleDict(
        (event_type): NoisyEmbedding(
          57, 8, padding_idx=0
          (dropout): Dropout(p=0, inplace=False)
        )
        (event_subtype): NoisyEmbedding(
          59, 8, padding_idx=0
          (dropout): Dropout(p=0, inplace=False)
        )
        (src_type11): NoisyEmbedding(
          81, 16, padding_idx=0
          (dropout): Dropout(p=0, inplace=False)
        )
        (src_type12): NoisyEmbedding(
          347, 16, padding_idx=0
          (dropout): Dropout(p=0, inplace=False)
        )
        (dst_type11): NoisyEmbedding(
          83, 16, padding_idx=0
          (dropout): Dropout(p=0, inplace=False)
        )
        (dst_type12): NoisyEmbedding(
          409, 16, padding_idx=0
          (dropout): Dropout(p=0, inplace=False)
        )
        (src_type22): NoisyEmbedding(
          90, 16, padding_idx=0
          (drop

In [15]:
# Prefer the GPU, but fall back to the CPU so the inference cells also run on
# machines without CUDA. `device` is read as a global by `make_prediction`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [16]:
device

device(type='cuda')

In [17]:
from pandas.tseries.offsets import MonthBegin


class GetSplit(IterableProcessingDataset):
    """Emit one time-truncated copy of every source record per month.

    For each upstream record and each entry of ``months`` the record is
    shallow-copied and then:

    * every feature whose key starts with ``target`` is reduced to its
      ``i``-th element, where ``i`` is the position of the month in ``months``;
    * every other feature except the id is restricted to the events whose
      ``col_time`` value is strictly below that month's cutoff;
    * ``'__' + str(month)`` is appended to the id so the month can be
      recovered from the id downstream.

    Args:
        months: month markers accepted by ``pd.to_datetime``; their order must
            line up with the per-month layout of the ``target*`` features.
        col_id: key of the record identifier (must support ``+=`` with a string).
        col_time: key of the per-event time feature.
            NOTE(review): it is compared against epoch seconds — confirm the unit.
    """

    def __init__(
        self,
        months,
        col_id='client_id',
        col_time='event_time'
    ):
        super().__init__()
        self.months = months
        self._col_id = col_id
        self._col_time = col_time

    @staticmethod
    def _month_cutoff(month):
        """Return ``month`` shifted back by one ``MonthBegin`` offset, as epoch seconds.

        NOTE(review): the integer value of ``to_datetime64()`` is divided by 1e9,
        which presumes a nanosecond-resolution timestamp — confirm for the
        installed pandas version.
        """
        shifted = pd.to_datetime(month, yearfirst=True, dayfirst=False) - MonthBegin(1)
        return int(shifted.to_datetime64()) / 1e9

    def __iter__(self):
        # Materialise the months once so one-shot iterables work for every record,
        # and compute the cutoffs once per pass instead of once per (record, month).
        months = list(self.months)
        cutoffs = [self._month_cutoff(month) for month in months]

        for rec in self._src:
            # Upstream may yield either a feature dict or a tuple whose first item is one.
            source = rec[0] if isinstance(rec, tuple) else rec

            for i, (month, cutoff) in enumerate(zip(months, cutoffs)):
                # Shallow copy: values are replaced below, never modified in place.
                features = source.copy()
                mask = features[self._col_time] < cutoff

                # Only existing keys are reassigned, so iterating `items()` stays valid.
                for key, tensor in features.items():
                    if key.startswith('target'):
                        features[key] = tensor[i].tolist()
                    elif key != self._col_id:
                        features[key] = tensor[mask]

                features[self._col_id] += '__' + str(month)

                yield features
                

from datetime import datetime


def collate_feature_dict_with_target(batch, col_id='client_id', targets=False):
    """Collate a batch of feature dicts while carrying ids (and targets) alongside.

    The id, and when ``targets`` is true also ``target_1`` .. ``target_4``, are
    taken out of every sample and returned as plain Python lists; the remaining
    features are handed to ``collate_feature_dict``.

    The input samples are left untouched: stripped copies are collated instead
    of deleting keys in place. Deleting in place would corrupt datasets that
    hand out their stored dicts, making a second pass fail with ``KeyError``.

    Args:
        batch: list of dict samples, each containing ``col_id``.
        col_id: key of the sample identifier.
        targets: when true, also extract the four target values per sample.

    Returns:
        ``(padded_batch, batch_ids)`` or, with ``targets=True``,
        ``(padded_batch, batch_ids, target_cols)`` where ``target_cols`` holds
        one ``[target_1, target_2, target_3, target_4]`` list per sample.

    Raises:
        KeyError: if a sample lacks ``col_id`` (or a target key when requested).
    """
    target_keys = [f'target_{i}' for i in range(1, 5)]
    excluded = {col_id, *target_keys} if targets else {col_id}

    batch_ids = [sample[col_id] for sample in batch]
    target_cols = (
        [[sample[key] for key in target_keys] for sample in batch] if targets else []
    )

    # Collate copies without the extracted keys; the caller's dicts stay intact.
    stripped = [
        {key: value for key, value in sample.items() if key not in excluded}
        for sample in batch
    ]

    padded_batch = collate_feature_dict(stripped)
    if targets:
        return padded_batch, batch_ids, target_cols
    return padded_batch, batch_ids

def to_pandas(x):
    """Turn a dict of per-sample values into one DataFrame.

    Tensors are detached, moved to the CPU and converted to numpy first. Then:

    * lists/tuples and 1-D arrays become a single column named after the key;
    * 2-D arrays are expanded into one column per position, named
      ``f'{key}_{i:04d}'``;
    * anything else (0-D or more than two dimensions) becomes a column of ``None``.

    The single-column part comes first, followed by the expanded 2-D features
    in the order they appear in ``x``.

    Args:
        x: mapping from column name to a list, tuple, numpy array or torch tensor.

    Returns:
        A ``pd.DataFrame`` with one row per sample.
    """
    scalar_features = {}
    expanded = {}
    for key, value in x.items():
        if isinstance(value, torch.Tensor):
            # detach() makes this safe for tensors that still require grad.
            value = value.detach().cpu().numpy()
        if isinstance(value, (list, tuple)) or len(value.shape) == 1:
            scalar_features[key] = value
        elif len(value.shape) == 2:
            # Keep the converted array so 2-D numpy input works as well as tensors.
            expanded[key] = value
        else:
            scalar_features[key] = None

    dataframes = [pd.DataFrame(scalar_features)]
    for key, matrix in expanded.items():
        columns = [f'{key}_{i:04d}' for i in range(matrix.shape[1])]
        dataframes.append(pd.DataFrame(matrix, columns=columns))
    return pd.concat(dataframes, axis=1)

In [21]:
from tqdm import tqdm

def make_prediction(model, inference_dl):
    """Run ``model`` over every batch of ``inference_dl`` and collect embeddings.

    Each batch is either ``(x, batch_ids)`` or ``(x, batch_ids, target_cols)``.
    ``x`` is moved to the global ``device`` and passed through the model; the
    ids, the optional four target columns and the model output are converted to
    a DataFrame with ``to_pandas``.

    The forward passes run under ``torch.no_grad()``: this is pure inference,
    and without it every batch would build an autograd graph and hold on to
    activations for no benefit.

    Args:
        model: callable module producing a 2-D embedding tensor per batch.
            The caller is responsible for putting it into eval mode.
        inference_dl: iterable of batches as described above.

    Returns:
        One DataFrame with the rows of all batches concatenated (the per-batch
        index is kept as is).

    Raises:
        ValueError: from ``pd.concat`` when ``inference_dl`` yields no batches.
    """
    target_names = ['target_1', 'target_2', 'target_3', 'target_4']
    dfs = []
    with torch.no_grad():
        for batch in tqdm(inference_dl):
            if len(batch) == 3:
                x, batch_ids, target_cols = batch
            else:
                x, batch_ids = batch
                target_cols = None

            out = model(x.to(device))

            # Column order: id, optional targets, then the embedding.
            x_out = {'client_id': batch_ids}
            if target_cols is not None:
                target_cols = torch.tensor(target_cols)
                for position, name in enumerate(target_names):
                    x_out[name] = target_cols[:, position]
            x_out['emb'] = out

            torch.cuda.empty_cache()
            dfs.append(to_pandas(x_out))
    return pd.concat(dfs, axis='rows')


def _embed_records(processed_data, model, extra_filters=()):
    """Build an in-memory dataset from ``processed_data`` and embed it with ``model``.

    The filter chain is: sequence length limit (4096), a ``FeatureFilter`` that
    keeps the id and the four target features, any ``extra_filters``, then
    ``ToTorch``. Batches of 256 are read in order, without worker processes.

    NOTE(review): the collate function is used with its default
    ``targets=False``, so the result carries no target columns — confirm that
    this is intended.

    Args:
        processed_data: object with ``to_dict("records")`` (e.g. a DataFrame).
        model: module passed to ``make_prediction``.
        extra_filters: filters inserted just before ``ToTorch``.

    Returns:
        The DataFrame produced by ``make_prediction``.
    """
    dataset = MemoryMapDataset(
        data=processed_data.to_dict("records"),
        i_filters=[
            ISeqLenLimit(max_seq_len=4096),
            FeatureFilter(keep_feature_names=['client_id', 'target_1', 'target_2', 'target_3', 'target_4']),
            *extra_filters,
            ToTorch(),
        ]
    )
    loader = DataLoader(
        dataset=dataset,
        collate_fn=collate_feature_dict_with_target,
        shuffle=False,
        num_workers=0,
        batch_size=256,
    )
    return make_prediction(model, loader)


def _embed_per_month(processed_data, model, months):
    """Embed every record once per month and split the month off the id.

    ``GetSplit`` appends ``'__' + str(month)`` to each id, so the id is split
    at the LAST ``'__'`` to stay correct even if an id itself contains one.
    """
    emb_df = _embed_records(processed_data, model, extra_filters=[GetSplit(months=months)])
    emb_df[['client_id', 'month']] = emb_df['client_id'].str.rsplit('__', n=1, expand=True)
    return emb_df


def get_train_dataset(processed_data, model, months):
    """Return per-(client, month) embeddings for the training records.

    Args:
        processed_data: preprocessed training records.
        model: trained encoder module.
        months: month markers forwarded to ``GetSplit``.

    Returns:
        DataFrame with ``client_id``, ``month`` and the embedding columns.
    """
    return _embed_per_month(processed_data, model, months)


def get_val_dataset(processed_data, model, months):
    """Return per-(client, month) embeddings for the validation records.

    Same processing as ``get_train_dataset``; kept as a separate entry point
    for callers.
    """
    return _embed_per_month(processed_data, model, months)


def get_test_dataset(processed_data, model):
    """Return one embedding per test record, without any per-month split.

    Args:
        processed_data: preprocessed test records.
        model: trained encoder module.

    Returns:
        DataFrame with ``client_id`` and the embedding columns.
    """
    return _embed_records(processed_data, model)

In [19]:
# Work on the first 150,000 rows of the training data only; the remainder is
# not embedded in this notebook.
part1 = processed_train[:150_000]

In [None]:
# Embed the first training slice once per month found in the train targets.
# NOTE(review): `.sort_values()` is redundant — `sorted(...)` already orders
# the unique months.
train_emb_df_part1 = get_train_dataset(part1, model, sorted(target_train.mon.sort_values().unique()))

In [None]:
1

In [None]:
# Preview only the first rows; rendering the full embedding table bloats the notebook.
train_emb_df_part1.head()

In [None]:
# Export the training embeddings (ids, month and embedding columns) without the index.
train_emb_df_part1.to_csv('train1_trans_emb_v2.csv', index=False)

In [None]:
1

In [None]:
1

In [None]:
1

In [None]:
1

In [None]:
1

In [None]:
1

In [None]:
# Embed the test-split records once per month found in the test targets.
# NOTE(review): `.sort_values()` is redundant — `sorted(...)` already orders
# the unique months.
val_emb_df = get_val_dataset(processed_test, model, sorted(target_test.mon.sort_values().unique()))

100%|██████████| 9700/9700 [08:07<00:00, 19.90it/s]


In [42]:
# Export the per-month validation embeddings without the index.
val_emb_df.to_csv('val_trans_emb_v2.csv', index=False)

In [43]:
del val_emb_df

In [33]:
# Embed the test-split records as they are (one row per record, no per-month split).
test_emb_df = get_test_dataset(processed_test, model)

100%|██████████| 882/882 [01:09<00:00, 12.71it/s]


In [35]:
# Export the test embeddings without the index.
test_emb_df.to_csv('test_trans_emb_v2.csv', index=False)

In [36]:
del test_emb_df

In [45]:
del processed_test