In [2]:
import os
os.environ["OMP_NUM_THREADS"] = "16"

import pandas as pd
import numpy as np
import torch
from functools import partial
import pytorch_lightning as pl
import warnings
import pickle
warnings.filterwarnings("ignore")

from torch.utils.data import DataLoader

from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing.iterable_seq_len_limit import ISeqLenLimit
from ptls.data_load.iterable_processing.to_torch_tensor import ToTorch
from ptls.data_load.iterable_processing.feature_filter import FeatureFilter
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames.coles import CoLESModule
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesIterableDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule
from ptls.preprocessing import PandasDataPreprocessor
from ptls.data_load.utils import collate_feature_dict
from ptls.data_load.iterable_processing_dataset import IterableProcessingDataset

from tqdm.auto import tqdm
import lightgbm as ltb



# Part 1: Reduce the `dial_*` embeddings with PCA and preprocess them

In [2]:
# Load the raw train/test tables. The cells below read the `client_id`,
# `event_time` and `embedding` columns from both frames.
dial_train = pd.read_parquet("dial_train.parquet")
dial_test = pd.read_parquet("dial_test.parquet")

In [9]:
from sklearn.decomposition import PCA


# Stack the per-row embedding vectors into dense (n_rows, emb_dim) matrices.
# np.stack builds the matrix in one pass instead of expanding every row
# through a Python-level DataFrame.apply(axis=1).
train_embeddings = np.stack(dial_train['embedding'].to_numpy())
test_embeddings = np.stack(dial_test['embedding'].to_numpy())

# A fractional n_components asks PCA to keep as many components as are needed
# to explain that share of the variance. The projection is fitted on the train
# embeddings only and then applied unchanged to the test embeddings.
pca = PCA(n_components=0.865)
reduced_train = pca.fit_transform(train_embeddings)
reduced_test = pca.transform(test_embeddings)

In [13]:
# Replace the raw `embedding` column with one `emb_<i>` column per PCA component.
emb_columns = [f'emb_{i}' for i in range(reduced_train.shape[1])]

# The component frames reuse the index of the frame they were computed from, so
# the column-wise concat pairs rows correctly even when the parquet files were
# saved with a non-default index (a fresh RangeIndex would misalign them).
dial_train = pd.concat(
    [
        dial_train[['client_id', 'event_time']],
        pd.DataFrame(reduced_train, columns=emb_columns, index=dial_train.index),
    ],
    axis='columns',
)
dial_test = pd.concat(
    [
        dial_test[['client_id', 'event_time']],
        pd.DataFrame(reduced_test, columns=emb_columns, index=dial_test.index),
    ],
    axis='columns',
)

In [15]:
# Every PCA component column (`emb_0`, `emb_1`, ...) is declared as a numerical feature.
numeric_columns = [f'emb_{idx}' for idx in range(reduced_train.shape[1])]

# Preprocessor settings: `client_id` identifies a sequence, `event_time` orders
# it and is converted with the "dt_to_timestamp" transformation.
# return_records=False: the transform output is later saved with `.to_pickle`,
# i.e. it is used as a DataFrame.
preprocessor_config = {
    'col_id': "client_id",
    'col_event_time': "event_time",
    'event_time_transformation': "dt_to_timestamp",
    'cols_numerical': numeric_columns,
    'return_records': False,
}
preprocessor = PandasDataPreprocessor(**preprocessor_config)

In [16]:
# Fit on the training frame only; the same fitted object transforms the test frame below.
preprocessor = preprocessor.fit(dial_train)

In [17]:
# Persist the fitted preprocessor so it can be reloaded without refitting.
with open('dial_preprocessor.pkl', 'wb') as f:
    pickle.dump(preprocessor, f)

In [18]:
# Reload the fitted preprocessor from disk.
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open('dial_preprocessor.pkl', 'rb') as f:
    preprocessor = pickle.load(f)

In [19]:
# Apply the train-fitted preprocessor to both frames.
processed_train = preprocessor.transform(dial_train)
processed_test = preprocessor.transform(dial_test)

In [20]:
# Cache the preprocessed frames; Parts 2 and 3 start from these pickles.
processed_train.to_pickle('dial_processed_train.pkl')
processed_test.to_pickle('dial_processed_test.pkl')

# Part 2: Train the CoLES sequence encoder

In [3]:
# Load the preprocessed frames cached at the end of Part 1.
# NOTE: read_pickle unpickles the file; only read files written by this notebook.
processed_train = pd.read_pickle('dial_processed_train.pkl')
processed_test = pd.read_pickle('dial_processed_test.pkl')

In [4]:
def _make_coles_filters():
    """Return a fresh filter chain for the CoLES training/validation datasets.

    A new list of new filter objects is created on every call so the two
    datasets never share filter instances. The chain drops the id and target
    fields, applies `SeqLenFilter(min_seq_len=8)` and
    `ISeqLenLimit(max_seq_len=4096)`, and finishes with `ToTorch()`.
    """
    dropped_fields = ['client_id', 'target_1', 'target_2', 'target_3', 'target_4']
    return [
        FeatureFilter(drop_feature_names=dropped_fields),
        SeqLenFilter(min_seq_len=8),
        ISeqLenLimit(max_seq_len=4096),
        ToTorch(),
    ]


# One dict per row of the preprocessed frame is fed through the filter chain.
train = MemoryMapDataset(
    data=processed_train.to_dict("records"),
    i_filters=_make_coles_filters(),
)

test = MemoryMapDataset(
    data=processed_test.to_dict("records"),
    i_filters=_make_coles_filters(),
)

In [5]:
# Both CoLES datasets sample sub-sequences with the same SampleSlices settings
# (see the ptls `SampleSlices` docs for the exact meaning of each argument).
slice_settings = {'split_count': 5, 'cnt_min': 32, 'cnt_max': 180}

# NOTE(review): `train` / `test` are MemoryMapDataset objects; confirm that the
# iterable wrapper is intended together with multi-worker loading. With
# PyTorch, an IterableDataset is replayed by every DataLoader worker unless the
# dataset shards itself.
train_ds = ColesIterableDataset(data=train, splitter=SampleSlices(**slice_settings))

valid_ds = ColesIterableDataset(data=test, splitter=SampleSlices(**slice_settings))

In [6]:
# Loader settings shared by the training and validation splits.
loader_workers = 16
loader_batch_size = 256

# Despite its name, `train_dl` is the data module holding both the training
# and the validation dataset; it is what gets passed to `trainer.fit`.
train_dl = PtlsDataModule(
    train_data=train_ds,
    train_num_workers=loader_workers,
    train_batch_size=loader_batch_size,
    valid_data=valid_ds,
    valid_num_workers=loader_workers,
    valid_batch_size=loader_batch_size,
)

In [8]:
# Every numeric feature `emb_0` .. `emb_7` is assigned the 'log' scaler.
# NOTE(review): the count 8 is hard-coded; presumably it should match the
# number of `emb_*` columns produced by the PCA step. Confirm.
numeric_feature_scalers = dict.fromkeys((f'emb_{idx}' for idx in range(8)), 'log')

trx_encoder_params = {
    'embeddings_noise': 0.003,
    'numeric_values': numeric_feature_scalers,
}

In [9]:
# Per-event encoder built from the parameters defined in the previous cell.
trx_encoder = TrxEncoder(**trx_encoder_params)

# GRU sequence encoder with a 64-dimensional hidden state on top of the event encoder.
seq_encoder = RnnSeqEncoder(trx_encoder=trx_encoder, hidden_size=64, type='gru')

In [10]:
# Optimizer factory: Adam with learning rate 1e-3.
make_optimizer = partial(torch.optim.Adam, lr=0.001)
# Scheduler factory: StepLR multiplying the learning rate by 0.9025 every 3 scheduler steps.
make_lr_scheduler = partial(torch.optim.lr_scheduler.StepLR, step_size=3, gamma=0.9025)

# CoLES module wrapping the sequence encoder.
model = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=make_optimizer,
    lr_scheduler_partial=make_lr_scheduler,
)

In [17]:
# TensorBoard logs go to ./logdir/dial_result.
tb_logger = pl.loggers.TensorBoardLogger(save_dir='./logdir', name='dial_result')

trainer_callbacks = [
    # Log the learning rate at every step.
    pl.callbacks.LearningRateMonitor(logging_interval='step'),
    # Checkpoint every 5000 training steps and keep all of them (save_top_k=-1).
    pl.callbacks.ModelCheckpoint(every_n_train_steps=5000, save_top_k=-1),
    # Stop when the validation recall has not improved for 5 checks ...
    pl.callbacks.EarlyStopping(monitor="valid/recall_top_k", mode="max", patience=5),
    # ... or when the monitored "loss" has not decreased for 3 checks.
    pl.callbacks.EarlyStopping(monitor="loss", mode="min", patience=3),
]

# No accelerator is requested, so training runs on the default device
# (the captured log below reports that no GPU was used).
trainer = pl.Trainer(
    max_epochs=30,
    limit_val_batches=5000,
    enable_progress_bar=True,
    gradient_clip_val=0.5,
    logger=tb_logger,
    callbacks=trainer_callbacks,
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [18]:
# Train the CoLES model; `train_dl` is the PtlsDataModule holding the train and validation data.
trainer.fit(model, train_dl)


  | Name               | Type            | Params | Mode 
---------------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0      | train
1 | _seq_encoder       | RnnSeqEncoder   | 14.3 K | train
2 | _validation_metric | BatchRecallTopK | 0      | train
3 | _head              | Head            | 0      | train
---------------------------------------------------------------
14.3 K    Trainable params
0         Non-trainable params
14.3 K    Total params
0.057     Total estimated model params size (MB)


Epoch 0: 100%|██████████| 145/145 [00:59<00:00,  2.43it/s, v_num=4, seq_len=12.00]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/32 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/32 [00:00<?, ?it/s][A
Validation DataLoader 0:   3%|▎         | 1/32 [00:00<00:27,  1.11it/s][A
Validation DataLoader 0:   6%|▋         | 2/32 [00:00<00:14,  2.06it/s][A
Validation DataLoader 0:   9%|▉         | 3/32 [00:01<00:10,  2.90it/s][A
Validation DataLoader 0:  12%|█▎        | 4/32 [00:01<00:07,  3.69it/s][A
Validation DataLoader 0:  16%|█▌        | 5/32 [00:01<00:06,  4.38it/s][A
Validation DataLoader 0:  19%|█▉        | 6/32 [00:01<00:05,  4.78it/s][A
Validation DataLoader 0:  22%|██▏       | 7/32 [00:01<00:04,  5.31it/s][A
Validation DataLoader 0:  25%|██▌       | 8/32 [00:01<00:04,  5.74it/s][A
Validation DataLoader 0:  28%|██▊       | 9/32 [00:01<00:03,  6.22it/s][A
Validation DataLoader 0:  31%|███▏      | 10/32 [00:01<00:03,  6.50it/s]

In [19]:
# Save only the weights (state_dict); Part 3 rebuilds the same architecture before loading them.
torch.save(model.state_dict(), './dial_emb64_model.pt')

# Part 3: Compute client embeddings with the trained encoder

In [None]:
# Load the preprocessed frames cached at the end of Part 1.
processed_train = pd.read_pickle('dial_processed_train.pkl')
processed_test = pd.read_pickle('dial_processed_test.pkl')

In [None]:
# Train targets: `target_train.mon` supplies the list of months for GetSplit, and
# `processed_target` is merged onto the train sequences by `client_id` below.
target_train = pd.read_parquet("train_target.parquet")
processed_target = pd.read_pickle('processed_target.pkl')

In [None]:
# Test targets: `target_test.mon` supplies the months for the validation split below.
# `processed_target_test` is not referenced again in the cells shown here.
target_test = pd.read_parquet("test_target_b.parquet")
processed_target_test = pd.read_pickle('processed_target_test.pkl')

In [None]:
# Reload the preprocessor fitted in Part 1.
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open('dial_preprocessor.pkl', 'rb') as f:
    preprocessor = pickle.load(f)

In [None]:
# Rebuild exactly the architecture trained in Part 2; the saved state_dict can
# only be loaded into a module with matching parameter names and shapes.
trx_encoder_params = dict(
    embeddings_noise=0.003,
    numeric_values={f'emb_{i}': 'log' for i in range(8)},
)

seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(**trx_encoder_params),
    hidden_size=64,
    type='gru',
)

model = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=3, gamma=0.9025)
)

# map_location='cpu' lets the weights load on a CPU-only machine even if they
# were saved from a GPU run; load_state_dict then copies the values into the
# module's own parameters.
model.load_state_dict(torch.load('./dial_emb64_model.pt', map_location='cpu'))
# Evaluation mode for inference; as the last expression it also shows the module summary.
model.eval()

In [20]:
from pandas.tseries.offsets import MonthBegin


class GetSplit(IterableProcessingDataset):
    """Expand every source record into one record per month in `months`.

    For the month at position ``i`` the emitted record is a shallow copy of the
    source features in which:

    * every key starting with ``'target'`` is replaced by element ``i`` of its
      value (``value[i].tolist()``);
    * every other key except `col_id` is filtered with a boolean mask that
      keeps the events whose `col_time` is strictly below the month's cutoff;
    * the `col_id` value gets ``'__' + str(month)`` appended.

    Parameters
    ----------
    months : iterable
        Month values, each parseable by ``pd.to_datetime``. Target values are
        indexed by the position of the month in this iterable.
    col_id : str
        Key of the record identifier; its value must support ``+= str``.
    col_time : str
        Key of the event-time array that is compared against the cutoff.
        Assumed to hold epoch seconds. TODO confirm against the preprocessor.
    """

    def __init__(
        self,
        months,
        col_id='client_id',
        col_time='event_time'
    ):
        super().__init__()
        self.months = months
        self._col_id = col_id
        self._col_time = col_time

    @staticmethod
    def _month_cutoff(month):
        """Return the cutoff for `month` as a float.

        The month is parsed with ``pd.to_datetime``, one ``MonthBegin`` offset
        is subtracted, and the integer value of the resulting datetime64 is
        divided by 1e9 (epoch seconds when the timestamp has nanosecond
        resolution).
        """
        shifted = pd.to_datetime(month, yearfirst=True, dayfirst=False) - MonthBegin(1)
        return int(shifted.to_datetime64()) / 1e9

    def __iter__(self):
        # The cutoffs depend only on the months, so compute them once per pass
        # instead of re-parsing every month for every record.
        months = list(self.months)
        cutoffs = [self._month_cutoff(month) for month in months]

        for rec in self._src:
            # Records may arrive as (features, ...) tuples or as plain feature dicts.
            source = rec[0] if type(rec) is tuple else rec

            for i, (month, month_event_time) in enumerate(zip(months, cutoffs)):
                # Shallow copy: values are replaced below, never modified in place.
                features = source.copy()
                mask = features[self._col_time] < month_event_time

                for key, tensor in features.items():
                    if key.startswith('target'):
                        features[key] = tensor[i].tolist()
                    elif key != self._col_id:
                        features[key] = tensor[mask]

                features[self._col_id] += '__' + str(month)

                yield features
                

from datetime import datetime


def collate_feature_dict_with_target(batch, col_id='client_id', targets=False):
    """Collate feature dicts into a padded batch, returning ids (and targets) separately.

    The samples in `batch` are not modified: the id and target entries are
    left out of per-sample copies rather than deleted. This matters when the
    DataLoader hands out the very dict objects stored in an in-memory dataset,
    because deleting keys from them would break every later pass.

    Parameters
    ----------
    batch : list of dict
        Samples to collate; each must contain `col_id` and, when `targets` is
        true, ``target_1`` .. ``target_4``.
    col_id : str
        Key of the sample identifier.
    targets : bool
        When true, the four target values of each sample are collected and
        returned as a third element. To enable this from a DataLoader, pass
        ``functools.partial(collate_feature_dict_with_target, targets=True)``
        as ``collate_fn``.

    Returns
    -------
    tuple
        ``(padded_batch, batch_ids)`` or, with `targets`,
        ``(padded_batch, batch_ids, target_cols)`` where `target_cols` is a
        list of ``[target_1, target_2, target_3, target_4]`` lists.

    Raises
    ------
    KeyError
        If a sample lacks `col_id` or a requested target key.
    """
    target_keys = [f'target_{i}' for i in range(1, 5)]
    excluded = {col_id, *target_keys} if targets else {col_id}

    batch_ids = [sample[col_id] for sample in batch]
    target_cols = (
        [[sample[key] for key in target_keys] for sample in batch] if targets else []
    )

    # Collate copies without the id/target entries; the caller's dicts stay intact.
    feature_dicts = [
        {key: value for key, value in sample.items() if key not in excluded}
        for sample in batch
    ]
    padded_batch = collate_feature_dict(feature_dicts)

    if targets:
        return padded_batch, batch_ids, target_cols
    return padded_batch, batch_ids

def to_pandas(x):
    """Convert a dict of per-row values into a single DataFrame.

    Parameters
    ----------
    x : dict
        Maps a column name to a list, a torch tensor or an array-like exposing
        ``.shape``. Tensors are detached and moved to CPU first.

    Returns
    -------
    pandas.DataFrame
        Lists and 1-D values become one column each. 2-D values are expanded
        into one column per second-axis position, named ``<key>_<pos:04d>``,
        and placed after all one-column features. Values of any other rank
        become a column of ``None``.
    """
    scalar_features = {}
    matrix_features = {}
    with torch.no_grad():
        for k, v in x.items():
            # isinstance also covers tensor subclasses such as nn.Parameter.
            if isinstance(v, torch.Tensor):
                v = v.detach().cpu().numpy()
            if type(v) is list or len(v.shape) == 1:
                scalar_features[k] = v
            elif len(v.shape) == 2:
                # Keep the converted array, so 2-D NumPy inputs work as well as tensors.
                matrix_features[k] = v
            else:
                scalar_features[k] = None

    dataframes = [pd.DataFrame(scalar_features)]
    for col, values in matrix_features.items():
        column_names = [f'{col}_{i:04d}' for i in range(values.shape[1])]
        dataframes.append(pd.DataFrame(values, columns=column_names))
    return pd.concat(dataframes, axis=1)

In [None]:
# Attach the target columns to the train sequences (inner join on client_id;
# the targets' own `event_time` column is dropped before merging).
train_with_targets = processed_train.merge(
    processed_target.drop("event_time", axis=1),
    on="client_id",
    how="inner",
)

# Filter chain: ISeqLenLimit(max_seq_len=4096), keep the id and target fields,
# emit one record per target month (GetSplit), then apply ToTorch.
train = MemoryMapDataset(
    data=train_with_targets.to_dict("records"),
    i_filters=[
        ISeqLenLimit(max_seq_len=4096),
        FeatureFilter(keep_feature_names=['client_id', 'target_1', 'target_2', 'target_3', 'target_4']),
        GetSplit(months=sorted(target_train.mon.unique())),
        ToTorch(),
    ],
)

# NOTE(review): the collate function is used with its default targets=False, so
# each batch is (padded_batch, ids) and the merged targets are not returned
# separately. Confirm this is intended.
inference_train_dl = DataLoader(
    dataset=train,
    batch_size=256,
    shuffle=False,
    num_workers=0,
    collate_fn=collate_feature_dict_with_target,
)

In [77]:
# Validation view of the test clients: one record per month found in
# `target_test.mon`, produced by GetSplit.
val_filters = [
    ISeqLenLimit(max_seq_len=4096),
    FeatureFilter(keep_feature_names=['client_id', 'target_1', 'target_2', 'target_3', 'target_4']),
    GetSplit(months=sorted(target_test.mon.unique())),
    ToTorch(),
]
val = MemoryMapDataset(data=processed_test.to_dict("records"), i_filters=val_filters)

# Sequential, single-process loading; batches are (padded_batch, ids) pairs.
inference_val_dl = DataLoader(
    dataset=val,
    batch_size=256,
    shuffle=False,
    num_workers=0,
    collate_fn=collate_feature_dict_with_target,
)

In [31]:
# Test view: one record per client, with no per-month split.
test_filters = [
    ISeqLenLimit(max_seq_len=4096),
    FeatureFilter(keep_feature_names=['client_id', 'target_1', 'target_2', 'target_3', 'target_4']),
    ToTorch(),
]
test = MemoryMapDataset(data=processed_test.to_dict("records"), i_filters=test_filters)

# Sequential, single-process loading; batches are (padded_batch, ids) pairs.
inference_test_dl = DataLoader(
    dataset=test,
    batch_size=256,
    shuffle=False,
    num_workers=0,
    collate_fn=collate_feature_dict_with_target,
)

In [32]:
from tqdm import tqdm

In [33]:
# Put the model in evaluation mode before inference; as the last expression of
# the cell it also displays the module summary.
model.eval()

CoLESModule(
  (_loss): ContrastiveLoss()
  (_seq_encoder): RnnSeqEncoder(
    (trx_encoder): TrxEncoder(
      (embeddings): ModuleDict()
      (custom_embeddings): ModuleDict(
        (emb_0): LogScaler()
        (emb_1): LogScaler()
        (emb_2): LogScaler()
        (emb_3): LogScaler()
        (emb_4): LogScaler()
        (emb_5): LogScaler()
        (emb_6): LogScaler()
        (emb_7): LogScaler()
      )
      (custom_embedding_batch_norm): RBatchNorm(
        (bn): BatchNorm1d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (seq_encoder): RnnEncoder(
      (rnn): GRU(8, 64, batch_first=True)
      (reducer): LastStepEncoder()
    )
  )
  (_validation_metric): BatchRecallTopK()
  (_head): Head(
    (model): Sequential(
      (0): L2NormEncoder()
    )
  )
)

In [34]:
# Embed every train batch and collect one DataFrame per batch.
dfs = []
for batch in tqdm(inference_train_dl):
    # The collate function yields (padded_batch, ids) or (padded_batch, ids, targets).
    has_targets = len(batch) == 3
    if has_targets:
        x, batch_ids, target_cols = batch
    else:
        x, batch_ids = batch

    # Inference only: skip building the autograd graph for the forward pass.
    with torch.no_grad():
        out = model(x)

    x_out = {'client_id': batch_ids}
    if has_targets:
        target_cols = torch.tensor(target_cols)
        for pos in range(4):
            x_out[f'target_{pos + 1}'] = target_cols[:, pos]
    x_out['emb'] = out
    torch.cuda.empty_cache()

    dfs.append(to_pandas(x_out))


  0%|          | 0/12502 [00:00<?, ?it/s][A
  0%|          | 6/12502 [00:00<03:52, 53.84it/s][A
  0%|          | 12/12502 [00:00<03:49, 54.44it/s][A
  0%|          | 18/12502 [00:00<04:24, 47.28it/s][A
  0%|          | 23/12502 [00:00<04:19, 48.15it/s][A
  0%|          | 29/12502 [00:00<04:02, 51.34it/s][A
  0%|          | 35/12502 [00:00<03:59, 52.09it/s][A
  0%|          | 41/12502 [00:00<03:54, 53.15it/s][A
  0%|          | 47/12502 [00:00<03:56, 52.56it/s][A
  0%|          | 53/12502 [00:01<03:56, 52.58it/s][A
  0%|          | 59/12502 [00:01<03:51, 53.84it/s][A
  1%|          | 65/12502 [00:01<03:48, 54.38it/s][A
  1%|          | 71/12502 [00:01<03:52, 53.57it/s][A
  1%|          | 77/12502 [00:01<03:50, 53.99it/s][A
  1%|          | 83/12502 [00:01<03:50, 53.90it/s][A
  1%|          | 89/12502 [00:01<03:43, 55.54it/s][A
  1%|          | 95/12502 [00:01<03:41, 55.91it/s][A
  1%|          | 101/12502 [00:01<03:43, 55.38it/s][A
  1%|          | 107/12502 [00:01<03

In [35]:
# Stack the per-batch frames. ignore_index gives the result unique row labels
# instead of repeating each batch's 0..N-1 labels.
train_emb_df = pd.concat(dfs, axis='rows', ignore_index=True)

In [36]:
# Undo the '<client_id>__<month>' suffix added by GetSplit. The month is
# appended last, so split from the right: an id that itself contains '__'
# stays intact.
train_emb_df[['client_id', 'month']] = train_emb_df['client_id'].str.rsplit('__', n=1, expand=True)

In [37]:
# Save the train embeddings; the row index is not written.
train_emb_df.to_csv('train_dial_emb_v2.csv', index=False)

In [38]:
# Embed every test batch and collect one DataFrame per batch.
dfs = []
# Evaluation mode only needs to be set once, not on every batch.
model.eval()
for batch in tqdm(inference_test_dl):
    # The collate function yields (padded_batch, ids) or (padded_batch, ids, targets).
    has_targets = len(batch) == 3
    if has_targets:
        x, batch_ids, target_cols = batch
    else:
        x, batch_ids = batch

    # Inference only: skip building the autograd graph for the forward pass.
    with torch.no_grad():
        out = model(x)

    x_out = {'client_id': batch_ids}
    if has_targets:
        target_cols = torch.tensor(target_cols)
        for pos in range(4):
            x_out[f'target_{pos + 1}'] = target_cols[:, pos]
    x_out['emb'] = out
    torch.cuda.empty_cache()

    dfs.append(to_pandas(x_out))


  0%|          | 0/319 [00:00<?, ?it/s][A
  1%|▏         | 4/319 [00:00<00:08, 37.27it/s][A
  3%|▎         | 9/319 [00:00<00:07, 42.23it/s][A
  4%|▍         | 14/319 [00:00<00:06, 43.92it/s][A
  6%|▌         | 19/319 [00:00<00:06, 44.00it/s][A
  8%|▊         | 24/319 [00:00<00:06, 44.90it/s][A
  9%|▉         | 29/319 [00:00<00:06, 44.86it/s][A
 11%|█         | 34/319 [00:00<00:06, 45.16it/s][A
 12%|█▏        | 39/319 [00:00<00:06, 41.80it/s][A
 14%|█▍        | 44/319 [00:01<00:06, 41.03it/s][A
 15%|█▌        | 49/319 [00:01<00:06, 40.73it/s][A
 17%|█▋        | 54/319 [00:01<00:06, 40.89it/s][A
 18%|█▊        | 59/319 [00:01<00:06, 42.44it/s][A
 20%|██        | 64/319 [00:01<00:07, 36.18it/s][A
 22%|██▏       | 69/319 [00:01<00:06, 37.99it/s][A
 23%|██▎       | 74/319 [00:01<00:06, 38.87it/s][A
 25%|██▍       | 79/319 [00:01<00:05, 40.14it/s][A
 26%|██▋       | 84/319 [00:02<00:06, 38.84it/s][A
 28%|██▊       | 89/319 [00:02<00:05, 40.20it/s][A
 29%|██▉       | 94/31

In [39]:
# Stack the per-batch frames. ignore_index gives the result unique row labels
# instead of repeating each batch's 0..N-1 labels.
test_emb_df = pd.concat(dfs, axis='rows', ignore_index=True)

In [41]:
# Save the test embeddings; the row index is not written.
test_emb_df.to_csv('test_dial_emb_v2.csv', index=False)

In [78]:
# Embed every validation batch and collect one DataFrame per batch.
dfs = []
for batch in tqdm(inference_val_dl):
    # The collate function yields (padded_batch, ids) or (padded_batch, ids, targets).
    has_targets = len(batch) == 3
    if has_targets:
        x, batch_ids, target_cols = batch
    else:
        x, batch_ids = batch

    # Inference only: skip building the autograd graph for the forward pass.
    with torch.no_grad():
        out = model(x)

    x_out = {'client_id': batch_ids}
    if has_targets:
        target_cols = torch.tensor(target_cols)
        for pos in range(4):
            x_out[f'target_{pos + 1}'] = target_cols[:, pos]
    x_out['emb'] = out
    torch.cuda.empty_cache()

    dfs.append(to_pandas(x_out))


  0%|          | 0/3817 [00:00<?, ?it/s][A
  0%|          | 6/3817 [00:00<01:12, 52.41it/s][A
  0%|          | 12/3817 [00:00<01:13, 51.43it/s][A
  0%|          | 18/3817 [00:00<01:12, 52.75it/s][A
  1%|          | 25/3817 [00:00<01:07, 56.40it/s][A
  1%|          | 32/3817 [00:00<01:04, 58.46it/s][A
  1%|          | 38/3817 [00:00<01:05, 57.74it/s][A
  1%|          | 44/3817 [00:00<01:06, 56.38it/s][A
  1%|▏         | 50/3817 [00:00<01:05, 57.30it/s][A
  1%|▏         | 56/3817 [00:01<01:08, 54.85it/s][A
  2%|▏         | 62/3817 [00:01<01:08, 54.61it/s][A
  2%|▏         | 68/3817 [00:01<01:09, 54.14it/s][A
  2%|▏         | 74/3817 [00:01<01:08, 54.71it/s][A
  2%|▏         | 80/3817 [00:01<01:10, 53.08it/s][A
  2%|▏         | 86/3817 [00:01<01:08, 54.83it/s][A
  2%|▏         | 92/3817 [00:01<01:06, 56.17it/s][A
  3%|▎         | 99/3817 [00:01<01:03, 58.55it/s][A
  3%|▎         | 105/3817 [00:01<01:04, 57.81it/s][A
  3%|▎         | 112/3817 [00:01<01:02, 59.05it/s][A


In [86]:
# Stack the per-batch frames. ignore_index gives the result unique row labels
# instead of repeating each batch's 0..N-1 labels.
val_emb_df = pd.concat(dfs, axis='rows', ignore_index=True)

In [87]:
# Undo the '<client_id>__<month>' suffix added by GetSplit. The month is
# appended last, so split from the right: an id that itself contains '__'
# stays intact.
val_emb_df[['client_id', 'month']] = val_emb_df['client_id'].str.rsplit('__', n=1, expand=True)

In [88]:
# Save the validation embeddings; the row index is not written.
val_emb_df.to_csv('val_dial_emb_v2.csv', index=False)