In [2]:
import pandas as pd
import numpy as np
import torch

from utils.data import (
    get_data_period,
    read_data,
    print_info,
    print_info_targets,
    prepare_dataset,
    read_data
)
from ptls.data_load.datasets import PersistDataset
from catboost import CatBoostClassifier
from ptls.frames.inference_module import InferenceModule

import torch
import torch.nn as nn
from ptls.preprocessing import PandasDataPreprocessor
from ptls.data_load.utils import FeatureDict
from ptls.data_load.datasets import MemoryMapDataset

import torch
import torchmetrics
import pytorch_lightning as ptl

from ptls.nn import TrxEncoder, TransformerSeqEncoder, Head, RnnSeqEncoder
from tqdm.auto import tqdm 
import polars as pl

from functools import partial
from ptls.frames.supervised import SeqToTargetDataset, SequenceToTarget
from ptls.frames import PtlsDataModule

import torch.nn as nn
from sklearn.metrics import f1_score

from ptls.frames.coles import CoLESModule
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from functools import partial

from ptls.data_load.datasets import inference_data_loader

# Load train, test_data

In [None]:
train_records = prepare_dataset('full_train_zeros.csv')

In [None]:
test_records = prepare_dataset('test_answer.csv')

In [None]:
train_dataset = PersistDataset(
    data=train_records,
)

test_dataset = PersistDataset(
    data=test_records,
)

In [None]:
2 + 2

# Coles

In [None]:
train_dataset = PersistDataset(
    data=train_records,
)

seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(
        embeddings={
            'quarter': {'in': 4, 'out': 8},
            'year': {'in': 23, 'out': 31}
        },
        numeric_values={
            'npo_sum': 'log',
        },
        embeddings_noise=0.001,
    ),
    hidden_size=32,
    type='gru',
)

model = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=30, gamma=0.9),
)

train_dl = PtlsDataModule(
    train_data=ColesDataset(
        train_dataset,
        splitter=SampleSlices(
            split_count=5,
            cnt_min=25,
            cnt_max=200,
        ),
    ),
    train_num_workers=4,
    train_batch_size=256,
)

trainer = ptl.Trainer(
    max_epochs=3,
    gpus=1 if torch.cuda.is_available() else 0,
    enable_progress_bar=True,
    log_every_n_steps=3,
)

print(f'logger.version = {trainer.logger.version}')
trainer.fit(model, train_dl)
print(trainer.logged_metrics)

torch.save(seq_encoder.state_dict(), "coles-emb_final.pt")

In [None]:
def emb_inference(records, path_encoder):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    
    seq_encoder = RnnSeqEncoder(
        trx_encoder=TrxEncoder(
                embeddings={
                    'quarter': {'in': 4, 'out': 8},
                    'year': {'in': 23, 'out': 31}
                },
                numeric_values={
                    'npo_sum': 'log',
                },
                embeddings_noise=0.001,
            ),
            hidden_size=32,
            type='gru',
    )

    seq_encoder.load_state_dict(torch.load(path_encoder, map_location=device))
    model = CoLESModule(seq_encoder)
    model.eval()
    trainer = ptl.Trainer(gpus=1 if torch.cuda.is_available() else 0)
    train_dl = inference_data_loader(records, num_workers=0, batch_size=256)
    train_embeds = torch.vstack(trainer.predict(model, train_dl, ))
    
    return train_embeds

In [None]:
train_embeds = emb_inference(train_records, 'coles-emb_final.pt')

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA A100-SXM4-80GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Predicting DataLoader 0: : 5it [00:00, 69.21it/s] 



Predicting DataLoader 0: : 7011it [01:58, 59.13it/s]


In [21]:
test_embeds = emb_inference(test_records, 'coles-emb_final.pt')

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(


Predicting DataLoader 0: : 178it [00:27,  6.48it/s]


In [13]:
def id_to_emb(records, emb, test=False):
    assert len(records) == len(emb), f'len of records must be equal to len of emb: {records.shape[0]}, {emb.shape[0]} '
    print(len(records))
    
    emb_dim = emb.shape[-1]
    res_dict = {
        'npo_account_id': [],
        'year': [],
        'quarter': [],
        'target_churn': []
    }
    res_dict.update({
        f'emb_{i}': [] for i in range(emb_dim)
    })
    
    if not test:
        res_dict['target_churn'] = []
    for i in tqdm(range(len(records))):
        #print(records[i])
        #Int32
        res_dict['npo_account_id'].append(records[i]['npo_account_id'])
        res_dict['year'].append(records[i]['tyear']  + 1999)
        res_dict['quarter'].append(records[i]['target_quarter'] + 1)
        if not test:
            res_dict['target_churn'].append(records[i]['target_churn'])
        for key in res_dict.keys():
            if 'emb_' in key:
                idx = int(key.split('_')[-1])
                res_dict[key].append(emb[i][idx])
        
    return pl.DataFrame(res_dict)

In [None]:
emb_train_features = id_to_emb(train_records, train_embeds)

1794648


 77%|███████▋  | 1389074/1794648 [15:01<03:23, 1993.31it/s] 

[0;31mKernelOutOfMemory[0m: Kernel ran out of memory and has been restarted. If the restart fails, restart the kernel from the Kernel menu.
If the error persists, try choosing a different configuration or optimizing your code.

In [None]:
emb_test_features = id_to_emb(test_records, train_embeds)

# Catboost emb only

In [None]:
model = CatBoostClassifier(iterations=1000,
                            learning_rate=0.01,
                            depth=6,
                            verbose=300,
                            random_seed=42,
                            eval_metric='F1',
                            task_type="GPU",
                        )

In [None]:
model.fit(emb_train_features.drop('target_curn'), )

## SeqToTarget

In [None]:
from sklearn.model_selection import train_test_split

train_dataset = PersistDataset(
    data=train_records,
)

valid_dataset = PersistDataset(
    data=train_records,
)

test_dataset = PersistDataset(
    data=train_records,
)

seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(
            embeddings={
                'quarter': {'in': 4, 'out': 8},
                'year': {'in': 19, 'out': 31}
            },
            numeric_values={
                'npo_sum': 'log',
            },
            embeddings_noise=0.001,
        ),
        hidden_size=32,
        type='gru',
)

sup_module = SequenceToTarget(
        seq_encoder=seq_encoder,
        head=Head(input_size=seq_encoder.embedding_size, objective='classification', num_classes=2),
        loss=torch.nn.NLLLoss(),
        metric_list=torchmetrics.F1Score(num_classes=2, average='macro'),
        optimizer_partial=partial(torch.optim.Adam),
        lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=4, gamma=0.5),
)

sup_data = PtlsDataModule(
    train_data=SeqToTargetDataset(train_dataset, target_col_name='target_bin', target_dtype=torch.long),
    valid_data=SeqToTargetDataset(valid_dataset, target_col_name='target_bin', target_dtype=torch.long),
    test_data=SeqToTargetDataset(test_dataset, target_col_name='target_bin', target_dtype=torch.long),
    train_batch_size=512,
    valid_batch_size=128,
    train_num_workers=4,
)

trainer = pl.Trainer(
    max_epochs=5,
    gpus=1 if torch.cuda.is_available() else 0,
    enable_progress_bar=True,
)

print(f'logger.version = {trainer.logger.version}')
trainer.fit(sup_module, sup_data)

trainer.test(ckpt_path='best', dataloaders=sup_data.test_dataloader())

In [None]:
from ptls.data_load.utils import collate_feature_dict
from ptls.frames.inference_module import InferenceModule

In [None]:
inference_dl = torch.utils.data.DataLoader(
    dataset=dataset_test,
    collate_fn=collate_feature_dict,
    shuffle=False,
    batch_size=1000,
    num_workers=4,
)

In [None]:
inf_module = InferenceModule(
    torch.nn.Sequential(
        sup_module,
        torch.nn.Softmax(dim=1),
    )
)

In [None]:
df_predict = trainer.predict(inf_module, inference_dl)

In [None]:
df_predict = pd.concat(df_predict, axis=0)