In [1]:
#!pip3 install pytorch-lifestream

In [1]:
import os
os.environ["OMP_NUM_THREADS"] = "4"

import pandas as pd
import numpy as np
import torch
from functools import partial
import pytorch_lightning as pl
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import StratifiedKFold, train_test_split, GroupKFold
from sklearn.metrics import roc_auc_score

from torch.utils.data import DataLoader

from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing.iterable_seq_len_limit import ISeqLenLimit
from ptls.data_load.iterable_processing.to_torch_tensor import ToTorch
from ptls.data_load.iterable_processing.feature_filter import FeatureFilter
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames.coles import CoLESModule
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesIterableDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule
from ptls.preprocessing import PandasDataPreprocessor
from ptls.data_load.utils import collate_feature_dict
from ptls.data_load.iterable_processing_dataset import IterableProcessingDataset

from catboost import CatBoostClassifier

from tqdm.auto import tqdm
import lightgbm as ltb

import gc

# Data preprocessing

In [2]:
transactions_train = pd.read_parquet("./Hackathon/trx_train.parquet")
#transactions_test = pd.read_parquet("./Hackathon/trx_test.parquet")

In [3]:
transactions_train.client_id.unique()

array(['51f88b707d9c8766cdbdfe799ba0fafd8f5b8034b08c46f0e2d0e2c2139b4b94',
       '522256f58adeaf13fbf098859fe5627f663ab586cb71e73f31ca2516062ef219',
       '522ff0793af20dff019519fddea3117f8858c9823ba36402914c3105d2370e6a',
       ...,
       '30f46a0b5caca4e9816dc93af79cd5662c416d81aa90069ebcb6b78d2d53accf',
       '30f6deab32f4b4710dd822551b0e6a3416fe2f807515321fd21be5a1842d46fe',
       '31084e097aab4237336f988ed5e667318409b4dc7afc04159caeb27a0433b455'],
      dtype=object)

In [4]:
thclients = transactions_train.client_id.unique()[:400000]

In [7]:
transactions_train = transactions_train[transactions_train['client_id'].isin(thclients)]

In [14]:
%%time

preprocessor = PandasDataPreprocessor(
    col_id="client_id",
    col_event_time="event_time",
    event_time_transformation="dt_to_timestamp",
    cols_category=["event_type",
                   "event_subtype",
                   "currency",
                   "src_type11",
                   "src_type12",
                   "dst_type11",
                   "dst_type12",
                   "src_type21",
                   "src_type22",
                   "src_type31",
                   "src_type32"],
    cols_identity="amount",
    return_records=False,
)

processed_train = preprocessor.fit_transform(transactions_train)


CPU times: user 9min 13s, sys: 1min 6s, total: 10min 19s
Wall time: 10min 24s


In [10]:
target_train = pd.read_parquet("./Hackathon/train_target.parquet")

In [11]:
target_train = target_train[target_train['client_id'].isin(thclients)]

In [12]:
target_preprocessor = PandasDataPreprocessor(
    col_id="client_id",
    col_event_time="mon",
    event_time_transformation="dt_to_timestamp",
    cols_identity=["target_1", "target_2", "target_3", "target_4"],
    return_records=False,
)

processed_target = target_preprocessor.fit_transform(target_train)

In [9]:
test_target_b = pd.read_parquet("./Hackathon/test_target_b.parquet")

In [11]:
del transactions_train, transactions_test, target_train
gc.collect()

0

**Обработка датасета:**

- Транзакции, у которых размер < min_seq_len выкидываются
- Транзакции, у которых длина > max_seq_len, обрезаются и конвертируются в torch.tensor
- Не нужные для CoLES фичи удаляются

In [15]:
train = MemoryMapDataset(
    data=processed_train.to_dict("records"),
    i_filters=[
        FeatureFilter(drop_feature_names=['client_id', 'target_1', 'target_2', 'target_3', 'target_4']),
        SeqLenFilter(min_seq_len=32),
        ISeqLenLimit(max_seq_len=4096),
        ToTorch()
    ]
)

test = MemoryMapDataset(
    data=processed_test.to_dict("records"),
    i_filters=[
        FeatureFilter(drop_feature_names=['client_id', 'target_1', 'target_2', 'target_3', 'target_4']),
        SeqLenFilter(min_seq_len=32),
        ISeqLenLimit(max_seq_len=4096),
        ToTorch()
    ]
)

In [16]:
train_ds = ColesIterableDataset(
    data=train,
    splitter=SampleSlices(
        split_count=5,
        cnt_min=32,
        cnt_max=180
    )
)

valid_ds = ColesIterableDataset(
    data=test,
    splitter=SampleSlices(
        split_count=5,
        cnt_min=32,
        cnt_max=180
    )
)

In [17]:
train_dl = PtlsDataModule(
    train_data=train_ds,
    train_num_workers=8,
    train_batch_size=256,
    valid_data=valid_ds,
    valid_num_workers=8,
    valid_batch_size=256
)

# Model

- numeric_values обрабатываются как BatchNorm+Linear
- embedidngs - nn.Embedidngs

In [18]:
trx_encoder_params = dict(
    embeddings_noise=0.003,
    numeric_values={'amount': 'log'},
    embeddings={
        "event_type": {'in': preprocessor.get_category_dictionary_sizes()["event_type"], "out": 24},
        "event_subtype": {'in': preprocessor.get_category_dictionary_sizes()["event_subtype"], "out": 24},
        'src_type11': {'in': preprocessor.get_category_dictionary_sizes()["src_type11"], 'out': 24},
        'src_type12': {'in': preprocessor.get_category_dictionary_sizes()["src_type12"], 'out': 24},
        'dst_type11': {'in': preprocessor.get_category_dictionary_sizes()["dst_type11"], 'out': 24},
        'dst_type12': {'in': preprocessor.get_category_dictionary_sizes()["dst_type12"], 'out': 24},
        'src_type22': {'in': preprocessor.get_category_dictionary_sizes()["src_type22"], 'out': 24},
        'src_type31': {'in': preprocessor.get_category_dictionary_sizes()["src_type31"], 'out': 24},
        'src_type32': {'in': preprocessor.get_category_dictionary_sizes()["src_type32"], 'out': 24},
      }
)

- **TrxEncoder** - обрабатывает каждую тразнакцию (строит для неё эмбеддиг)
- **SeqEncoder** - обрабатывает последовательность

In [19]:
seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(**trx_encoder_params),
    hidden_size=256,
    type='gru',
)

In [20]:
model = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=3, gamma=0.9025)
)

# Train

In [21]:
trainer = pl.Trainer(
    max_epochs=3,
    limit_val_batches=5000,
    #gpus=[0],
    enable_progress_bar=True,
    gradient_clip_val=0.5,
    logger=pl.loggers.TensorBoardLogger(
        save_dir='./logdir',
        name='baseline_result'
    ),
    callbacks=[
        pl.callbacks.LearningRateMonitor(logging_interval='step'),
        pl.callbacks.ModelCheckpoint(every_n_train_steps=5000, save_top_k=-1),
    ]
)

Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [22]:
trainer.fit(model, train_dl)

You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 395 K 
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | Head            | 0     
-------------------------------------------------------
395 K     Trainable params
0         Non-trainable params
395 K     Total params
1.583     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.


In [None]:
torch.save(model.state_dict(), './model400K.pt')

In [15]:
model.load_state_dict(torch.load('model400K.pt'))

<All keys matched successfully>

# Inference

Для каждого пользователя известно 12 таргетов, инференс происходит следующим образом:

Чтобы не происходило лика нужно для каждого клиента делать срез до текущего месяца:

Берутся все тразнакции за первый месяц, им соответствует 1-ый таргет из 12,
потом берутся транзакции за первый и второй месяц пользователя и им соотвествует 2-ой таргет и так далее.
То есть для данного пользователя, имеющего транзакции за год, мы можем получить 12 эмбеддингов, каждому из которых соответствует 1 таргет

In [48]:
class GetSplit(IterableProcessingDataset):
    def __init__(
        self,
        start_month,
        end_month,
        year=2022,
        col_id='client_id',
        col_time='event_time'
    ):
        super().__init__()
        self.start_month = start_month
        self.end_month = end_month
        self._year = year
        self._col_id = col_id
        self._col_time = col_time

    def __iter__(self):
        for rec in self._src:
            for month in range(self.start_month, self.end_month+1):
                features = rec[0] if type(rec) is tuple else rec
                features = features.copy()

                if month == 12:
                    month_event_time = datetime(self._year + 1, 1, 1).timestamp()
                else:
                    month_event_time = datetime(self._year, month + 1, 1).timestamp()

                year_event_time = datetime(self._year, 1, 1).timestamp()

                mask = features[self._col_time] < month_event_time

                for key, tensor in features.items():
                    if key.startswith('target'):
                        features[key] = tensor[month - 1].tolist()
                    elif key != self._col_id:
                        features[key] = tensor[mask]

                features[self._col_id] += '_month=' + str(month)

                yield features

def collate_feature_dict_with_target(batch, col_id='client_id', col_time='event_time',  targets=False):
    batch_ids = []
    events=[]
    target_cols = []
    for sample in batch:
        batch_ids.append(sample[col_id])
        events.append(sample[col_time].cpu().numpy())
        del sample[col_id]
        del sample[col_time]

        if targets:
            target_cols.append([sample[f'target_{i}'] for i in range(1, 5)])
            del sample['target_1']
            del sample['target_2']
            del sample['target_3']
            del sample['target_4']
    #print(type(batch_ids[0]), type(events[0]))
    padded_batch = collate_feature_dict(batch)
    if targets:
        return padded_batch, batch_ids, events, target_cols
    return padded_batch, batch_ids


class InferenceModuleMultimodal(pl.LightningModule):
    def __init__(self, model, pandas_output=True, drop_seq_features=True, model_out_name='out'):
        super().__init__()

        self.model = model
        self.pandas_output = pandas_output
        self.drop_seq_features = drop_seq_features
        self.model_out_name = model_out_name

    def forward(self, x):
        x_len = len(x)
        if x_len == 4:
            x, batch_ids, events, target_cols = x
        else:
            x, batch_ids = x

        out = self.model(x)
        if x_len == 4:
            target_cols = torch.tensor(target_cols)
            #events = torch.tensor(events)
            x_out = {
                'client_id': batch_ids,
                #'event_time': events,
                'target_1': target_cols[:, 0],
                'target_2': target_cols[:, 1],
                'target_3': target_cols[:, 2],
                'target_4': target_cols[:, 3],
                self.model_out_name: out
            }
        else:
            x_out = {
                'client_id': batch_ids,
                self.model_out_name: out
            }
        torch.cuda.empty_cache()

        if self.pandas_output:
            return self.to_pandas(x_out)
        return x_out

    @staticmethod
    def to_pandas(x):
        expand_cols = []
        scalar_features = {}

        for k, v in x.items():
            if type(v) is torch.Tensor:
                v = v.cpu().numpy()

            if type(v) is list or len(v.shape) == 1:
                scalar_features[k] = v
            elif len(v.shape) == 2:
                expand_cols.append(k)
            else:
                scalar_features[k] = None

        dataframes = [pd.DataFrame(scalar_features)]
        for col in expand_cols:
            v = x[col].cpu().numpy()
            dataframes.append(pd.DataFrame(v, columns=[f'{col}_{i:04d}' for i in range(v.shape[1])]))

        return pd.concat(dataframes, axis=1)

In [49]:
%%time
from datetime import datetime
train = MemoryMapDataset(
    data=processed_train.merge(processed_target.drop("event_time", axis=1), on="client_id", how="inner").to_dict("records"),
    i_filters=[
        ISeqLenLimit(max_seq_len=4096),
        FeatureFilter(keep_feature_names=['client_id', 'target_1', 'target_2', 'target_3', 'target_4']),
        GetSplit(start_month=1, end_month=12),
        ToTorch(),
    ]
)

test = MemoryMapDataset(
    data=processed_test.to_dict("records"),
    i_filters=[
        ISeqLenLimit(max_seq_len=4096),
        FeatureFilter(keep_feature_names=['client_id', 'target_1', 'target_2', 'target_3', 'target_4']),
        ToTorch(),
    ]
)

CPU times: user 15.4 s, sys: 440 ms, total: 15.9 s
Wall time: 16 s


In [50]:
inference_train_dl = DataLoader(
        dataset=train,
        collate_fn=partial(collate_feature_dict_with_target, targets=True),
        shuffle=False,
        num_workers=0,
        batch_size=256,
    )

inference_test_dl = DataLoader(
        dataset=test,
        collate_fn=collate_feature_dict_with_target,
        shuffle=False,
        num_workers=0,
        batch_size=256,
    )

In [51]:
inf_module = InferenceModuleMultimodal(
        model=model,
        pandas_output=True,
        drop_seq_features=True,
        model_out_name='emb',
    )

In [52]:
trainer = pl.Trainer( max_epochs=-1)

Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [31]:
inf_test_embeddings = pd.concat(
        trainer.predict(inf_module, inference_test_dl)
    )
inf_test_embeddings.to_parquet("test400.parquet", index=False, engine="pyarrow", compression="snappy")

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting: |          | 0/? [00:00<?, ?it/s]

In [32]:
#del inf_test_embeddings

In [None]:
inf_train_embeddings = pd.concat(
        trainer.predict(inf_module, inference_train_dl)
    )

inf_train_embeddings.to_parquet("train400d.parquet", index=False, engine="pyarrow", compression="snappy")

In [None]:
#del inf_train_embeddings

In [None]:
del train, test
gc.collect()

In [None]:
del processed_train, processed_target

Файл **sample_submission** составляется из **client_id** файла **test_target_b**. Так как не у всех пользователей может быть транзакционная история, мы для простоты заполняем их фичи нулями.

In [None]:
not_only_trx = pd.DataFrame({"client_id": test_target_b["client_id"].unique()}).merge(inf_test_embeddings, how="left").fillna(0)
not_only_trx

In [None]:
not_only_trx.to_parquet("not_only_trx400.parquet", index=False, engine="pyarrow", compression="snappy")

# Downstream

Использование эмбеддингов для даунстрим задачи. Для всех таргетов одни и те же параметры бустинга для простоты

In [56]:
inf_train_embeddings

Unnamed: 0,client_id,target_1,target_2,target_3,target_4,emb_0000,emb_0001,emb_0002,emb_0003,emb_0004,...,emb_0246,emb_0247,emb_0248,emb_0249,emb_0250,emb_0251,emb_0252,emb_0253,emb_0254,emb_0255
0,000006265d27d1166ed67506682be7380007a5bead4362...,0,0,0,0,-1.0,-0.019510,1.000000,0.004497,-0.999974,...,1.000000,-0.007282,1.0,-1.0,-0.105509,0.010038,-0.999584,1.0,-0.022442,0.999969
1,000006265d27d1166ed67506682be7380007a5bead4362...,0,0,0,0,-1.0,-0.019510,1.000000,0.004497,-0.981368,...,1.000000,-0.007282,1.0,-1.0,-0.105509,0.010037,-0.999574,1.0,-0.022442,0.999970
2,000006265d27d1166ed67506682be7380007a5bead4362...,0,0,0,0,-1.0,-0.019510,1.000000,0.004497,-0.997633,...,1.000000,-0.007282,1.0,-1.0,-0.105509,0.010037,-0.999569,1.0,-0.022442,0.999969
3,000006265d27d1166ed67506682be7380007a5bead4362...,0,0,0,0,-1.0,-0.019510,1.000000,0.004497,-0.999423,...,1.000000,-0.007282,1.0,-1.0,-0.105509,0.010037,-0.999578,1.0,-0.022442,0.999970
4,000006265d27d1166ed67506682be7380007a5bead4362...,0,0,0,0,-1.0,-0.019510,1.000000,0.004497,-0.967191,...,1.000000,-0.007282,1.0,-1.0,-0.105509,0.010037,-0.999587,1.0,-0.022442,0.999970
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3621403,ffffda41a92ae10c8ae3920828129ef09c1517ba7c74cb...,0,0,0,0,-1.0,-0.019513,0.247896,0.004500,-1.000000,...,0.999591,-0.007281,1.0,-1.0,-0.105563,0.010041,0.914832,1.0,-0.022445,-1.000000
3621404,ffffda41a92ae10c8ae3920828129ef09c1517ba7c74cb...,0,0,0,0,-1.0,-0.019513,0.248020,0.004500,-1.000000,...,0.999703,-0.007281,1.0,-1.0,-0.105563,0.010041,0.914958,1.0,-0.022445,-1.000000
3621405,ffffda41a92ae10c8ae3920828129ef09c1517ba7c74cb...,0,0,0,0,-1.0,-0.019513,0.248112,0.004500,-1.000000,...,0.999780,-0.007281,1.0,-1.0,-0.105563,0.010041,0.913898,1.0,-0.022445,-1.000000
3621406,ffffda41a92ae10c8ae3920828129ef09c1517ba7c74cb...,0,0,0,0,-1.0,-0.019513,0.248176,0.004500,-1.000000,...,0.999828,-0.007281,1.0,-1.0,-0.105563,0.010041,0.914082,1.0,-0.022445,-1.000000


In [55]:
inf_train_embeddings = pd.read_parquet("train400.parquet")
test_embeddings_curr = pd.read_parquet("not_only_trx400.parquet").drop_duplicates('client_id')

In [5]:
from sklearn.model_selection import train_test_split

class Downstream:
    def __init__(
        self,
        train_path,
        test_path,
        #params,
        result_path,
        col_id='client_id',
        targets=(
            'target_1',
            'target_2',
            'target_3',
            'target_4'
        )
    ):
        self.train_path = train_path
        self.test_path = test_path

        self.col_id = col_id
        self.all_targets = targets
        #self.params = params
        self.result_path = result_path
        self.drop_feat = list(self.all_targets) + [self.col_id]

    def fit(self):
        

        

        #train_embeddings = pd.read_parquet(self.train_path)
        X = inf_train_embeddings.drop(columns=self.drop_feat)

        clfs = dict()
        for col_target in self.all_targets:
            y = inf_train_embeddings[col_target]
            X_train, X_val, y_train, y_val = train_test_split(X, y, shuffle=False,# stratify=y, 
                                                       train_size=0.8, 
                                                       random_state=33)
            clf = CatBoostClassifier(random_seed=42,
                                             #iterations=15000,
                                             #learning_rate=0.0081,
                                             #l2_leaf_reg=5,
                                             #depth=7,
                                             #max_bin=254,
                                             auto_class_weights='Balanced',
                                             task_type="GPU",
                                             thread_count=-1,
                                             use_best_model=True,
                                             #leaf_estimation_method='Newton', 
                                             #cat_features=cat_cols,
                                             #loss_function="CrossEntropy",
                                             eval_metric='AUC',
                                             #custom_loss=['Recall', 'F1', 'AUC']
                                    )
            
            clf.fit(X_train, y_train, eval_set=(X_val, y_val),
                    verbose=100, plot=False, early_stopping_rounds=200)
            print(f'Model fitted, target: {col_target}')
            clfs[col_target] = clf

        return clfs

    def get_scores(
        self,
        clfs
    ):
        scores = pd.DataFrame([])

        #test_embeddings_curr = pd.read_parquet(self.test_path).drop_duplicates('client_id')
        X_test = test_embeddings_curr.drop(columns=[self.col_id])
        ids = test_embeddings_curr[self.col_id]
        scores[self.col_id] = ids

        for col_target in self.all_targets:
            clf = clfs[col_target]
            score = clf.predict_proba(X_test)[:, 1]
            scores[col_target] = score

        return scores

    def run(self):
        clfs = self.fit()
        scores = self.get_scores(clfs)

        scores.to_csv(self.result_path)

        return scores


In [6]:
dw = Downstream(
    train_path="train.parquet",
    test_path="not_only_trx.parquet",
    #params=params,
    result_path='sample_mission.csv'
)

scores = dw.run()
scores

Learning rate set to 0.036162


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6382143	best: 0.6382143 (0)	total: 166ms	remaining: 2m 45s
100:	test: 0.6737270	best: 0.6737270 (100)	total: 9.7s	remaining: 1m 26s
200:	test: 0.6795122	best: 0.6795122 (200)	total: 18.7s	remaining: 1m 14s
300:	test: 0.6826550	best: 0.6826933 (296)	total: 27.6s	remaining: 1m 4s
400:	test: 0.6843536	best: 0.6843536 (400)	total: 36.4s	remaining: 54.4s
500:	test: 0.6855631	best: 0.6855740 (499)	total: 45.2s	remaining: 45s
600:	test: 0.6865346	best: 0.6865346 (600)	total: 54s	remaining: 35.8s
700:	test: 0.6873653	best: 0.6874046 (696)	total: 1m 2s	remaining: 26.8s
800:	test: 0.6880672	best: 0.6880672 (800)	total: 1m 11s	remaining: 17.8s
900:	test: 0.6885712	best: 0.6885712 (900)	total: 1m 20s	remaining: 8.86s
999:	test: 0.6886215	best: 0.6887863 (949)	total: 1m 29s	remaining: 0us
bestTest = 0.6887863278
bestIteration = 949
Shrink model to first 950 iterations.
Model fitted, target: target_1
Learning rate set to 0.036162


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7158065	best: 0.7158065 (0)	total: 100ms	remaining: 1m 40s
100:	test: 0.7767514	best: 0.7767514 (100)	total: 9.66s	remaining: 1m 25s
200:	test: 0.7817056	best: 0.7825336 (189)	total: 18.9s	remaining: 1m 15s
300:	test: 0.7850599	best: 0.7852061 (299)	total: 28s	remaining: 1m 4s
400:	test: 0.7880141	best: 0.7881591 (395)	total: 37.2s	remaining: 55.5s
500:	test: 0.7866227	best: 0.7885571 (468)	total: 46.1s	remaining: 46s
600:	test: 0.7872328	best: 0.7885571 (468)	total: 55.3s	remaining: 36.7s
bestTest = 0.7885570824
bestIteration = 468
Shrink model to first 469 iterations.
Model fitted, target: target_2
Learning rate set to 0.036162


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6514042	best: 0.6514042 (0)	total: 105ms	remaining: 1m 44s
100:	test: 0.6889180	best: 0.6889180 (100)	total: 9.55s	remaining: 1m 24s
200:	test: 0.6974066	best: 0.6974066 (200)	total: 18.7s	remaining: 1m 14s
300:	test: 0.7022016	best: 0.7022016 (300)	total: 27.8s	remaining: 1m 4s
400:	test: 0.7054254	best: 0.7054254 (400)	total: 36.8s	remaining: 54.9s
500:	test: 0.7075092	best: 0.7075417 (498)	total: 45.9s	remaining: 45.7s
600:	test: 0.7096848	best: 0.7096848 (600)	total: 54.9s	remaining: 36.5s
700:	test: 0.7110445	best: 0.7110518 (697)	total: 1m 3s	remaining: 27.3s
800:	test: 0.7122202	best: 0.7122202 (800)	total: 1m 12s	remaining: 18.1s
900:	test: 0.7133135	best: 0.7133194 (899)	total: 1m 21s	remaining: 9.01s
999:	test: 0.7142810	best: 0.7142835 (997)	total: 1m 30s	remaining: 0us
bestTest = 0.7142834961
bestIteration = 997
Shrink model to first 998 iterations.
Model fitted, target: target_3
Learning rate set to 0.036162


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6820045	best: 0.6820045 (0)	total: 114ms	remaining: 1m 54s
100:	test: 0.7563072	best: 0.7563072 (100)	total: 10.1s	remaining: 1m 29s
200:	test: 0.7724323	best: 0.7724323 (200)	total: 19.4s	remaining: 1m 17s
300:	test: 0.7805956	best: 0.7805956 (300)	total: 28.8s	remaining: 1m 6s
400:	test: 0.7856182	best: 0.7856182 (400)	total: 38s	remaining: 56.7s
500:	test: 0.7889003	best: 0.7889003 (500)	total: 47.1s	remaining: 46.9s
600:	test: 0.7913159	best: 0.7913222 (597)	total: 56.1s	remaining: 37.2s
700:	test: 0.7926520	best: 0.7926520 (700)	total: 1m 5s	remaining: 27.8s
800:	test: 0.7937609	best: 0.7937609 (800)	total: 1m 14s	remaining: 18.4s
900:	test: 0.7947455	best: 0.7947455 (900)	total: 1m 23s	remaining: 9.16s
999:	test: 0.7953898	best: 0.7954083 (998)	total: 1m 32s	remaining: 0us
bestTest = 0.7954083085
bestIteration = 998
Shrink model to first 999 iterations.
Model fitted, target: target_4


Unnamed: 0,client_id,target_1,target_2,target_3,target_4
0,2b7ff0c1c99cefe259ed83c5dfa0a403f2cbc88032b671...,0.047409,0.035093,0.869530,0.099730
1,0433d23e224b7a520656da6181efadb8d556bb293158c9...,0.628586,0.136453,0.658431,0.327251
2,f2ce8b292e5f9f778f3e20db7608ac76dc8812113a2631...,0.000402,0.000002,0.000184,0.000012
3,4f807e8b163c653bcaeff9f925983568f4c3e6b1a1f231...,0.539618,0.618211,0.679466,0.820355
4,64369f6f8ae1b719332ee1bfb2b454e642b2053d2c9b8a...,0.482004,0.187162,0.366723,0.208676
...,...,...,...,...,...
140483,d49a66825bb16ceb5b6a01126e1f2391b085dba8da44ee...,0.482929,0.012985,0.384750,0.029905
140484,f772af6720c0b591d49b97946c5e420c1c077affc0f7c7...,0.489651,0.107191,0.369920,0.662027
140485,06b282335bc4853f888e1ab50a6ba23a8e420d42313959...,0.424455,0.483013,0.614976,0.251631
140486,90d423a25d7cdaf674f7d78bc37d88830443ff17717e02...,0.607237,0.346644,0.600772,0.792360


In [49]:
dw = Downstream(
    train_path="train.parquet",
    test_path="not_only_trx.parquet",
    #params=params,
    result_path='sample_mission.csv'
)

scores = dw.run()
scores

Learning rate set to 0.038205
0:	learn: 0.5959102	test: 0.5959735	best: 0.5959735 (0)	total: 210ms	remaining: 3m 30s
100:	learn: 0.0370369	test: 0.0374235	best: 0.0374235 (100)	total: 17.9s	remaining: 2m 39s
200:	learn: 0.0366216	test: 0.0371255	best: 0.0371255 (200)	total: 33.2s	remaining: 2m 11s
300:	learn: 0.0363699	test: 0.0369767	best: 0.0369767 (300)	total: 48.1s	remaining: 1m 51s
400:	learn: 0.0361791	test: 0.0368828	best: 0.0368828 (400)	total: 1m 2s	remaining: 1m 33s
500:	learn: 0.0360141	test: 0.0368122	best: 0.0368122 (500)	total: 1m 17s	remaining: 1m 16s
600:	learn: 0.0358738	test: 0.0367597	best: 0.0367597 (600)	total: 1m 31s	remaining: 1m
700:	learn: 0.0357422	test: 0.0367168	best: 0.0367168 (700)	total: 1m 45s	remaining: 45.2s
800:	learn: 0.0356179	test: 0.0366756	best: 0.0366756 (800)	total: 2m	remaining: 30s
900:	learn: 0.0355078	test: 0.0366462	best: 0.0366462 (900)	total: 2m 14s	remaining: 14.8s
999:	learn: 0.0354063	test: 0.0366229	best: 0.0366229 (999)	total: 2m 28

Unnamed: 0,client_id,target_1,target_2,target_3,target_4
0,2b7ff0c1c99cefe259ed83c5dfa0a403f2cbc88032b671...,0.001075,0.000085,0.064577,0.001261
1,0433d23e224b7a520656da6181efadb8d556bb293158c9...,0.011717,0.000890,0.005675,0.001740
2,f2ce8b292e5f9f778f3e20db7608ac76dc8812113a2631...,0.055184,0.006200,0.051975,0.007174
3,4f807e8b163c653bcaeff9f925983568f4c3e6b1a1f231...,0.011342,0.002557,0.009909,0.027817
4,64369f6f8ae1b719332ee1bfb2b454e642b2053d2c9b8a...,0.005456,0.000266,0.003647,0.001139
...,...,...,...,...,...
140483,d49a66825bb16ceb5b6a01126e1f2391b085dba8da44ee...,0.010084,0.000265,0.017453,0.002735
140484,f772af6720c0b591d49b97946c5e420c1c077affc0f7c7...,0.011705,0.000767,0.006996,0.012383
140485,06b282335bc4853f888e1ab50a6ba23a8e420d42313959...,0.006908,0.000995,0.012473,0.001083
140486,90d423a25d7cdaf674f7d78bc37d88830443ff17717e02...,0.012377,0.001453,0.014222,0.026057


# GroupKFold

In [9]:
class Downstream:
    def __init__(
        self,
        train_path,
        test_path,
        n_splits,
        result_path,
        col_id='client_id',
        targets=(
            'target_1',
            'target_2',
            'target_3',
            'target_4'
        )
    ):
        self.train_path = train_path
        self.test_path = test_path

        self.col_id = col_id
        self.all_targets = targets
        self.n_splits = n_splits
        self.result_path = result_path
        self.drop_feat = list(self.all_targets) + [self.col_id]

    def fit(self):

        n_splits = self.n_splits
        scores = []
        clfs = []

        gkf = GroupKFold(n_splits=n_splits)
        X = inf_train_embeddings.drop(columns=self.drop_feat)
        y = inf_train_embeddings[list(self.all_targets)]

        for train_index, val_index in gkf.split(X, y, inf_train_embeddings.client_id.values):
    
            X_train, X_val = X.iloc[train_index], X.iloc[val_index]
            y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        

            clf = CatBoostClassifier(random_seed=42,
                                             #iterations=150,
                                             #learning_rate=0.0081,
                                             #l2_leaf_reg=5,
                                             #depth=7,
                                             max_bin=254,
                                             #auto_class_weights='Balanced',
                                             task_type="GPU",
                                             thread_count=-1,
                                             use_best_model=True,
                                             #leaf_estimation_method='Newton', 
                                             #cat_features=cat_cols,
                                             loss_function='MultiLogloss',
                                             #eval_metric='Accuracy',
                                             #custom_metric=['AUC']
                                    )
            
            clf.fit(X_train, y_train, eval_set=(X_val, y_val),
                    verbose=100, plot=False, early_stopping_rounds=200)
            clfs.append(clf)

            print(roc_auc_score(y_val, clf.predict_proba(X_val), multi_class='ovo'))
            scores.append(roc_auc_score(y_val, clf.predict_proba(X_val), multi_class='ovo'))
        print(sum(scores) / n_splits)

        return clfs

    def get_scores(
        self,
        clfs
    ):
        scores = pd.DataFrame([])

        #test_embeddings_curr = pd.read_parquet(self.test_path).drop_duplicates('client_id')
        X_test = test_embeddings_curr.drop(columns=[self.col_id])
        ids = test_embeddings_curr[self.col_id]
        score = np.zeros((len(X_test), 4))
        for clf in clfs:
            score += clf.predict_proba(X_test) / self.n_splits
        scores = pd.concat([ids, pd.DataFrame(score,
                                             columns=['target_1',
                                                      'target_2',
                                                      'target_3',
                                                      'target_4'])], axis=1)

        return scores

    def run(self):
        clfs = self.fit()
        scores = self.get_scores(clfs)
        
        scores.to_csv(self.result_path)

        return scores

In [10]:
dw = Downstream(
    train_path="train.parquet",
    test_path="not_only_trx.parquet",
    n_splits=5,
    result_path='sample_Gmission400254.csv'
)

scores = dw.run()
scores

Learning rate set to 0.036162
0:	learn: 0.6017062	test: 0.6017277	best: 0.6017277 (0)	total: 393ms	remaining: 6m 32s
100:	learn: 0.0383840	test: 0.0391342	best: 0.0391342 (100)	total: 32.6s	remaining: 4m 50s
200:	learn: 0.0379660	test: 0.0387830	best: 0.0387830 (200)	total: 1m 1s	remaining: 4m 3s
300:	learn: 0.0377697	test: 0.0386326	best: 0.0386326 (300)	total: 1m 28s	remaining: 3m 24s
400:	learn: 0.0376204	test: 0.0385224	best: 0.0385224 (400)	total: 1m 54s	remaining: 2m 51s
500:	learn: 0.0374932	test: 0.0384363	best: 0.0384363 (500)	total: 2m 20s	remaining: 2m 20s
600:	learn: 0.0373912	test: 0.0383717	best: 0.0383717 (600)	total: 2m 46s	remaining: 1m 50s
700:	learn: 0.0373022	test: 0.0383221	best: 0.0383221 (700)	total: 3m 11s	remaining: 1m 21s
800:	learn: 0.0372220	test: 0.0382775	best: 0.0382775 (800)	total: 3m 36s	remaining: 53.8s
900:	learn: 0.0371419	test: 0.0382347	best: 0.0382347 (900)	total: 4m 2s	remaining: 26.6s
999:	learn: 0.0370701	test: 0.0381982	best: 0.0381982 (999)	t

KeyboardInterrupt: 

In [None]:
scores