In [1]:
#!pip3 install pytorch-lifestream

In [1]:
import os
os.environ["OMP_NUM_THREADS"] = "4"

import pandas as pd
import numpy as np
import torch
from functools import partial
import pytorch_lightning as pl
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import StratifiedKFold, train_test_split, GroupKFold
from sklearn.metrics import roc_auc_score

from torch.utils.data import DataLoader

from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing.iterable_seq_len_limit import ISeqLenLimit
from ptls.data_load.iterable_processing.to_torch_tensor import ToTorch
from ptls.data_load.iterable_processing.feature_filter import FeatureFilter
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames.coles import CoLESModule
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesIterableDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule
from ptls.preprocessing import PandasDataPreprocessor
from ptls.data_load.utils import collate_feature_dict
from ptls.data_load.iterable_processing_dataset import IterableProcessingDataset

from catboost import CatBoostClassifier

from tqdm.auto import tqdm
import lightgbm as ltb

import gc

# Data preprocessing

In [2]:
# Only the client ids of this table are used below (to choose the training
# subset) before the frame is deleted, so load just that column instead of
# materialising every transaction feature.
transactions_train = pd.read_parquet("./Hackathon/trx_train.parquet", columns=["client_id"])
#transactions_test = pd.read_parquet("./Hackathon/trx_test.parquet")

In [3]:
transactions_train.client_id.unique()

array(['51f88b707d9c8766cdbdfe799ba0fafd8f5b8034b08c46f0e2d0e2c2139b4b94',
       '522256f58adeaf13fbf098859fe5627f663ab586cb71e73f31ca2516062ef219',
       '522ff0793af20dff019519fddea3117f8858c9823ba36402914c3105d2370e6a',
       ...,
       '30f46a0b5caca4e9816dc93af79cd5662c416d81aa90069ebcb6b78d2d53accf',
       '30f6deab32f4b4710dd822551b0e6a3416fe2f807515321fd21be5a1842d46fe',
       '31084e097aab4237336f988ed5e667318409b4dc7afc04159caeb27a0433b455'],
      dtype=object)

In [4]:
# Subset of clients used for pretraining: the first 400000 distinct ids.
thclients = transactions_train["client_id"].unique()[:400000]

In [5]:
geo_train = pd.read_parquet("./Hackathon/geo_train.parquet")
geo_test = pd.read_parquet("./Hackathon/geo_test.parquet")

In [9]:
# Keep only the geo events that belong to the selected clients, then show the frame.
geo_train = geo_train.loc[geo_train["client_id"].isin(thclients)]
geo_train

Unnamed: 0,client_id,event_time,geohash_4,geohash_5,geohash_6
0,309c0e909835757db908884e80f28f2fc76b803904b4b5...,2022-08-27 09:56:36.271169,39879,144891,1959174
1,309c0e909835757db908884e80f28f2fc76b803904b4b5...,2022-08-14 07:13:23.011804,39879,144891,1959174
2,309c0e909835757db908884e80f28f2fc76b803904b4b5...,2022-08-02 07:46:18.278369,39879,144891,1959174
3,309c0e909835757db908884e80f28f2fc76b803904b4b5...,2022-08-19 08:47:39.973788,39879,144891,1959174
4,309c0e909835757db908884e80f28f2fc76b803904b4b5...,2022-08-19 10:15:14.676360,39879,144891,1959174
...,...,...,...,...,...
275844629,fff9ca251a51dd97419ce3e1561310c1807fc235a5d55b...,2022-01-16 11:05:53.568095,5169,110999,1489248
275844630,fff9ca251a51dd97419ce3e1561310c1807fc235a5d55b...,2022-01-09 09:44:59.786908,5169,110999,1489248
275844631,fff9ca251a51dd97419ce3e1561310c1807fc235a5d55b...,2022-01-18 17:39:25.619824,5169,110999,1489248
275844632,fff9ca251a51dd97419ce3e1561310c1807fc235a5d55b...,2022-05-18 16:43:38.411767,17663,86427,2549402


In [10]:
del transactions_train

In [11]:
#transactions_test = transactions_test[transactions_test['client_id'].isin(thclients2)]

In [12]:
%%time

# The three geohash resolutions are the only event features; they are encoded
# as categorical columns and event_time is converted to a numeric timestamp.
preprocessor = PandasDataPreprocessor(
    col_id="client_id",
    col_event_time="event_time",
    event_time_transformation="dt_to_timestamp",
    cols_category=[f"geohash_{level}" for level in (4, 5, 6)],
    return_records=False,
)

# Fit on the train events only; the test events reuse the fitted preprocessor.
processed_train = preprocessor.fit_transform(geo_train)
processed_test = preprocessor.transform(geo_test)

CPU times: user 13min 22s, sys: 1min 46s, total: 15min 8s
Wall time: 15min 28s


In [13]:
target_train = pd.read_parquet("./Hackathon/train_target.parquet")

In [15]:
# The monthly target table goes through the same preprocessor class: `mon`
# plays the role of the event time and the four targets are passed through
# unchanged as identity columns.
target_preprocessor = PandasDataPreprocessor(
    col_id="client_id",
    col_event_time="mon",
    event_time_transformation="dt_to_timestamp",
    cols_identity=[f"target_{i}" for i in range(1, 5)],
    return_records=False,
)

processed_target = target_preprocessor.fit_transform(target_train)

In [37]:
test_target_b = pd.read_parquet("./Hackathon/test_target_b.parquet")

In [16]:
del geo_train, geo_test, target_train
gc.collect()

0

**Обработка датасета:**

- Последовательности, у которых длина < min_seq_len, выкидываются
- Последовательности, у которых длина > max_seq_len, обрезаются; затем данные конвертируются в torch.tensor
- Ненужные для CoLES фичи удаляются

In [15]:
def _coles_filters():
    """Return a fresh filter chain for CoLES pretraining data.

    The chain drops the id/target fields, applies the minimum length of 32,
    limits sequences to 4096 events and converts the arrays to torch tensors.
    A new list is built on every call so each dataset owns its own filter
    objects.
    """
    return [
        FeatureFilter(drop_feature_names=['client_id', 'target_1', 'target_2', 'target_3', 'target_4']),
        SeqLenFilter(min_seq_len=32),
        ISeqLenLimit(max_seq_len=4096),
        ToTorch(),
    ]


train = MemoryMapDataset(data=processed_train.to_dict("records"), i_filters=_coles_filters())

test = MemoryMapDataset(data=processed_test.to_dict("records"), i_filters=_coles_filters())

In [16]:
def _make_coles_dataset(data):
    """Wrap `data` in a ColesIterableDataset that samples 5 slices of 32..180 events."""
    slice_sampler = SampleSlices(split_count=5, cnt_min=32, cnt_max=180)
    return ColesIterableDataset(data=data, splitter=slice_sampler)


train_ds = _make_coles_dataset(train)

# The (unlabelled) test sequences serve as the validation set for pretraining.
valid_ds = _make_coles_dataset(test)

In [17]:
# Note: despite its name, `train_dl` is a PtlsDataModule that carries both the
# training and the validation data.
train_dl = PtlsDataModule(
    train_data=train_ds, train_batch_size=256, train_num_workers=8,
    valid_data=valid_ds, valid_batch_size=256, valid_num_workers=8,
)

# Model

- numeric_values обрабатываются как BatchNorm+Linear
- embeddings — nn.Embedding

In [17]:
# Look the dictionary sizes up once instead of once per feature.
category_sizes = preprocessor.get_category_dictionary_sizes()

# Every geohash resolution gets its own embedding table with 24 output
# dimensions; the input size is the fitted dictionary size of that column.
trx_encoder_params = dict(
    embeddings_noise=0.003,
    #numeric_values={'amount': 'log'},
    embeddings={
        name: {"in": category_sizes[name], "out": 24}
        for name in ("geohash_4", "geohash_5", "geohash_6")
    },
)

- **TrxEncoder** - обрабатывает каждую транзакцию (строит для неё эмбеддинг)
- **SeqEncoder** - обрабатывает последовательность

In [18]:
# Per-event encoder (embedding tables) feeding a GRU with a 256-dim hidden state.
trx_encoder = TrxEncoder(**trx_encoder_params)
seq_encoder = RnnSeqEncoder(trx_encoder=trx_encoder, hidden_size=256, type='gru')

In [19]:
# Factories handed to CoLESModule: Adam with lr=1e-3, and a StepLR scheduler
# that multiplies the learning rate by 0.9025 every 3 scheduler steps.
make_optimizer = partial(torch.optim.Adam, lr=0.001)
make_lr_scheduler = partial(torch.optim.lr_scheduler.StepLR, step_size=3, gamma=0.9025)

model = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=make_optimizer,
    lr_scheduler_partial=make_lr_scheduler,
)

# Train

In [21]:
# TensorBoard logs are written under ./logdir/baseline_result.
tb_logger = pl.loggers.TensorBoardLogger(save_dir='./logdir', name='baseline_result')

# Log the learning rate every step and keep every checkpoint (save_top_k=-1),
# written each 5000 training steps.
trainer_callbacks = [
    pl.callbacks.LearningRateMonitor(logging_interval='step'),
    pl.callbacks.ModelCheckpoint(every_n_train_steps=5000, save_top_k=-1),
]

trainer = pl.Trainer(
    max_epochs=20,
    limit_val_batches=5000,
    #gpus=[0],
    enable_progress_bar=True,
    gradient_clip_val=0.5,
    logger=tb_logger,
    callbacks=trainer_callbacks,
)

Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
trainer.fit(model, train_dl)

In [25]:
torch.save(model.state_dict(), './modelGEO400K.pt')

In [20]:
# map_location='cpu' lets the checkpoint be restored on a machine without the
# GPU it was saved from; load_state_dict then copies the values into the
# model's own parameters.
model.load_state_dict(torch.load('modelGEO400K.pt', map_location='cpu'))

<All keys matched successfully>

# Inference

Для каждого пользователя известно 12 таргетов, инференс происходит следующим образом:

Чтобы не происходило лика (утечки данных из будущего), нужно для каждого клиента делать срез до текущего месяца:

Берутся все транзакции за первый месяц, им соответствует 1-ый таргет из 12,
потом берутся транзакции за первый и второй месяц пользователя, и им соответствует 2-ой таргет, и так далее.
То есть для данного пользователя, имеющего транзакции за год, мы можем получить 12 эмбеддингов, каждому из которых соответствует 1 таргет.

In [21]:
class GetSplit(IterableProcessingDataset):
    """Expand every source record into one cumulative record per month.

    For each month ``m`` in ``start_month..end_month`` (inclusive) a shallow
    copy of the record is yielded in which:

    * every field except the id and the ``target*`` fields is indexed with the
      mask ``event_time < first day of the month after m``;
    * every field whose name starts with ``'target'`` is replaced by its
      ``m``-th entry (``tensor[m - 1].tolist()``);
    * ``'_month=<m>'`` is appended to the id.

    Parameters
    ----------
    start_month, end_month : int
        First and last month number to emit (1..12).
    year : int
        Calendar year used to compute the month boundaries.
    col_id : str
        Name of the id field.
    col_time : str
        Name of the event timestamp field.
    """

    def __init__(
        self,
        start_month,
        end_month,
        year=2022,
        col_id='client_id',
        col_time='event_time'
    ):
        super().__init__()
        self.start_month = start_month
        self.end_month = end_month
        self._year = year
        self._col_id = col_id
        self._col_time = col_time

    def _month_cutoff(self, month):
        """Return the timestamp of the first day of the month following `month`."""
        # Imported locally so the class does not rely on a `datetime` name that
        # a different notebook cell has to bring into scope first.
        from datetime import datetime

        # NOTE(review): naive datetimes are converted with the local time zone;
        # confirm this matches how `event_time` was turned into timestamps.
        if month == 12:
            return datetime(self._year + 1, 1, 1).timestamp()
        return datetime(self._year, month + 1, 1).timestamp()

    def __iter__(self):
        months = range(self.start_month, self.end_month + 1)
        # The boundaries do not depend on the record: compute them once per pass.
        cutoffs = {month: self._month_cutoff(month) for month in months}

        for rec in self._src:
            source = rec[0] if type(rec) is tuple else rec
            for month in months:
                # Shallow copy: values are re-bound below, never modified in place.
                features = source.copy()
                mask = features[self._col_time] < cutoffs[month]

                for key, tensor in features.items():
                    if key.startswith('target'):
                        features[key] = tensor[month - 1].tolist()
                    elif key != self._col_id:
                        features[key] = tensor[mask]

                features[self._col_id] += '_month=' + str(month)

                yield features

def collate_feature_dict_with_target(batch, col_id='client_id', col_time='event_time',  targets=False):
    """Collate feature dicts while carrying ids (and optionally targets) alongside.

    The id, the event-time field and, when ``targets`` is true, the fields
    ``target_1`` .. ``target_4`` are taken out of every sample; the remaining
    fields are passed to ``collate_feature_dict``.

    Parameters
    ----------
    batch : list of dict
        Samples to collate. They are not modified.
    col_id : str
        Name of the id field.
    col_time : str
        Name of the event-time field.
    targets : bool
        Whether the samples carry ``target_1`` .. ``target_4``.

    Returns
    -------
    tuple
        ``(padded_batch, batch_ids, events, target_cols)`` when ``targets`` is
        true, otherwise ``(padded_batch, batch_ids)``.
    """
    target_names = [f'target_{i}' for i in range(1, 5)]
    excluded = {col_id, col_time}
    if targets:
        excluded.update(target_names)

    batch_ids = []
    events = []
    target_cols = []
    model_inputs = []
    for sample in batch:
        batch_ids.append(sample[col_id])
        events.append(sample[col_time])
        if targets:
            target_cols.append([sample[name] for name in target_names])
        # Build a stripped copy instead of deleting keys from `sample`: if the
        # dataset hands out its stored dicts, deleting would break a second pass.
        model_inputs.append({key: value for key, value in sample.items() if key not in excluded})

    padded_batch = collate_feature_dict(model_inputs)
    if targets:
        return padded_batch, batch_ids, events, target_cols
    return padded_batch, batch_ids


class InferenceModuleMultimodal(pl.LightningModule):
    """Run a sequence encoder on collated batches and return ids, targets and embeddings.

    Parameters
    ----------
    model : callable
        Encoder applied to the padded batch.
    pandas_output : bool
        If true, ``forward`` returns a DataFrame built by ``to_pandas``;
        otherwise it returns the raw dict.
    drop_seq_features : bool
        Stored on the instance; not used by the code in this class.
    model_out_name : str
        Key (and column-name prefix) under which the model output is stored.
    """

    def __init__(self, model, pandas_output=True, drop_seq_features=True, model_out_name='out'):
        super().__init__()

        self.model = model
        self.pandas_output = pandas_output
        self.drop_seq_features = drop_seq_features
        self.model_out_name = model_out_name

    def forward(self, x):
        """Encode one batch.

        ``x`` is either ``(padded_batch, batch_ids, events, target_cols)`` or
        ``(padded_batch, batch_ids)``; the 4-tuple form adds the columns
        ``target_1`` .. ``target_4`` to the output.
        """
        x_len = len(x)
        if x_len == 4:
            x, batch_ids, events, target_cols = x
        else:
            x, batch_ids = x

        out = self.model(x)
        if x_len == 4:
            target_cols = torch.tensor(target_cols)
            x_out = {
                'client_id': batch_ids,
                #'event_time': events,
                'target_1': target_cols[:, 0],
                'target_2': target_cols[:, 1],
                'target_3': target_cols[:, 2],
                'target_4': target_cols[:, 3],
                self.model_out_name: out
            }
        else:
            x_out = {
                'client_id': batch_ids,
                self.model_out_name: out
            }
        # Called after every batch, as in the original implementation.
        torch.cuda.empty_cache()

        if self.pandas_output:
            return self.to_pandas(x_out)
        return x_out

    @staticmethod
    def to_pandas(x):
        """Convert a dict of lists / arrays / tensors into one DataFrame.

        Lists and 1-D values become single columns, 2-D values are expanded
        into columns named ``<key>_0000``, ``<key>_0001``, ...; values of any
        other rank become a column of ``None``.
        """
        scalar_features = {}
        matrix_features = {}

        for k, v in x.items():
            if isinstance(v, torch.Tensor):
                v = v.detach().cpu().numpy()

            if type(v) is list or len(v.shape) == 1:
                scalar_features[k] = v
            elif len(v.shape) == 2:
                # Keep the converted array so 2-D numpy inputs work too and
                # tensors are not transferred a second time.
                matrix_features[k] = v
            else:
                scalar_features[k] = None

        dataframes = [pd.DataFrame(scalar_features)]
        for col, v in matrix_features.items():
            dataframes.append(pd.DataFrame(v, columns=[f'{col}_{i:04d}' for i in range(v.shape[1])]))

        return pd.concat(dataframes, axis=1)

In [22]:
%%time
from datetime import datetime

# Non-sequence fields that have to survive the feature filter.
KEPT_FIELDS = ['client_id', 'target_1', 'target_2', 'target_3', 'target_4']

# Train: events joined with the per-client targets, then expanded by GetSplit
# into one cumulative record per month (1..12).
train = MemoryMapDataset(
    data=(
        processed_train
        .merge(processed_target.drop(columns="event_time"), on="client_id", how="inner")
        .to_dict("records")
    ),
    i_filters=[
        ISeqLenLimit(max_seq_len=4096),
        FeatureFilter(keep_feature_names=list(KEPT_FIELDS)),
        GetSplit(start_month=1, end_month=12),
        ToTorch(),
    ],
)

# Test: one record per client, no month split.
test = MemoryMapDataset(
    data=processed_test.to_dict("records"),
    i_filters=[
        ISeqLenLimit(max_seq_len=4096),
        FeatureFilter(keep_feature_names=list(KEPT_FIELDS)),
        ToTorch(),
    ],
)

CPU times: user 6min 6s, sys: 31.1 s, total: 6min 37s
Wall time: 7min 7s


In [23]:
# Both inference loaders use the same ordering and batching settings.
inference_loader_kwargs = dict(shuffle=False, num_workers=0, batch_size=256)

# Train batches also carry the four targets through the collate function.
inference_train_dl = DataLoader(
    dataset=train,
    collate_fn=partial(collate_feature_dict_with_target, targets=True),
    **inference_loader_kwargs,
)

inference_test_dl = DataLoader(
    dataset=test,
    collate_fn=collate_feature_dict_with_target,
    **inference_loader_kwargs,
)

In [24]:
# Embedding columns are named emb_0000, emb_0001, ... in the resulting frames.
inf_module = InferenceModuleMultimodal(model=model, pandas_output=True, drop_seq_features=True, model_out_name='emb')

In [25]:
trainer = pl.Trainer( max_epochs=-1)

Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [31]:
# Each predicted batch is a DataFrame with its own 0..batch_size-1 index;
# ignore_index=True gives the combined frame a unique index.
inf_test_embeddings = pd.concat(
    trainer.predict(inf_module, inference_test_dl),
    ignore_index=True,
)
inf_test_embeddings.to_parquet("test400GEO.parquet", index=False, engine="pyarrow", compression="snappy")

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting: |          | 0/? [00:00<?, ?it/s]

In [32]:
#del inf_test_embeddings

In [26]:
# Each predicted batch is a DataFrame with its own 0..batch_size-1 index;
# ignore_index=True gives the combined frame a unique index.
inf_train_embeddings = pd.concat(
    trainer.predict(inf_module, inference_train_dl),
    ignore_index=True,
)

inf_train_embeddings.to_parquet("train400GEO.parquet", index=False, engine="pyarrow", compression="snappy")

You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting: |          | 0/? [00:00<?, ?it/s]

In [34]:
#del inf_train_embeddings

In [35]:
del train, test
gc.collect()

38

In [36]:
del processed_train, processed_target

Файл **sample_submission** составляется из **client_id** файла **test_target_b**. Так как не у всех пользователей может быть транзакционная история, мы для простоты заполняем их фичи нулями.

In [38]:
# One row per client of the submission file; clients without any embedding
# get all-zero features after the left join.
submission_clients = pd.DataFrame({"client_id": test_target_b["client_id"].unique()})
not_only_trx = submission_clients.merge(inf_test_embeddings, on="client_id", how="left").fillna(0)
not_only_trx

Unnamed: 0,client_id,emb_0000,emb_0001,emb_0002,emb_0003,emb_0004,emb_0005,emb_0006,emb_0007,emb_0008,...,emb_0246,emb_0247,emb_0248,emb_0249,emb_0250,emb_0251,emb_0252,emb_0253,emb_0254,emb_0255
0,2b7ff0c1c99cefe259ed83c5dfa0a403f2cbc88032b671...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0433d23e224b7a520656da6181efadb8d556bb293158c9...,-0.079782,-0.969492,-0.977693,0.003577,-0.592843,0.007499,0.035641,0.171470,-0.054329,...,0.758155,-0.859335,0.975766,-0.582328,0.828840,0.898814,-0.068011,-0.003235,0.985191,0.000064
2,f2ce8b292e5f9f778f3e20db7608ac76dc8812113a2631...,0.021423,-0.806459,-0.901403,0.001507,0.429066,0.009927,0.065101,0.058099,-0.067164,...,0.446152,-0.791896,0.898147,-0.554653,0.893408,0.916473,-0.076498,-0.000056,0.937889,0.002231
3,4f807e8b163c653bcaeff9f925983568f4c3e6b1a1f231...,-0.065214,-0.848756,-0.691887,0.010570,-0.313650,0.009247,0.036555,-0.039442,-0.056636,...,0.041754,-0.676718,0.155166,-0.168651,0.916164,-0.313950,-0.067496,-0.000176,0.985964,-0.024562
4,64369f6f8ae1b719332ee1bfb2b454e642b2053d2c9b8a...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140483,d49a66825bb16ceb5b6a01126e1f2391b085dba8da44ee...,-0.019692,-0.914585,-0.952271,0.009216,0.532842,0.004078,0.033983,-0.039948,-0.056393,...,0.206245,-0.338506,0.285738,-0.147404,0.986450,0.892279,-0.070931,-0.028312,0.996237,0.002643
140484,f772af6720c0b591d49b97946c5e420c1c077affc0f7c7...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
140485,06b282335bc4853f888e1ab50a6ba23a8e420d42313959...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
140486,90d423a25d7cdaf674f7d78bc37d88830443ff17717e02...,0.063879,-0.909083,0.098712,0.007632,0.111175,0.004575,0.003015,-0.199760,-0.057494,...,0.216408,-0.828551,0.587160,0.801591,0.975403,0.593659,-0.087078,-0.004982,0.999945,-0.066750


In [39]:
not_only_trx.to_parquet("not_only_trx400GEO.parquet", index=False, engine="pyarrow", compression="snappy")

# Downstream

Использование эмбеддингов для даунстрим задачи. Для всех таргетов одни и те же параметры бустинга для простоты

In [27]:
inf_train_embeddings

Unnamed: 0,client_id,target_1,target_2,target_3,target_4,emb_0000,emb_0001,emb_0002,emb_0003,emb_0004,...,emb_0246,emb_0247,emb_0248,emb_0249,emb_0250,emb_0251,emb_0252,emb_0253,emb_0254,emb_0255
0,000006265d27d1166ed67506682be7380007a5bead4362...,0,0,0,0,-0.010821,-0.358713,-0.373892,0.002149,0.055396,...,-0.039642,-0.100953,0.191086,-0.042847,0.312214,0.340004,-0.056945,0.002506,0.438022,0.003930
1,000006265d27d1166ed67506682be7380007a5bead4362...,0,0,0,0,-0.011288,-0.359721,-0.373101,0.002154,0.056883,...,-0.040242,-0.101397,0.193064,-0.040120,0.313504,0.337553,-0.056971,0.002423,0.438967,0.004521
2,000006265d27d1166ed67506682be7380007a5bead4362...,0,0,0,0,-0.011288,-0.359721,-0.373101,0.002154,0.056883,...,-0.040242,-0.101397,0.193064,-0.040120,0.313504,0.337553,-0.056971,0.002423,0.438967,0.004521
3,000006265d27d1166ed67506682be7380007a5bead4362...,0,0,0,0,-0.011288,-0.359721,-0.373101,0.002154,0.056883,...,-0.040242,-0.101397,0.193064,-0.040120,0.313504,0.337553,-0.056971,0.002423,0.438967,0.004521
4,000006265d27d1166ed67506682be7380007a5bead4362...,0,0,0,0,-0.011288,-0.359721,-0.373101,0.002154,0.056883,...,-0.040242,-0.101397,0.193064,-0.040120,0.313504,0.337553,-0.056971,0.002423,0.438967,0.004521
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,ffffda41a92ae10c8ae3920828129ef09c1517ba7c74cb...,0,0,0,0,0.019133,-0.527639,-0.596632,0.004052,-0.153703,...,-0.031106,-0.465766,0.647616,-0.378529,0.543042,0.598562,-0.067489,0.004948,0.648541,0.002145
124,ffffda41a92ae10c8ae3920828129ef09c1517ba7c74cb...,0,0,0,0,0.011964,-0.533997,-0.607840,0.004156,-0.121165,...,-0.028032,-0.463154,0.658540,-0.395734,0.508318,0.607608,-0.068555,0.005112,0.659747,0.002141
125,ffffda41a92ae10c8ae3920828129ef09c1517ba7c74cb...,0,0,0,0,-0.007838,-0.555933,-0.630796,0.004767,-0.066078,...,-0.073518,-0.412867,0.700627,-0.374158,0.402481,0.634845,-0.071746,0.005421,0.693440,0.002322
126,ffffda41a92ae10c8ae3920828129ef09c1517ba7c74cb...,0,0,0,0,-0.005955,-0.560003,-0.632798,0.004930,-0.059928,...,-0.093663,-0.414656,0.708307,-0.357971,0.365867,0.639912,-0.072080,0.005424,0.699048,0.002421


In [2]:
# Reload the embeddings saved by the inference cells above.
inf_train_embeddings = pd.read_parquet("train400GEO.parquet")
test_embeddings_curr = (
    pd.read_parquet("not_only_trx400GEO.parquet")
    .drop_duplicates(subset='client_id')
)

In [3]:
from sklearn.model_selection import train_test_split

class Downstream:
    """Train one CatBoost classifier per target on the embeddings and score the test set.

    The data is taken from the notebook globals ``inf_train_embeddings`` and
    ``test_embeddings_curr``; ``train_path`` and ``test_path`` are only stored
    on the instance and are not read by this class.

    Parameters
    ----------
    train_path, test_path : str
        Stored as attributes.
    result_path : str
        CSV file the scores are written to by ``run``.
    col_id : str
        Name of the id column.
    targets : tuple of str
        Names of the binary target columns.
    """

    def __init__(
        self,
        train_path,
        test_path,
        result_path,
        col_id='client_id',
        targets=('target_1', 'target_2', 'target_3', 'target_4')
    ):
        self.train_path = train_path
        self.test_path = test_path
        self.result_path = result_path

        self.col_id = col_id
        self.all_targets = targets
        # Everything that is not an embedding feature.
        self.drop_feat = list(self.all_targets) + [self.col_id]

    def fit(self):
        """Fit one classifier per target; return them as ``{target_name: classifier}``."""
        features = inf_train_embeddings.drop(columns=self.drop_feat)
        labels = inf_train_embeddings[list(self.all_targets)]

        # Ordered 80/20 split (no shuffling, so random_state has no effect).
        # It is the same for every target, hence it is done once up front.
        X_train, X_val, labels_train, labels_val = train_test_split(
            features, labels, shuffle=False, train_size=0.8, random_state=33
        )

        fitted = {}
        for col_target in self.all_targets:
            booster = CatBoostClassifier(
                random_seed=42,
                auto_class_weights='Balanced',
                task_type="GPU",
                thread_count=-1,
                use_best_model=True,
                eval_metric='AUC',
            )
            booster.fit(
                X_train,
                labels_train[col_target],
                eval_set=(X_val, labels_val[col_target]),
                verbose=100,
                plot=False,
                early_stopping_rounds=200,
            )
            print(f'Model fitted, target: {col_target}')
            fitted[col_target] = booster

        return fitted

    def get_scores(
        self,
        clfs
    ):
        """Return a frame with the id column and the positive-class probability per target."""
        X_test = test_embeddings_curr.drop(columns=[self.col_id])
        scores = pd.DataFrame({self.col_id: test_embeddings_curr[self.col_id]})

        for col_target in self.all_targets:
            scores[col_target] = clfs[col_target].predict_proba(X_test)[:, 1]

        return scores

    def run(self):
        """Fit, score, write the scores to ``result_path`` as CSV and return them."""
        scores = self.get_scores(self.fit())
        # NOTE(review): the frame index is written as the first CSV column;
        # confirm that this is what the submission format expects.
        scores.to_csv(self.result_path)
        return scores


In [4]:
# The paths name the parquet files written by the inference cells of this
# notebook (the original arguments pointed to files that are never created).
# The class defined above stores them but reads its data from the notebook globals.
dw = Downstream(
    train_path="train400GEO.parquet",
    test_path="not_only_trx400GEO.parquet",
    #params=params,
    result_path='sample_Gmission.csv'
)

scores = dw.run()
scores

Learning rate set to 0.037292


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5303023	best: 0.5303023 (0)	total: 201ms	remaining: 3m 20s
100:	test: 0.5537325	best: 0.5537325 (100)	total: 7.06s	remaining: 1m 2s
200:	test: 0.5557070	best: 0.5559146 (164)	total: 13.7s	remaining: 54.5s
300:	test: 0.5577069	best: 0.5579054 (288)	total: 20.4s	remaining: 47.3s
400:	test: 0.5583579	best: 0.5583579 (400)	total: 27.2s	remaining: 40.6s
500:	test: 0.5588308	best: 0.5590585 (454)	total: 34s	remaining: 33.8s
600:	test: 0.5590307	best: 0.5592566 (593)	total: 40.7s	remaining: 27s
700:	test: 0.5592971	best: 0.5594791 (628)	total: 47.6s	remaining: 20.3s
800:	test: 0.5592997	best: 0.5596431 (776)	total: 54.4s	remaining: 13.5s
900:	test: 0.5590173	best: 0.5599610 (835)	total: 1m 1s	remaining: 6.72s
999:	test: 0.5584828	best: 0.5599610 (835)	total: 1m 8s	remaining: 0us
bestTest = 0.5599609911
bestIteration = 835
Shrink model to first 836 iterations.
Model fitted, target: target_1
Learning rate set to 0.037292


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5487081	best: 0.5487081 (0)	total: 84.5ms	remaining: 1m 24s
100:	test: 0.5614509	best: 0.5730429 (40)	total: 7.25s	remaining: 1m 4s
200:	test: 0.5646642	best: 0.5730429 (40)	total: 14.3s	remaining: 56.8s
bestTest = 0.5730428994
bestIteration = 40
Shrink model to first 41 iterations.
Model fitted, target: target_2
Learning rate set to 0.037292


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5197283	best: 0.5197283 (0)	total: 69.7ms	remaining: 1m 9s
100:	test: 0.5404677	best: 0.5409179 (95)	total: 6.97s	remaining: 1m 2s
200:	test: 0.5427166	best: 0.5427389 (181)	total: 13.8s	remaining: 54.9s
300:	test: 0.5428427	best: 0.5432316 (206)	total: 20.6s	remaining: 47.8s
400:	test: 0.5436396	best: 0.5436503 (399)	total: 27.3s	remaining: 40.7s
500:	test: 0.5437832	best: 0.5441307 (496)	total: 34s	remaining: 33.9s
600:	test: 0.5443608	best: 0.5446978 (556)	total: 40.9s	remaining: 27.2s
700:	test: 0.5451363	best: 0.5451390 (698)	total: 47.8s	remaining: 20.4s
800:	test: 0.5463372	best: 0.5464424 (788)	total: 54.7s	remaining: 13.6s
900:	test: 0.5453271	best: 0.5467579 (807)	total: 1m 1s	remaining: 6.76s
999:	test: 0.5452418	best: 0.5467579 (807)	total: 1m 8s	remaining: 0us
bestTest = 0.5467579365
bestIteration = 807
Shrink model to first 808 iterations.
Model fitted, target: target_3
Learning rate set to 0.037292


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5347612	best: 0.5347612 (0)	total: 85ms	remaining: 1m 24s
100:	test: 0.5613472	best: 0.5613472 (100)	total: 7.1s	remaining: 1m 3s
200:	test: 0.5657692	best: 0.5657692 (200)	total: 13.9s	remaining: 55.2s
300:	test: 0.5664001	best: 0.5669484 (253)	total: 20.7s	remaining: 48s
400:	test: 0.5678879	best: 0.5678879 (400)	total: 27.6s	remaining: 41.3s
500:	test: 0.5685670	best: 0.5691612 (463)	total: 34.5s	remaining: 34.4s
600:	test: 0.5693790	best: 0.5695400 (552)	total: 41.4s	remaining: 27.5s
700:	test: 0.5700967	best: 0.5705000 (644)	total: 48.3s	remaining: 20.6s
800:	test: 0.5692453	best: 0.5705000 (644)	total: 55.3s	remaining: 13.7s
bestTest = 0.5705000162
bestIteration = 644
Shrink model to first 645 iterations.
Model fitted, target: target_4


Unnamed: 0,client_id,target_1,target_2,target_3,target_4
0,2b7ff0c1c99cefe259ed83c5dfa0a403f2cbc88032b671...,0.036147,0.421306,0.082227,0.009956
1,0433d23e224b7a520656da6181efadb8d556bb293158c9...,0.503002,0.489788,0.484463,0.430997
2,f2ce8b292e5f9f778f3e20db7608ac76dc8812113a2631...,0.493568,0.281766,0.495716,0.577000
3,4f807e8b163c653bcaeff9f925983568f4c3e6b1a1f231...,0.427584,0.504482,0.459869,0.452559
4,64369f6f8ae1b719332ee1bfb2b454e642b2053d2c9b8a...,0.036147,0.421306,0.082227,0.009956
...,...,...,...,...,...
140483,d49a66825bb16ceb5b6a01126e1f2391b085dba8da44ee...,0.443274,0.412949,0.611001,0.581257
140484,f772af6720c0b591d49b97946c5e420c1c077affc0f7c7...,0.036147,0.421306,0.082227,0.009956
140485,06b282335bc4853f888e1ab50a6ba23a8e420d42313959...,0.036147,0.421306,0.082227,0.009956
140486,90d423a25d7cdaf674f7d78bc37d88830443ff17717e02...,0.464729,0.498375,0.358304,0.541715


In [48]:
from sklearn.metrics import roc_auc_score

class Downstream:
    """Train a single multi-label CatBoost model on the embeddings and score the test set.

    The data is taken from the notebook globals ``inf_train_embeddings`` and
    ``test_embeddings_curr``; ``train_path`` and ``test_path`` are only stored
    on the instance and are not read by this class.

    Parameters
    ----------
    train_path, test_path : str
        Stored as attributes.
    result_path : str
        CSV file the scores are written to by ``run``.
    col_id : str
        Name of the id column.
    targets : tuple of str
        Names of the binary target columns (one model output per name).
    """

    def __init__(
        self,
        train_path,
        test_path,
        #params,
        result_path,
        col_id='client_id',
        targets=(
            'target_1',
            'target_2',
            'target_3',
            'target_4'
        )
    ):
        self.train_path = train_path
        self.test_path = test_path

        self.col_id = col_id
        self.all_targets = targets
        #self.params = params
        self.result_path = result_path
        # Everything that is not an embedding feature.
        self.drop_feat = list(self.all_targets) + [self.col_id]

    def fit(self):
        """Fit the MultiLogloss model, print the validation ROC AUC and return the model."""
        X = inf_train_embeddings.drop(columns=self.drop_feat)
        y = inf_train_embeddings[list(self.all_targets)]
        # Ordered 80/20 split; with shuffle=False there is no randomness to seed.
        X_train, X_val, y_train, y_val = train_test_split(X, y, shuffle=False,
                                                          train_size=0.8)

        clf = CatBoostClassifier(random_seed=42,
                                 max_bin=254,
                                 task_type="GPU",
                                 thread_count=-1,
                                 use_best_model=True,
                                 loss_function='MultiLogloss')

        clf.fit(X_train, y_train, eval_set=(X_val, y_val),
                verbose=100, plot=False, early_stopping_rounds=200)

        # y_val is a multi-label indicator matrix, so the score is averaged over
        # the targets; `multi_class` applies to multiclass targets only and is
        # therefore not passed.
        print(roc_auc_score(y_val, clf.predict_proba(X_val)))

        return clf

    def get_scores(
        self,
        clf
    ):
        """Return a frame with the id column and one predicted probability per target."""
        X_test = test_embeddings_curr.drop(columns=[self.col_id])
        # Reset the index so the ids line up positionally with the fresh
        # 0..n-1 index of the probability frame; concat(axis=1) aligns on the
        # index and would otherwise mismatch rows after drop_duplicates.
        ids = test_embeddings_curr[self.col_id].reset_index(drop=True)

        probabilities = pd.DataFrame(clf.predict_proba(X_test),
                                     columns=list(self.all_targets))

        return pd.concat([ids, probabilities], axis=1)

    def run(self):
        """Fit, score, write the scores to ``result_path`` as CSV and return them."""
        clf = self.fit()
        scores = self.get_scores(clf)

        scores.to_csv(self.result_path)

        return scores

In [49]:
# The paths name the parquet files written by the inference cells of this
# notebook (the original arguments pointed to files that are never created).
# The class defined above stores them but reads its data from the notebook globals.
dw = Downstream(
    train_path="train400GEO.parquet",
    test_path="not_only_trx400GEO.parquet",
    #params=params,
    result_path='sample_mission.csv'
)

scores = dw.run()
scores

Learning rate set to 0.038205
0:	learn: 0.5959102	test: 0.5959735	best: 0.5959735 (0)	total: 210ms	remaining: 3m 30s
100:	learn: 0.0370369	test: 0.0374235	best: 0.0374235 (100)	total: 17.9s	remaining: 2m 39s
200:	learn: 0.0366216	test: 0.0371255	best: 0.0371255 (200)	total: 33.2s	remaining: 2m 11s
300:	learn: 0.0363699	test: 0.0369767	best: 0.0369767 (300)	total: 48.1s	remaining: 1m 51s
400:	learn: 0.0361791	test: 0.0368828	best: 0.0368828 (400)	total: 1m 2s	remaining: 1m 33s
500:	learn: 0.0360141	test: 0.0368122	best: 0.0368122 (500)	total: 1m 17s	remaining: 1m 16s
600:	learn: 0.0358738	test: 0.0367597	best: 0.0367597 (600)	total: 1m 31s	remaining: 1m
700:	learn: 0.0357422	test: 0.0367168	best: 0.0367168 (700)	total: 1m 45s	remaining: 45.2s
800:	learn: 0.0356179	test: 0.0366756	best: 0.0366756 (800)	total: 2m	remaining: 30s
900:	learn: 0.0355078	test: 0.0366462	best: 0.0366462 (900)	total: 2m 14s	remaining: 14.8s
999:	learn: 0.0354063	test: 0.0366229	best: 0.0366229 (999)	total: 2m 28

Unnamed: 0,client_id,target_1,target_2,target_3,target_4
0,2b7ff0c1c99cefe259ed83c5dfa0a403f2cbc88032b671...,0.001075,0.000085,0.064577,0.001261
1,0433d23e224b7a520656da6181efadb8d556bb293158c9...,0.011717,0.000890,0.005675,0.001740
2,f2ce8b292e5f9f778f3e20db7608ac76dc8812113a2631...,0.055184,0.006200,0.051975,0.007174
3,4f807e8b163c653bcaeff9f925983568f4c3e6b1a1f231...,0.011342,0.002557,0.009909,0.027817
4,64369f6f8ae1b719332ee1bfb2b454e642b2053d2c9b8a...,0.005456,0.000266,0.003647,0.001139
...,...,...,...,...,...
140483,d49a66825bb16ceb5b6a01126e1f2391b085dba8da44ee...,0.010084,0.000265,0.017453,0.002735
140484,f772af6720c0b591d49b97946c5e420c1c077affc0f7c7...,0.011705,0.000767,0.006996,0.012383
140485,06b282335bc4853f888e1ab50a6ba23a8e420d42313959...,0.006908,0.000995,0.012473,0.001083
140486,90d423a25d7cdaf674f7d78bc37d88830443ff17717e02...,0.012377,0.001453,0.014222,0.026057


# GroupKFold

In [5]:
class Downstream:
    """Multi-label CatBoost downstream model validated with ``GroupKFold``.

    One ``CatBoostClassifier`` (``loss_function='MultiLogloss'``) is trained
    per fold, the fold models' test probabilities are averaged, and the
    result is written to ``result_path`` as CSV.

    NOTE(review): ``train_path`` and ``test_path`` are stored but never read.
    Training data comes from the notebook-level ``inf_train_embeddings`` frame
    and test data from the notebook-level ``test_embeddings_curr`` frame, so
    the cells that create those frames must be executed first.

    Parameters
    ----------
    train_path, test_path :
        Stored on the instance only (see note above).
    n_splits : int
        Number of ``GroupKFold`` folds, i.e. number of models trained.
    result_path :
        Destination passed to ``DataFrame.to_csv`` by :meth:`run`.
    col_id : str
        Identifier column; used as the fold group key and kept in the output.
    targets : sequence of str
        Target column names; also the names of the output probability columns.
    """

    def __init__(
        self,
        train_path,
        test_path,
        n_splits,
        result_path,
        col_id='client_id',
        targets=(
            'target_1',
            'target_2',
            'target_3',
            'target_4'
        )
    ):
        self.train_path = train_path
        self.test_path = test_path

        self.col_id = col_id
        self.all_targets = targets
        self.n_splits = n_splits
        self.result_path = result_path
        # Columns that must not be fed to the model as features.
        self.drop_feat = list(self.all_targets) + [self.col_id]

    def fit(self):
        """Train one classifier per fold and print the validation ROC AUC.

        Reads the notebook-level ``inf_train_embeddings`` frame. Prints the
        ROC AUC of every fold followed by the mean over folds.

        Returns
        -------
        list
            The fitted ``CatBoostClassifier`` objects, one per fold.
        """
        target_cols = list(self.all_targets)
        X = inf_train_embeddings.drop(columns=self.drop_feat)
        y = inf_train_embeddings[target_cols]
        # Group on the configured id column so rows sharing an id never end up
        # on both sides of a split.
        groups = inf_train_embeddings[self.col_id].values

        scores = []
        clfs = []
        gkf = GroupKFold(n_splits=self.n_splits)

        for train_index, val_index in gkf.split(X, y, groups):
            X_train, X_val = X.iloc[train_index], X.iloc[val_index]
            y_train, y_val = y.iloc[train_index], y.iloc[val_index]

            clf = CatBoostClassifier(
                random_seed=42,
                max_bin=254,
                task_type="GPU",
                thread_count=-1,
                use_best_model=True,
                loss_function='MultiLogloss',
            )
            clf.fit(X_train, y_train, eval_set=(X_val, y_val),
                    verbose=100, plot=False, early_stopping_rounds=200)
            clfs.append(clf)

            # Predict once per fold and reuse the value for both the log line
            # and the running list.
            # NOTE(review): with 0/1 indicator targets sklearn treats this as
            # multilabel and presumably ignores `multi_class` - confirm.
            fold_auc = roc_auc_score(y_val, clf.predict_proba(X_val), multi_class='ovo')
            print(fold_auc)
            scores.append(fold_auc)

        print(sum(scores) / len(scores))

        return clfs

    def get_scores(
        self,
        clfs
    ):
        """Average the models' test probabilities into one frame.

        Reads the notebook-level ``test_embeddings_curr`` frame.

        Parameters
        ----------
        clfs : sequence
            Fitted models exposing ``predict_proba``; each must return one
            column per entry of ``targets``.

        Returns
        -------
        pandas.DataFrame
            ``col_id`` followed by one averaged-probability column per target,
            indexed like the test frame.

        Raises
        ------
        ValueError
            If ``clfs`` is empty.
        """
        clfs = list(clfs)
        if not clfs:
            raise ValueError("clfs must contain at least one fitted model")

        target_cols = list(self.all_targets)
        X_test = test_embeddings_curr.drop(columns=[self.col_id])
        ids = test_embeddings_curr[self.col_id]

        score = np.zeros((len(X_test), len(target_cols)))
        for clf in clfs:
            score += clf.predict_proba(X_test)
        # Divide by the number of models actually used, not by n_splits.
        score /= len(clfs)

        # Reuse the ids' index: concat(axis=1) aligns on index labels, so a
        # fresh RangeIndex here would misalign rows whenever the test frame
        # does not carry the default 0..n-1 index.
        proba = pd.DataFrame(score, columns=target_cols, index=ids.index)

        return pd.concat([ids, proba], axis=1)

    def run(self):
        """Fit the fold models, score the test set and save the result.

        Returns
        -------
        pandas.DataFrame
            The frame produced by :meth:`get_scores`, also written to
            ``result_path`` with ``to_csv``.
        """
        clfs = self.fit()
        scores = self.get_scores(clfs)

        scores.to_csv(self.result_path)

        return scores

In [6]:
# Build the GroupKFold downstream runner; its run() trains the fold models,
# scores the test set and saves the frame to `result_path`.
dw = Downstream(
    n_splits=5,
    result_path='sample_GmissionML.csv',
    train_path="train.parquet",
    test_path="not_only_trx.parquet",
)

# Keep the returned frame and show it as the cell output.
scores = dw.run()
scores

Learning rate set to 0.037292
0:	learn: 0.6041587	test: 0.6041577	best: 0.6041577 (0)	total: 304ms	remaining: 5m 3s
100:	learn: 0.0447667	test: 0.0447691	best: 0.0447691 (100)	total: 21s	remaining: 3m 6s
200:	learn: 0.0446100	test: 0.0446847	best: 0.0446847 (199)	total: 40.2s	remaining: 2m 39s
300:	learn: 0.0445140	test: 0.0446588	best: 0.0446588 (300)	total: 59s	remaining: 2m 17s
400:	learn: 0.0444317	test: 0.0446419	best: 0.0446419 (399)	total: 1m 16s	remaining: 1m 55s
500:	learn: 0.0443528	test: 0.0446263	best: 0.0446261 (499)	total: 1m 35s	remaining: 1m 34s
600:	learn: 0.0442812	test: 0.0446158	best: 0.0446158 (600)	total: 1m 53s	remaining: 1m 15s
700:	learn: 0.0442101	test: 0.0446060	best: 0.0446058 (698)	total: 2m 11s	remaining: 56.3s
800:	learn: 0.0441418	test: 0.0445992	best: 0.0445991 (799)	total: 2m 30s	remaining: 37.3s
900:	learn: 0.0440713	test: 0.0445904	best: 0.0445904 (900)	total: 2m 48s	remaining: 18.5s
999:	learn: 0.0440047	test: 0.0445823	best: 0.0445823 (999)	total: 

Unnamed: 0,client_id,target_1,target_2,target_3,target_4
0,2b7ff0c1c99cefe259ed83c5dfa0a403f2cbc88032b671...,0.005059,0.000655,0.010895,0.006276
1,0433d23e224b7a520656da6181efadb8d556bb293158c9...,0.012927,0.001622,0.011659,0.008533
2,f2ce8b292e5f9f778f3e20db7608ac76dc8812113a2631...,0.012220,0.000758,0.011273,0.009265
3,4f807e8b163c653bcaeff9f925983568f4c3e6b1a1f231...,0.009158,0.001873,0.009177,0.006621
4,64369f6f8ae1b719332ee1bfb2b454e642b2053d2c9b8a...,0.005059,0.000655,0.010895,0.006276
...,...,...,...,...,...
140483,d49a66825bb16ceb5b6a01126e1f2391b085dba8da44ee...,0.011189,0.001302,0.016287,0.011656
140484,f772af6720c0b591d49b97946c5e420c1c077affc0f7c7...,0.005059,0.000655,0.010895,0.006276
140485,06b282335bc4853f888e1ab50a6ba23a8e420d42313959...,0.005059,0.000655,0.010895,0.006276
140486,90d423a25d7cdaf674f7d78bc37d88830443ff17717e02...,0.012823,0.001233,0.006897,0.011313


In [None]:
# Re-display the `scores` frame assigned by the most recently executed run cell.
scores