In [2]:
import gc
import os

import numpy as np
import pandas as pd
import random
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset

import pytorch_lightning as pl
from sklearn.metrics import roc_auc_score

from transformers import DebertaV2ForSequenceClassification, DebertaV2Config, DebertaV2Model

In [3]:
torch.__version__, torch.cuda.is_available()

('2.0.1+cu118', True)

In [4]:
import os
os.environ['TRANSFORMERS_CACHE'] = './cache/'

In [5]:
pl.seed_everything(56, workers=True)

Global seed set to 56


56

In [6]:
columns = ['variantid', 'main_pic_embeddings_resnet_v1', 'name_bert_64']
train_data = pd.read_parquet('./datasets/train_data.parquet', columns=columns).set_index('variantid')
test_data = pd.read_parquet('./datasets/test_data.parquet', columns=columns).set_index('variantid')
train_data

Unnamed: 0_level_0,main_pic_embeddings_resnet_v1,name_bert_64
variantid,Unnamed: 1_level_1,Unnamed: 2_level_1
51195767,"[[0.04603629, 0.18839523, -0.09973055, -0.6636...","[-0.47045058, 0.67237014, 0.48984158, -0.54485..."
53565809,"[[1.1471839, -0.665361, 0.7745614, 0.26716197,...","[-0.6575592, 0.6522429, 0.5426037, -0.54347897..."
56763357,"[[-0.90570974, 1.0296293, 1.0769907, 0.27746, ...","[-0.7384308, 0.70784587, 0.3012653, -0.3583719..."
56961772,"[[0.13133773, -0.5577079, 0.32498044, 0.191717...","[-0.44812852, 0.5283565, 0.28981736, -0.506841..."
61054740,"[[0.21696381, 0.10989461, -0.08012986, 0.69186...","[-0.72692573, 0.75206333, 0.37740713, -0.52502..."
...,...,...
820128810,"[[-1.4492652, -0.80129164, -0.12344764, 0.7194...","[-0.8253241, 0.6785133, 0.53978086, -0.4888316..."
821135769,"[[0.012127608, -0.8534423, 0.5415518, -0.44912...","[-0.7413257, 0.46105132, 0.5639801, -0.5462132..."
822095690,"[[0.4248176, -0.15944786, -0.22844064, 0.42768...","[-0.49261805, 0.56726897, 0.7037877, -0.697246..."
822101044,"[[0.4248176, -0.15944786, -0.22844064, 0.42768...","[-0.44051006, 0.54029673, 0.63768685, -0.68040..."


In [7]:
train_pairs = pd.read_parquet('./datasets/train_pairs_w_target.parquet')
test_pairs = pd.read_parquet('./datasets/test_pairs_wo_target.parquet')
train_pairs['target'] = train_pairs['target'].astype(int)
train_pairs

Unnamed: 0,target,variantid1,variantid2
0,0,51197862,51198054
1,1,53062686,536165289
2,1,53602615,587809782
3,1,53888651,89598677
4,0,56930698,551526166
...,...,...,...
306535,0,817327230,822083612
306536,0,817560551,818069912
306537,0,817854719,817857267
306538,0,820036017,820037019


In [8]:
name_labse_embs = pd.read_parquet('datasets/name_labse_embs.parquet').set_index('variantid')
name_labse_embs

Unnamed: 0_level_0,name_labse_768
variantid,Unnamed: 1_level_1
51195767,"[-0.033874325, 0.03722446, 0.0029757991, 0.068..."
53565809,"[0.015568526, -0.03899538, 0.064447366, 0.0383..."
56763357,"[-0.033072222, -0.04237577, 0.020771954, 0.065..."
56961772,"[0.014727573, -0.025661988, 0.023943473, -0.00..."
61054740,"[0.043145332, -0.052424084, 0.017260496, 0.045..."
...,...
820128810,"[-0.003678058, -0.031628493, 0.0065589263, 0.0..."
821135769,"[-0.06858361, 0.027011767, -0.016400583, -0.02..."
822095690,"[-0.04474233, -0.034224413, 0.026076552, 0.026..."
822101044,"[-0.05541598, 0.000863006, 0.01093415, 0.02208..."


In [9]:
val_pairs = train_pairs[pd.read_csv('./datasets/val_idx.csv', index_col=0).values].copy()
train_pairs = train_pairs[pd.read_csv('./datasets/train_idx.csv', index_col=0).values].copy()

In [8]:
# говнокод, потом красивее сделаю, не запускать

pairs = {}
for target, v, u in zip(train_pairs.target, train_pairs.variantid1, train_pairs.variantid2):
    if not target:
        continue
    if v not in pairs:
        pairs[v] = {}
    if u not in pairs:
        pairs[u] = {}
        
    pairs[v][u] = pairs[u][v] = 1
    
new_pairs = set()
for b in tqdm(pairs):
    for a in pairs[b]:
        for c in pairs[b]:
            if a >= c:
                continue
            if pairs.get(a, {}).get(c, -1) != 1 and pairs.get(c, {}).get(a, -1) != 1:   
                new_pairs.add((a, c))
                
print(len(new_pairs))

tmp = []
for a, c in new_pairs:
    tmp.append({
        'target': 1,
        'variantid1': a,
        'variantid2': c
    })
train_pairs = pd.concat([train_pairs, pd.DataFrame(tmp)])
train_pairs

  0%|          | 0/151476 [00:00<?, ?it/s]

65608


Unnamed: 0,target,variantid1,variantid2
289224,0,510049910,510050530
2357,1,80637721,501978948
49071,0,53565392,89434955
10858,1,77458419,426023153
133164,0,563560139,653233648
...,...,...,...
65603,1,659695305,700171302
65604,1,773459545,776840249
65605,1,750245639,780670219
65606,1,532474266,621912422


In [10]:
class Args:
    batch_size = 128
    
args = Args()

In [11]:
class ItemsDataset(Dataset):
    def __init__(self, pairs, data):
        super().__init__()
        self.pairs = pairs.values
        self.main_pic_embs = data['main_pic_embeddings_resnet_v1']
        self.name_embs = data['name_bert_64']
        self.name_labse_embs = name_labse_embs['name_labse_768']
        self.pairs_len = len(self.pairs)
        
    def __len__(self):
        return self.pairs_len

    def __getitem__(self, idx):
        target, id1, id2 = self.pairs[idx, :]
        return (
            torch.tensor(np.concatenate([self.main_pic_embs[id1][0], self.name_embs[id1], self.name_labse_embs[id1]])),
            torch.tensor(np.concatenate([self.main_pic_embs[id2][0], self.name_embs[id2], self.name_labse_embs[id2]])),
            target
        )

In [12]:
def get_data_loader(pairs, data, batch_size, drop_last, shuffle):
    dataset = ItemsDataset(pairs, data)
    data_loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        num_workers=0,
        drop_last=drop_last,
        shuffle=shuffle,
        pin_memory=True
    )
    return data_loader

In [13]:
def get_loaders(args):
    train_loader = get_data_loader(
        pairs=train_pairs,
        data=train_data,
        batch_size=args.batch_size,
        drop_last=True,
        shuffle=True
    )
    
    val_loader = get_data_loader(
        pairs=val_pairs,
        data=train_data,
        batch_size=args.batch_size,
        drop_last=False,
        shuffle=False
    )
    return train_loader, val_loader

In [14]:
train_loader, val_loader = get_loaders(args)
len(train_loader), len(val_loader)

(1596, 799)

In [15]:
next(iter(val_loader))

[tensor([[-0.0033,  0.3253, -0.3316,  ...,  0.0320, -0.0337,  0.0175],
         [ 0.5036,  1.1200,  0.4510,  ..., -0.0030, -0.0146,  0.0319],
         [ 0.6642,  0.1197, -0.4183,  ..., -0.0487,  0.0181, -0.0069],
         ...,
         [ 0.1602,  0.4336,  1.1637,  ..., -0.0239, -0.0025,  0.0517],
         [ 0.2676,  0.3940,  0.0577,  ..., -0.0442, -0.0347,  0.0413],
         [ 0.3402, -0.2419,  0.6450,  ...,  0.0063,  0.0079,  0.0351]]),
 tensor([[-0.0436,  0.4931, -0.3070,  ...,  0.0513,  0.0016,  0.0265],
         [ 0.4324,  0.8119,  0.3148,  ...,  0.0115, -0.0241,  0.0392],
         [ 0.5154, -0.0099, -0.0911,  ..., -0.0433, -0.0030,  0.0253],
         ...,
         [ 0.2441,  0.4533,  1.0758,  ..., -0.0172, -0.0187,  0.0324],
         [ 0.1112,  0.4518,  0.0059,  ..., -0.0591, -0.0101,  0.0195],
         [ 0.3746, -0.2451,  0.7209,  ...,  0.0077, -0.0119,  0.0352]]),
 tensor([1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1,

In [18]:
class EmbDeberta(pl.LightningModule):
    margin = 0.75
    
    def __init__(self):
        super(EmbDeberta, self).__init__()

        features_num = 128 + 64 + 768

        hidden_size = (features_num + 512) / 2

        self.embedding_dropout = nn.Dropout2d(0.05)

        deberta_cfg = DebertaV2Config(vocab_size = 100,
                              hidden_size = hidden_size,
                              num_hidden_layers = 8, # [1; 4]
                              num_attention_heads = 3, # [1, 3]
                              intermediate_size = 1024,
                              hidden_act = 'gelu',
                              hidden_dropout_prob = 0.1,
                              attention_probs_dropout_prob = 0.1,
                              max_position_embeddings = 512,
                              type_vocab_size = 2,
                              initializer_range = 0.02,
                              layer_norm_eps = 1e-12,
                              pad_token_id = 0,
                              position_embedding_type = 'absolute',
                              use_cache = True,
                              classifier_dropout = None,
                              )

        self.deberta = DebertaV2Model(deberta_cfg, )

        embedding_size = 256
        self.neck = nn.Sequential(
            nn.BatchNorm1d(features_num),
            nn.Linear(features_num, embedding_size, bias=False),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(embedding_size),
            nn.Linear(embedding_size, embedding_size, bias=False),
            nn.BatchNorm1d(embedding_size),
        )
        
    def forward(self, x):

        concated_embeddings = torch.cat(x, dim=-1)
        dropout_embeddings = self.embedding_dropout(concated_embeddings)

        attention_mask = torch.ones(dropout_embeddings.size())
        last_hidden = self.deberta.encoder(concated_embeddings, attention_mask)

        last_hidden = torch.concat([last_hidden[0].mean(1), last_hidden[0].max(1)[0]], -1)

        outputs = self.neck(last_hidden)

        return outputs

    
    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(
            self.parameters(), lr=1e-4, betas=(0.9, 0.999), weight_decay=0.05
        )
        # optimizer = torch.optim.SGD(self.parameters(), lr=1e-1, momentum=0.9, weight_decay=0.0001)
        return (
            {
                "optimizer": optimizer,
            },
        )
    
    def training_step(self, batch, batch_idx):
        # self.log('step', batch_idx, logger=True, on_epoch=True)
        x1, x2, labels = batch
        out1 = self.forward(x1)
        out2 = self.forward(x2)
        
        fnorm = torch.norm(out1, p=2, dim=1, keepdim=True)
        out1 = out1.div(fnorm.expand_as(out1))
        fnorm = torch.norm(out2, p=2, dim=1, keepdim=True)
        out2 = out2.div(fnorm.expand_as(out2))
        
        dists = nn.PairwiseDistance()(out1, out2)
        loss = (labels) * torch.pow(dists, 2) + (1 - labels) * torch.pow(torch.clamp(self.margin - dists, min=0.0), 2)
        loss = torch.mean(loss)
        self.log("train_loss", loss, on_step=False, logger=False, on_epoch=True, prog_bar=True)
        return loss
    
    def validation_step(self, batch, batch_idx):        
        x1, x2, labels = batch
        out1 = self.forward(x1)
        out2 = self.forward(x2)
        
        fnorm = torch.norm(out1, p=2, dim=1, keepdim=True)
        out1 = out1.div(fnorm.expand_as(out1))
        fnorm = torch.norm(out2, p=2, dim=1, keepdim=True)
        out2 = out2.div(fnorm.expand_as(out2))
        
        dists = nn.PairwiseDistance()(out1, out2)
        loss = (labels) * torch.pow(dists, 2) + (1 - labels) * torch.pow(torch.clamp(self.margin - dists, min=0.0), 2)
        loss = torch.mean(loss)
        self.log("val_loss", loss, logger=False, on_epoch=True, prog_bar=True)   
        
        try:
            auc = roc_auc_score(labels.detach().cpu(), 1 - dists.detach().cpu())
        except:
            auc = 0
            
        self.log("val_auc", auc, logger=False, on_epoch=True, prog_bar=True)
        
    def train_dataloader(self):
        return train_loader

    def val_dataloader(self):
        return val_loader
    
    def predict_step(self, batch, batch_idx):
        x1, x2, labels = batch
        out1 = self.forward(x1)
        out2 = self.forward(x2)
        
        fnorm = torch.norm(out1, p=2, dim=1, keepdim=True)
        out1 = out1.div(fnorm.expand_as(out1))
        fnorm = torch.norm(out2, p=2, dim=1, keepdim=True)
        out2 = out2.div(fnorm.expand_as(out2))
        
        dists = nn.PairwiseDistance()(out1, out2)
        return torch.cat([out1, out2, (1 - dists).unsqueeze(-1)], dim=1).detach().cpu()

In [19]:
model = EmbDeberta()

TypeError: empty(): argument 'size' must be tuple of ints, but found element of type float at pos 2

In [None]:
import torchinfo

torchinfo.summary(model)

In [23]:
trainer = pl.Trainer(
    logger=False, # CSVLogger('./'),
    enable_checkpointing=False,
    
    accelerator='gpu', 
    devices=1,
    profiler='advanced',
    precision=16,
    check_val_every_n_epoch=1,
    max_epochs=10
)

trainer.fit(model)

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(

  rank_zero_warn(



Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [24]:
test_preds = trainer.predict(model, val_loader)

  rank_zero_warn(



Predicting: 0it [00:00, ?it/s]

In [25]:
roc_auc_score(val_pairs.target, np.concatenate([pred.numpy()[:, -1] for pred in test_preds]))

0.8705586711845904

In [26]:
train_dataset = ItemsDataset(train_pairs, train_data)
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=args.batch_size,
    num_workers=0,
    drop_last=False,
    shuffle=False,
    pin_memory=True
)

In [27]:
features = np.concatenate([pred.numpy() for pred in trainer.predict(model, train_loader)])

  rank_zero_warn(



Predicting: 0it [00:00, ?it/s]

In [28]:
val_features = np.concatenate([pred.numpy() for pred in trainer.predict(model, val_loader)])

Predicting: 0it [00:00, ?it/s]

In [29]:
from catboost import CatBoostClassifier, Pool, cv

  from pandas import MultiIndex, Int64Index



In [30]:
train_pool = Pool(
    data=features,
    label=train_pairs.target,
)

val_pool = Pool(
    data=val_features,
    label=val_pairs.target,
)

In [47]:
params = {
    'loss_function': 'Logloss',
    'custom_metric': ['AUC'],
    'task_type': 'CPU',
}

In [48]:
model_cb = CatBoostClassifier(**params, random_seed=56)
model_cb.fit(train_pool, eval_set=val_pool, verbose=False, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Default metric period is 5 because AUC is/are not implemented for GPU


<catboost.core.CatBoostClassifier at 0x1a692774700>

In [44]:
np.max(model_cb.get_evals_result()['validation']['AUC'])

0.8793450626201349

In [46]:
from sklearn.metrics import precision_recall_curve, auc

precision, recall, thrs = precision_recall_curve(val_pairs.target, model_cb.predict_proba(val_pool)[:, 1])
gt_prec_level_idx = np.where(precision >= 0.75)[0]
auc(recall[gt_prec_level_idx], precision[gt_prec_level_idx])

0.7306184112482237