In [1]:
import gc
import os

# Environment variables are assigned BEFORE torch / transformers are imported.
# transformers resolves TRANSFORMERS_CACHE while it is being imported, so
# assigning it after the import leaves the default cache directory in use.
# CUDA_LAUNCH_BLOCKING=1 makes CUDA kernel launches synchronous (a debugging
# aid that slows training); TOKENIZERS_PARALLELISM=false turns off the
# tokenizers library's own thread pool, which conflicts with forked
# DataLoader workers.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
os.environ['TRANSFORMERS_CACHE'] = './cache/'
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import json
import numpy as np
import pandas as pd
import random
from sklearn.metrics import roc_auc_score
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset

import lightning.pytorch as pl

from transformers import AutoTokenizer, AutoModel

In [2]:
import re

# Ordered (pattern, replacement) rules; later rules rely on the output of
# earlier ones (e.g. quote normalization runs before duplicate quotes are
# collapsed), so the order must not change.
_TEXT_PREPROCESS_RULES = (
    ("[“”«»″]", '"'),        # typographic double quotes -> "
    ("‘’", '"'),             # the two-character sequence ‘’ -> "
    ("’", "'"),              # right single quote -> '
    ("[—–]+", '-'),          # runs of em/en dashes -> single hyphen
    ("【", '('),
    ("】", ')'),
    # Raw string: the previous non-raw literal "[\\\]+|/" contained the
    # invalid escape "\]" and only worked by accident (DeprecationWarning,
    # SyntaxWarning on Python 3.12+). The compiled regex is unchanged.
    (r"[\\]+|/", ' '),       # runs of backslashes, or one slash -> space
    ("[\n\t\xa0]", ' '),     # newline, tab, non-breaking space -> space
    ("[\u200b\xad•]", ''),   # zero-width space, soft hyphen, bullet -> removed
    ('"+', '"'),             # collapse repeated double quotes
    (" +", ' '),             # collapse repeated spaces
)


def text_preprocess(text):
    """Normalize a product name before it is tokenized.

    Applies the substitutions in ``_TEXT_PREPROCESS_RULES`` in order, then
    strips leading and trailing whitespace.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned string.
    """
    for pattern, replacement in _TEXT_PREPROCESS_RULES:
        text = re.sub(pattern, replacement, text)
    return text.strip()

In [3]:
# Sanity check: show the installed torch version and whether CUDA is available.
torch.__version__, torch.cuda.is_available()

('2.0.1+cu118', True)

In [4]:
# Seed the global RNGs through Lightning; workers=True extends the seeding to
# DataLoader worker processes.
pl.seed_everything(56, workers=True)

Global seed set to 56


56

In [5]:
# Only the item id and its name are read from the item parquet files below.
columns = ['variantid', 'name']

In [6]:
# Training items: load the selected columns and index them by variantid so
# names can be looked up by id.
train_data = pd.read_parquet('./datasets/train_data.parquet', columns=columns).set_index('variantid')
train_data

Unnamed: 0_level_0,name
variantid,Unnamed: 1_level_1
51195767,"Удлинитель Партнер-Электро ПВС 2х0,75 ГОСТ,6A,..."
53565809,Магнитный кабель USB 2.0 A (m) - USB Type-C (m...
56763357,"Набор микропрепаратов Konus 25: ""Клетки и ткан..."
56961772,"Мобильный телефон BQ 1848 Step, черный"
61054740,"Штатив трипод Tripod 330A для фотоаппаратов, в..."
...,...
820128810,"Комплект 2 шт, Чернила Cactus CS-EPT6733B пурп..."
821135769,"Защитное стекло закаленное Xiaomi Redmi 7, Y3 ..."
822095690,Системный блок ЮКОМС 9400-268 (AMD A6-9400 (3....
822101044,Системный блок ЮКОМС 9400-9 (AMD A6-9400 (3.4 ...


In [7]:
# Test items, loaded and indexed the same way as the training items.
test_data = pd.read_parquet('./datasets/test_data.parquet', columns=columns).set_index('variantid')
test_data

Unnamed: 0_level_0,name
variantid,Unnamed: 1_level_1
51201254,Колодка TDM Electric четырехместная без заземл...
77151532,Клавиатура черная с черной рамкой для 25-011879
89664856,"15.6"" Игровой ноутбук Acer Predator Helios 300..."
90701982,Портативная колонка Borofone BR7 Empyreal Spor...
92484118,Аккумулятор для Meizu BA712 ( M6s )
...,...
702785891,Кабель USB - Lightning HOCO X21 PLUS (черно-бе...
704096517,Блок питания для ноутбука Asus f5gl (19V 90W 4...
705874953,Оперативная память HyperX FURY Black DDR4 2666...
706965102,8 ТБ Внутренний жесткий диск Toshiba TOSHIBA N...


In [8]:
# Labelled item pairs (target, variantid1, variantid2).
train_pairs = pd.read_parquet('./datasets/train_pairs_w_target.parquet')
# Cast the label to int; presumably so that `pairs.values` in ItemsDataset
# yields a single integer array usable for id lookups — TODO confirm.
train_pairs['target'] = train_pairs['target'].astype(int)
train_pairs

Unnamed: 0,target,variantid1,variantid2
0,0,51197862,51198054
1,1,53062686,536165289
2,1,53602615,587809782
3,1,53888651,89598677
4,0,56930698,551526166
...,...,...,...
306535,0,817327230,822083612
306536,0,817560551,818069912
306537,0,817854719,817857267
306538,0,820036017,820037019


In [9]:
# Unlabelled item pairs (variantid1, variantid2) to be scored.
test_pairs = pd.read_parquet('./datasets/test_pairs_wo_target.parquet')
test_pairs

Unnamed: 0,variantid1,variantid2
0,52076340,290590137
1,64525522,204128919
2,77243372,479860557
3,86065820,540678372
4,91566575,258840506
...,...,...
18079,666998614,667074522
18080,670036240,670048449
18081,670284509,684323809
18082,692172005,704805270


In [10]:
# Split the labelled pairs with selectors stored on disk.
# NOTE(review): `.values` of the loaded frame is a 2-D array; presumably these
# files hold boolean row masks — confirm against how the files were written.
# Order matters: val_pairs must be taken before train_pairs is overwritten.
# The cell is not idempotent — re-running it applies the train selector to the
# already filtered frame.
val_pairs = train_pairs[pd.read_csv('./datasets/val_idx.csv', index_col=0).values].copy()
train_pairs = train_pairs[pd.read_csv('./datasets/train_idx.csv', index_col=0).values].copy()

In [11]:
class Args:
    """Hyper-parameters shared by the notebook cells (plain class attributes)."""

    # Data loading / training length.
    batch_size: int = 96
    epochs: int = 5

    # Learning rate passed to the optimizer.
    lr: float = 1e-5

    # Scheduler-related settings; nothing in the visible notebook reads them.
    lr_warmup_epochs: int = 5
    lr_warmup_decay: float = 0.01
    lr_min: float = 1e-5


args = Args()

In [12]:
class ItemsDataset(Dataset):
    """Map-style dataset that yields ``(name1, name2, target)`` per item pair.

    Args:
        pairs: frame whose columns are, positionally, target, first item id,
            second item id.
        data: frame indexed by item id with a ``name`` column.
    """

    def __init__(self, pairs, data):
        super().__init__()
        # Names are cleaned once up front so __getitem__ is a plain lookup.
        self.names = data['name'].apply(text_preprocess)

        # Raw array of the pair rows; columns are unpacked by position.
        self.pairs = pairs.values
        self.pairs_len = len(self.pairs)

    def __len__(self):
        """Number of pairs."""
        return self.pairs_len

    def __getitem__(self, idx):
        """Return the two cleaned names and the target of pair ``idx``."""
        row = self.pairs[idx, :]
        label, first_id, second_id = row
        first_name = self.names[first_id]
        second_name = self.names[second_id]
        return first_name, second_name, label

In [13]:
def get_data_loader(pairs, data, batch_size, drop_last, shuffle, num_workers=35):
    """Build a DataLoader over an ``ItemsDataset`` made from ``pairs`` and ``data``.

    Args:
        pairs: frame of item pairs, forwarded to ``ItemsDataset``.
        data: frame of items, forwarded to ``ItemsDataset``.
        batch_size: number of pairs per batch.
        drop_last: whether to drop the final incomplete batch.
        shuffle: whether to reshuffle the pairs every epoch.
        num_workers: number of loader worker processes. Defaults to 35, the
            value that used to be hard-coded, so existing callers are
            unaffected; pass a smaller value (or 0) on machines with fewer
            cores.

    Returns:
        A ``torch.utils.data.DataLoader`` with pinned memory enabled.
    """
    dataset = ItemsDataset(pairs, data)
    return torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        drop_last=drop_last,
        shuffle=shuffle,
        pin_memory=True,
    )

In [14]:
def get_loaders(args):
    """Create the training and validation loaders.

    Both loaders read item names from the global ``train_data``; the pairs
    come from the globals ``train_pairs`` and ``val_pairs``.

    Args:
        args: object exposing ``batch_size``.

    Returns:
        Tuple ``(train_loader, val_loader)``. The training loader shuffles and
        drops the last incomplete batch; the validation loader does neither.
    """
    def build(pairs, training):
        # `training` toggles shuffling and dropping of the last partial batch.
        return get_data_loader(
            pairs=pairs,
            data=train_data,
            batch_size=args.batch_size,
            drop_last=training,
            shuffle=training,
        )

    loader_for_training = build(train_pairs, True)
    loader_for_validation = build(val_pairs, False)
    return loader_for_training, loader_for_validation

In [15]:
# Build the global loaders (LaBSE.train_dataloader / val_dataloader return
# these) and show the number of batches in each.
train_loader, val_loader = get_loaders(args)
len(train_loader), len(val_loader)

(2128, 1065)

In [16]:
#next(iter(val_loader)) # Спок

In [17]:
class LaBSE(pl.LightningModule):
    """Siamese fine-tuning of ``cointegrated/LaBSE-en-ru`` with a contrastive loss.

    Both texts of a pair go through the same encoder. The pooled output is
    L2-normalized and passed through a linear projection. Pairs labelled 1 are
    pulled together; pairs labelled 0 are pushed at least ``margin`` apart in
    Euclidean distance. All encoder weights are trained (nothing is frozen).
    """

    # Minimum distance demanded between the two embeddings of a negative pair.
    margin = 0.75

    def __init__(self):
        super().__init__()

        self.tokenizer = AutoTokenizer.from_pretrained('cointegrated/LaBSE-en-ru')
        self.model = AutoModel.from_pretrained('cointegrated/LaBSE-en-ru')

        # Projection head applied to the normalized pooled output.
        self.fc = nn.Linear(768, 768)

    def forward(self, x):
        """Embed a batch of raw texts.

        Args:
            x: batch of strings accepted by the tokenizer.

        Returns:
            Tensor with one 768-dimensional embedding per input text.
        """
        # Move the tokenized batch to wherever the module lives instead of a
        # hard-coded 'cuda', so the model also runs on CPU or another GPU.
        encoded_input = self.tokenizer(
            x, padding=True, truncation=True, max_length=256, return_tensors='pt'
        ).to(self.device)
        model_output = self.model(**encoded_input)

        embeddings = torch.nn.functional.normalize(model_output.pooler_output)
        return self.fc(embeddings)

    def configure_optimizers(self):
        """Return AdamW over all parameters; the learning rate comes from the global ``args``."""
        return torch.optim.AdamW(
            self.parameters(), lr=args.lr, betas=(0.9, 0.999), weight_decay=0.05
        )

    def _embed_pair(self, batch):
        """Embed both sides of a batch; return ``(out1, out2, dists, labels)``."""
        x1, x2, labels = batch
        out1 = self.forward(x1)
        out2 = self.forward(x2)
        dists = nn.PairwiseDistance()(out1, out2)
        return out1, out2, dists, labels

    def _contrastive_loss(self, dists, labels):
        """Mean contrastive loss: d^2 for positives, max(margin - d, 0)^2 for negatives."""
        positive_term = labels * torch.pow(dists, 2)
        negative_term = (1 - labels) * torch.pow(torch.clamp(self.margin - dists, min=0.0), 2)
        return torch.mean(positive_term + negative_term)

    def training_step(self, batch, batch_idx):
        """Compute the contrastive loss for one batch and log its epoch mean."""
        _, _, dists, labels = self._embed_pair(batch)
        loss = self._contrastive_loss(dists, labels)
        self.log("train_loss", loss, on_step=False, logger=False, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log the validation loss and the per-batch ROC AUC of ``1 - distance``.

        ``val_auc`` is an average of per-batch AUCs, not the AUC over the whole
        validation set.
        """
        _, _, dists, labels = self._embed_pair(batch)
        loss = self._contrastive_loss(dists, labels)
        self.log("val_loss", loss, logger=False, on_epoch=True, prog_bar=True)

        try:
            auc = roc_auc_score(labels.detach().cpu(), 1 - dists.detach().cpu())
        except ValueError:
            # roc_auc_score rejects the batch (e.g. only one class present).
            # Skip it rather than logging a fake 0 that drags the epoch
            # average down; other exception types are no longer swallowed.
            return

        self.log("val_auc", auc, logger=False, on_epoch=True, prog_bar=True)

    def train_dataloader(self):
        """Return the global ``train_loader``."""
        return train_loader

    def val_dataloader(self):
        """Return the global ``val_loader``."""
        return val_loader

    def predict_step(self, batch, batch_idx):
        """Return ``[emb1 | emb2 | 1 - distance]`` per pair as a CPU tensor.

        The result has 768 + 768 + 1 columns; the labels in the batch are ignored.
        """
        out1, out2, dists, _ = self._embed_pair(batch)
        return torch.cat([out1, out2, (1 - dists).unsqueeze(-1)], dim=1).detach().cpu()

In [18]:
# Instantiate the module; this loads the pretrained tokenizer and encoder.
model = LaBSE()

Some weights of the model checkpoint at cointegrated/LaBSE-en-ru were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
#model

In [20]:
# Print a per-layer parameter count of the model.
import torchinfo

torchinfo.summary(model)

Layer (type:depth-idx)                                  Param #
LaBSE                                                   --
├─BertModel: 1-1                                        --
│    └─BertEmbeddings: 2-1                              --
│    │    └─Embedding: 3-1                              42,303,744
│    │    └─Embedding: 3-2                              393,216
│    │    └─Embedding: 3-3                              1,536
│    │    └─LayerNorm: 3-4                              1,536
│    │    └─Dropout: 3-5                                --
│    └─BertEncoder: 2-2                                 --
│    │    └─ModuleList: 3-6                             85,054,464
│    └─BertPooler: 2-3                                  --
│    │    └─Linear: 3-7                                 590,592
│    │    └─Tanh: 3-8                                   --
├─Linear: 1-2                                           590,592
Total params: 128,935,680
Trainable params: 128,935,680
Non-trainable par

In [None]:
# Single-GPU mixed-precision training with no logger and no checkpoints;
# validation runs after every epoch.
# NOTE(review): profiler='advanced' adds profiling overhead to every hook —
# confirm it is still wanted for full training runs.
trainer = pl.Trainer(
    logger=False, # CSVLogger('./'),
    enable_checkpointing=False,
    
    accelerator='gpu', 
    devices=[0],
    profiler='advanced',
    precision="16-mixed",
    check_val_every_n_epoch=1,
    max_epochs=args.epochs
)

# No loaders are passed: the trainer takes them from model.train_dataloader()
# and model.val_dataloader(), which return the global loaders.
trainer.fit(model)

In [None]:
# Despite the name, these are predictions for the VALIDATION pairs.
test_preds = trainer.predict(model, val_loader)

In [24]:
# ROC AUC over the whole validation set; the last prediction column is the
# similarity score (1 - distance).
roc_auc_score(val_pairs.target, np.concatenate([pred.numpy()[:, -1] for pred in test_preds]))

0.8869868830887131

# Инференс

In [25]:
# Rebuild the training loader for inference: no shuffling and no dropped
# batch, so predictions line up row-for-row with train_pairs.
# Note: this rebinds the global train_loader that LaBSE.train_dataloader()
# returns, so a later trainer.fit(model) would train on this unshuffled loader.
train_dataset = ItemsDataset(train_pairs, train_data)
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=args.batch_size,
    num_workers=37,
    drop_last=False,
    shuffle=False,
    pin_memory=True
)

In [None]:
# Per-pair features for the training pairs: [emb1 | emb2 | 1 - distance].
features = np.concatenate([pred.numpy() for pred in trainer.predict(model, train_loader)])

In [None]:
# Same features for the validation pairs.
val_features = np.concatenate([pred.numpy() for pred in trainer.predict(model, val_loader)])

In [28]:
from catboost import CatBoostClassifier, Pool, cv

In [29]:
# CatBoost pools over the LaBSE-derived features, labelled with the pair targets.
train_pool = Pool(
    data=features,
    label=train_pairs.target,
)

val_pool = Pool(
    data=val_features,
    label=val_pairs.target,
)

In [30]:
# Shared CatBoost settings: optimize log-loss, additionally report AUC, run on CPU.
params = {
    'loss_function': 'Logloss',
    'custom_metric': ['AUC'],
    'task_type': 'CPU',
}

In [31]:
# Fit on the training pairs; val_pool is the eval set used for early stopping
# and for the reported validation metrics.
model_cb = CatBoostClassifier(**params, random_seed=56)
model_cb.fit(train_pool, eval_set=val_pool, verbose=False, plot=True, early_stopping_rounds=100)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x7ff7681ba610>

In [32]:
# Highest validation AUC recorded over the boosting iterations.
np.max(model_cb.get_evals_result()['validation']['AUC'])

0.908631905205787

In [33]:
# 3-fold stratified cross-validation run on the validation pool only (the
# training pool is not used here).
cv_data = cv(
    params=params,
    pool=val_pool,
    fold_count=3,
    shuffle=True,
    partition_random_seed=0,
    plot=True,
    stratified=True,
    verbose=False
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/3]

bestTest = 0.3730102263
bestIteration = 998

Training on fold [1/3]

bestTest = 0.3740584124
bestIteration = 999

Training on fold [2/3]

bestTest = 0.3787170071
bestIteration = 999



In [34]:
# Position of the row with the highest mean test AUC, reported with its
# standard deviation.
max_iter = cv_data['test-AUC-mean'].argmax()
print(f"Best iteration: {max_iter}\nBest AUC: {round(cv_data.iloc[max_iter]['test-AUC-mean'], 4)}±{round(cv_data.iloc[max_iter]['test-AUC-std'], 4)}")

Best iteration: 999
Best AUC: 0.9095±0.0016


In [35]:
# Test pairs have no label: add a placeholder target and put the columns in
# the (target, variantid1, variantid2) order that ItemsDataset.__getitem__
# unpacks positionally. This mutates and rebinds the global test_pairs.
test_pairs['target'] = -1
test_pairs = test_pairs[['target', 'variantid1', 'variantid2']]
test_dataset = ItemsDataset(test_pairs, test_data)
# No shuffling and no dropped batch, so predictions stay aligned with test_pairs.
test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=args.batch_size,
    num_workers=0,
    drop_last=False,
    shuffle=False,
    pin_memory=True
)

In [None]:
# Per-pair features for the test pairs: [emb1 | emb2 | 1 - distance].
test_features = np.concatenate([pred.numpy() for pred in trainer.predict(model, test_loader)])

In [37]:
# Per-item embedding table indexed like train_data. Feature columns 0..767
# hold the first item's embedding, columns 768..1535 the second item's.
# NOTE(review): only embeddings of the VALIDATION pairs are written; items that
# occur only in training pairs stay NaN and `features` is never stored here.
# Confirm this is intentional (e.g. to keep embeddings the encoder was trained
# on out of downstream models) and not an omission.
train_embeds = pd.Series(index=train_data.index, dtype='object', name='labse_tuned_768')
train_embeds[val_pairs.variantid1] = list(val_features[:, :768])
train_embeds[val_pairs.variantid2] = list(val_features[:, 768:768*2])
train_embeds

variantid
51195767     [0.0023719287, 0.04152827, 0.0018715167, 0.035...
53565809     [0.022112135, -0.007468411, -0.0068747965, 0.0...
56763357                                                   NaN
56961772                                                   NaN
61054740     [0.01275732, 0.034349203, 0.008032634, 0.03303...
                                   ...                        
820128810                                                  NaN
821135769    [-0.015492215, -0.01956236, -0.008062369, -0.0...
822095690                                                  NaN
822101044                                                  NaN
822394794                                                  NaN
Name: labse_tuned_768, Length: 457063, dtype: object

In [38]:
# Per-item embedding table for the test items, filled from both sides of
# every test pair (columns 0..767 and 768..1535 of the features).
test_embeds = pd.Series(index=test_data.index, dtype='object', name='labse_tuned_768')
test_embeds[test_pairs.variantid1] = list(test_features[:, :768])
test_embeds[test_pairs.variantid2] = list(test_features[:, 768:768*2])
test_embeds

variantid
51201254     [-0.0036340018, 0.05191699, -0.019618921, 0.03...
77151532     [0.020562124, -0.027739627, 0.003007357, 0.033...
89664856     [0.008961296, 0.0074943216, -0.0042574126, 0.0...
90701982     [0.0002899492, 0.00082552544, -0.0036707376, 0...
92484118     [0.0523341, -0.028439298, -0.0205805, 0.019437...
                                   ...                        
702785891    [0.015837612, -0.020126382, 0.01076397, 0.0433...
704096517    [-0.0046698838, -0.00089860754, -0.025120754, ...
705874953    [-0.012161249, 0.049054068, -0.02890747, 0.063...
706965102    [-0.021119665, 0.045512594, -0.011890485, 0.03...
707476739    [0.004665166, 0.03663141, 0.003135435, 0.04310...
Name: labse_tuned_768, Length: 35730, dtype: object

In [39]:
# Persist the train-item embeddings as a pickled Series.
train_embeds.to_pickle('./datasets/labse_tuned_train.pickle')

In [40]:
# Persist the test-item embeddings as a pickled Series.
test_embeds.to_pickle('./datasets/labse_tuned_test.pickle')

In [42]:
# Save the fine-tuned weights (state dict only, not the module object).
torch.save(model.state_dict(), "./LaBSE.pt")