In [1]:
import gc
import os

# Environment configuration has to happen BEFORE the heavy imports below:
# `transformers` resolves TRANSFORMERS_CACHE when the package is imported, so
# assigning it afterwards (as this cell originally did) leaves the default
# cache directory in use.
# CUDA_LAUNCH_BLOCKING=1 makes CUDA kernel launches synchronous; it is a
# debugging aid and slows training down, so drop it for real runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
os.environ['TRANSFORMERS_CACHE'] = './cache/'
# Silences the tokenizers fork warning when DataLoader workers are spawned.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import json
import random

import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset

import pytorch_lightning as pl

from transformers import AutoTokenizer, AutoModel

In [2]:
# Environment check: installed torch version and whether a CUDA device is usable.
torch.__version__, torch.cuda.is_available()

('2.0.1+cu118', True)

In [3]:
# Fix the global seed through Lightning so runs are repeatable; workers=True
# asks Lightning to derive seeds for DataLoader worker processes as well.
pl.seed_everything(56, workers=True)

Global seed set to 56


56

In [4]:
# Only these columns are read from the item parquet files: the item id
# (used as the index below) and the item name fed to the text encoder.
columns = ['variantid', 'name']

In [5]:
# Item names for the training catalogue, indexed by variantid so that
# ItemsDataset can look a name up directly from a pair's id.
train_data = pd.read_parquet('./datasets/train_data.parquet', columns=columns).set_index('variantid')
train_data

Unnamed: 0_level_0,name
variantid,Unnamed: 1_level_1
51195767,"Удлинитель Партнер-Электро ПВС 2х0,75 ГОСТ,6A,..."
53565809,Магнитный кабель USB 2.0 A (m) - USB Type-C (m...
56763357,"Набор микропрепаратов Konus 25: ""Клетки и ткан..."
56961772,"Мобильный телефон BQ 1848 Step, черный"
61054740,"Штатив трипод Tripod 330A для фотоаппаратов, в..."
...,...
820128810,"Комплект 2 шт, Чернила Cactus CS-EPT6733B пурп..."
821135769,"Защитное стекло закаленное Xiaomi Redmi 7, Y3 ..."
822095690,Системный блок ЮКОМС 9400-268 (AMD A6-9400 (3....
822101044,Системный блок ЮКОМС 9400-9 (AMD A6-9400 (3.4 ...


In [6]:
# Item names for the test catalogue, indexed by variantid (same layout as train_data).
test_data = pd.read_parquet('./datasets/test_data.parquet', columns=columns).set_index('variantid')
test_data

Unnamed: 0_level_0,name
variantid,Unnamed: 1_level_1
51201254,Колодка TDM Electric четырехместная без заземл...
77151532,Клавиатура черная с черной рамкой для 25-011879
89664856,"15.6"" Игровой ноутбук Acer Predator Helios 300..."
90701982,Портативная колонка Borofone BR7 Empyreal Spor...
92484118,Аккумулятор для Meizu BA712 ( M6s )
...,...
702785891,Кабель USB - Lightning HOCO X21 PLUS (черно-бе...
704096517,Блок питания для ноутбука Asus f5gl (19V 90W 4...
705874953,Оперативная память HyperX FURY Black DDR4 2666...
706965102,8 ТБ Внутренний жесткий диск Toshiba TOSHIBA N...


In [7]:
# Labelled item pairs; the target column is cast to int so it can be used
# directly as a 0/1 label in the contrastive loss.
train_pairs = (
    pd.read_parquet('./datasets/train_pairs_w_target.parquet')
    .astype({'target': int})
)
train_pairs

Unnamed: 0,target,variantid1,variantid2
0,0,51197862,51198054
1,1,53062686,536165289
2,1,53602615,587809782
3,1,53888651,89598677
4,0,56930698,551526166
...,...,...,...
306535,0,817327230,822083612
306536,0,817560551,818069912
306537,0,817854719,817857267
306538,0,820036017,820037019


In [8]:
# Unlabelled item pairs to score (no target column in this file).
test_pairs = pd.read_parquet('./datasets/test_pairs_wo_target.parquet')
test_pairs

Unnamed: 0,variantid1,variantid2
0,52076340,290590137
1,64525522,204128919
2,77243372,479860557
3,86065820,540678372
4,91566575,258840506
...,...,...
18079,666998614,667074522
18080,670036240,670048449
18081,670284509,684323809
18082,692172005,704805270


In [9]:
# Split the labelled pairs into validation and training subsets using the
# row selectors stored in val_idx.csv / train_idx.csv.
# Order matters: val_pairs must be taken first, because the second statement
# rebinds train_pairs to its own subset.
# NOTE(review): since train_pairs is overwritten, this cell is not idempotent -
# re-run the cell that loads train_pairs before re-running this one.
# NOTE(review): the selector files are presumably boolean masks over the full
# pair table (one row per pair) - confirm against how they were written.
val_pairs = train_pairs[pd.read_csv('./datasets/val_idx.csv', index_col=0).values].copy()
train_pairs = train_pairs[pd.read_csv('./datasets/train_idx.csv', index_col=0).values].copy()

In [15]:
class Args:
    """Hyper-parameters read by the data loaders, the LR schedule and the trainer."""

    # Data loading
    batch_size = 96

    # Schedule lengths, measured in epochs
    epochs = 15
    lr_warmup_epochs = 5

    # Learning rates: peak value, cosine floor, and the warm-up start factor
    # (passed as LinearLR's start_factor, i.e. a multiplier of `lr`)
    lr = 1e-4
    lr_min = 1e-5
    lr_warmup_decay = 0.01


args = Args()

In [16]:
class ItemsDataset(Dataset):
    """Map-style dataset yielding ``(name1, name2, target)`` for each labelled pair.

    Args:
        pairs: DataFrame with at least the columns ``target``, ``variantid1``
            and ``variantid2`` (any order; extra columns are ignored).
        data: DataFrame indexed by variantid with a ``name`` column.

    Raises:
        KeyError: from ``__init__`` if a required column is missing from
            ``pairs`` or ``data``; from ``__getitem__`` if a pair references
            an id that is absent from ``data``.
    """

    # Columns taken from `pairs`, in the order __getitem__ unpacks them.
    PAIR_COLUMNS = ('target', 'variantid1', 'variantid2')

    def __init__(self, pairs, data):
        super().__init__()
        # Select by name instead of relying on the frame's column order, so a
        # reordered or wider `pairs` frame cannot silently swap label and ids.
        self.pairs = pairs[list(self.PAIR_COLUMNS)].values
        self.pairs_len = len(self.pairs)

        self.names = data['name']

    def __len__(self):
        return self.pairs_len

    def __getitem__(self, idx):
        target, id1, id2 = self.pairs[idx, :]
        # .loc makes the lookup explicitly label-based (ids are index labels,
        # not positions).
        return (
            self.names.loc[id1],
            self.names.loc[id2],
            target
        )

In [17]:
def get_data_loader(pairs, data, batch_size, drop_last, shuffle, num_workers=35):
    """Wrap an ItemsDataset built from ``pairs``/``data`` in a DataLoader.

    Args:
        pairs: labelled pair table passed to ItemsDataset.
        data: item table passed to ItemsDataset.
        batch_size: number of pairs per batch.
        drop_last: whether to drop the final incomplete batch.
        shuffle: whether to reshuffle the pairs every epoch.
        num_workers: number of loader worker processes. Defaults to 35, the
            value that used to be hard-coded; lower it on machines with
            fewer CPU cores.

    Returns:
        torch.utils.data.DataLoader over the pairs, with pinned host memory.
    """
    dataset = ItemsDataset(pairs, data)
    data_loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        drop_last=drop_last,
        shuffle=shuffle,
        pin_memory=True
    )
    return data_loader

In [18]:
def get_loaders(args):
    """Build the (train, validation) DataLoader pair from the notebook globals.

    Both loaders read item names from ``train_data`` and use ``args.batch_size``.
    The training loader shuffles and drops the last incomplete batch; the
    validation loader keeps every pair in its original order.
    """
    shared = dict(data=train_data, batch_size=args.batch_size)

    fit_loader = get_data_loader(pairs=train_pairs, drop_last=True, shuffle=True, **shared)
    eval_loader = get_data_loader(pairs=val_pairs, drop_last=False, shuffle=False, **shared)

    return fit_loader, eval_loader

In [19]:
# Build both loaders once at module level (the LaBSE module returns these
# globals from its dataloader hooks) and show the number of batches in each.
train_loader, val_loader = get_loaders(args)
len(train_loader), len(val_loader)

(2128, 1065)

In [20]:
# next(iter(val_loader))

In [21]:
class LaBSE(pl.LightningModule):
    """Siamese LaBSE text encoder trained with a contrastive loss on name pairs.

    Each name is tokenized, encoded by the pretrained 'cointegrated/LaBSE-en-ru'
    model, pooled (mean + max over real tokens), projected to 768 dimensions
    and L2-normalised. Pairs are compared by Euclidean distance between the
    two embeddings; ``1 - distance`` is used as the similarity score.

    Relies on the notebook globals ``args`` (optimizer/schedule settings) and
    ``train_loader`` / ``val_loader`` (returned by the dataloader hooks).
    """

    # Contrastive-loss margin: a non-matching pair contributes to the loss
    # only while its embedding distance is below this value.
    margin = 0.75

    def __init__(self):
        super().__init__()

        self.tokenizer = AutoTokenizer.from_pretrained('cointegrated/LaBSE-en-ru')
        self.model = AutoModel.from_pretrained('cointegrated/LaBSE-en-ru')

        # Projects the concatenated [mean-pool, max-pool] features back to 768 dims.
        self.fc = nn.Linear(768*2, 768)

        # The pretrained embedding layer is kept frozen.
        for param in self.model.embeddings.parameters():
            param.requires_grad = False

        # Per-batch validation distances/labels, pooled into a single AUC in
        # on_validation_epoch_end.
        self._val_dists = []
        self._val_labels = []

    def forward(self, x):
        """Encode a batch of strings into L2-normalised 768-dim embeddings.

        Args:
            x: a string or list of strings accepted by the tokenizer.

        Returns:
            Tensor with one unit-norm embedding row per input string.
        """
        # Use the module's own device rather than a hard-coded 'cuda' so the
        # model also works on CPU or on a non-default GPU.
        encoded_input = self.tokenizer(
            x, padding=True, truncation=True, max_length=256, return_tensors='pt'
        ).to(self.device)
        model_output = self.model(**encoded_input)
        token_states = model_output[0]

        # Pool over real tokens only. Pooling over padded positions as well
        # makes an item's embedding depend on the longest name that happens
        # to share its batch.
        token_mask = encoded_input['attention_mask'].unsqueeze(-1).bool()
        weights = token_mask.to(token_states.dtype)
        mean_pooled = (token_states * weights).sum(1) / weights.sum(1).clamp(min=1.0)
        max_pooled = token_states.masked_fill(
            ~token_mask, torch.finfo(token_states.dtype).min
        ).max(1)[0]

        last_hidden = torch.cat([mean_pooled, max_pooled], -1)
        embeddings = self.fc(last_hidden)
        embeddings = torch.nn.functional.normalize(embeddings)
        return embeddings

    def configure_optimizers(self):
        """AdamW with linear warm-up followed by cosine annealing, stepped per epoch."""
        optimizer = torch.optim.AdamW(
            self.parameters(), lr=args.lr, betas=(0.9, 0.999), weight_decay=0.05
        )
        main_lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=args.epochs - args.lr_warmup_epochs, eta_min=args.lr_min
        )
        warmup_lr_scheduler = torch.optim.lr_scheduler.LinearLR(
            optimizer, start_factor=args.lr_warmup_decay, total_iters=args.lr_warmup_epochs
        )
        lr_scheduler = torch.optim.lr_scheduler.SequentialLR(
            optimizer, schedulers=[warmup_lr_scheduler, main_lr_scheduler], milestones=[args.lr_warmup_epochs]
        )
        # A single optimizer/scheduler pair is returned as one dict; the
        # one-element tuple wrapper used previously was unnecessary.
        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": lr_scheduler,
                "interval": "epoch",
                "frequency": 1,
                "strict": True,
            },
        }

    def _pair_distances(self, x1, x2):
        """Embed both sides of a batch of pairs and return (emb1, emb2, distances)."""
        out1 = self.forward(x1)
        out2 = self.forward(x2)
        dists = nn.PairwiseDistance()(out1, out2)
        return out1, out2, dists

    def _contrastive_loss(self, dists, labels):
        """Mean contrastive loss: pull matches together, push non-matches past the margin."""
        match_term = labels * torch.pow(dists, 2)
        mismatch_term = (1 - labels) * torch.pow(torch.clamp(self.margin - dists, min=0.0), 2)
        return torch.mean(match_term + mismatch_term)

    def training_step(self, batch, batch_idx):
        """Compute and log the contrastive loss for one training batch."""
        x1, x2, labels = batch
        _, _, dists = self._pair_distances(x1, x2)
        loss = self._contrastive_loss(dists, labels)
        # batch_size is passed explicitly because the batch mixes lists of
        # strings with a tensor.
        self.log(
            "train_loss", loss, on_step=False, logger=False, on_epoch=True,
            prog_bar=True, batch_size=labels.shape[0]
        )
        return loss

    def validation_step(self, batch, batch_idx):
        """Log the validation loss and stash distances/labels for the epoch-level AUC."""
        x1, x2, labels = batch
        _, _, dists = self._pair_distances(x1, x2)
        loss = self._contrastive_loss(dists, labels)
        self.log(
            "val_loss", loss, logger=False, on_epoch=True,
            prog_bar=True, batch_size=labels.shape[0]
        )

        # ROC AUC is a ranking metric: averaging per-batch AUCs is not the AUC
        # of the validation set, and a batch with a single class has no AUC at
        # all. Collect the raw values and score them once per epoch instead.
        self._val_dists.append(dists.detach().float().cpu())
        self._val_labels.append(labels.detach().cpu())

    def on_validation_epoch_end(self):
        """Compute ROC AUC over every validation pair seen this epoch and log it."""
        if not self._val_labels:
            return
        labels = torch.cat(self._val_labels).numpy()
        scores = 1 - torch.cat(self._val_dists).numpy()
        self._val_labels.clear()
        self._val_dists.clear()

        try:
            auc = roc_auc_score(labels, scores)
        except ValueError:
            # roc_auc_score raises ValueError when the metric is undefined
            # (e.g. only one class present). Report NaN rather than a
            # misleading 0, and let any other error surface.
            auc = float('nan')

        self.log("val_auc", auc, logger=False, prog_bar=True)

    def train_dataloader(self):
        """Return the notebook-level training loader."""
        return train_loader

    def val_dataloader(self):
        """Return the notebook-level validation loader."""
        return val_loader

    def predict_step(self, batch, batch_idx):
        """Return ``[emb1 | emb2 | similarity]`` rows for a batch of pairs (on CPU).

        Only the first two batch elements (the two name lists) are used, so
        batches with or without a trailing label element are both accepted.
        """
        x1, x2 = batch[0], batch[1]
        out1, out2, dists = self._pair_distances(x1, x2)
        return torch.cat([out1, out2, (1 - dists).unsqueeze(-1)], dim=1).detach().cpu()

In [22]:
# Instantiate the module; this loads the pretrained 'cointegrated/LaBSE-en-ru'
# tokenizer and encoder weights.
model = LaBSE()

Some weights of the model checkpoint at cointegrated/LaBSE-en-ru were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [23]:
# Display the module tree to inspect the encoder layers and the projection head.
model

LaBSE(
  (model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(55083, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tru

In [24]:
# NOTE(review): this import sits outside the notebook's top import cell;
# consider moving it there so dependencies are visible in one place.
import torchinfo

# Parameter counts per layer; frozen (non-trainable) layers are shown in parentheses.
torchinfo.summary(model)

Layer (type:depth-idx)                                  Param #
LaBSE                                                   --
├─BertModel: 1-1                                        --
│    └─BertEmbeddings: 2-1                              --
│    │    └─Embedding: 3-1                              (42,303,744)
│    │    └─Embedding: 3-2                              (393,216)
│    │    └─Embedding: 3-3                              (1,536)
│    │    └─LayerNorm: 3-4                              (1,536)
│    │    └─Dropout: 3-5                                --
│    └─BertEncoder: 2-2                                 --
│    │    └─ModuleList: 3-6                             85,054,464
│    └─BertPooler: 2-3                                  --
│    │    └─Linear: 3-7                                 590,592
│    │    └─Tanh: 3-8                                   --
├─Linear: 1-2                                           590,592
Total params: 128,935,680
Trainable params: 86,235,648
Non-traina

In [None]:
# Single-GPU mixed-precision training; validation runs after every epoch.
# Nothing is persisted: logging and checkpointing are both switched off.
trainer = pl.Trainer(
    logger=False, # CSVLogger('./'),
    enable_checkpointing=False,

    accelerator='gpu',
    devices=1,
    # NOTE(review): the 'advanced' profiler instruments every hook and adds
    # overhead; it looks like a debugging leftover - remove for real runs.
    profiler='advanced',
    # "16-mixed" is the Lightning 2.x spelling of fp16 automatic mixed
    # precision; the bare "16" is the legacy alias that triggers a warning.
    precision="16-mixed",
    check_val_every_n_epoch=1,
    max_epochs=args.epochs
)

# The dataloaders come from the module's train_dataloader/val_dataloader hooks.
trainer.fit(model)

  rank_zero_warn(
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name  | Type      | Params
------------------------------------
0 | model | BertModel | 128 M 
1 | fc    | Linear    | 590 K 
------------------------------------
86.2 M    Trainable params
42.7 M    Non-trainable params
128 M     Total params
515.743   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [None]:
# Run predict_step over the VALIDATION loader. Despite the variable name,
# these are validation-set predictions, not predictions for test_pairs.
# The result is a list with one CPU tensor per batch.
test_preds = trainer.predict(model, val_loader)

In [None]:
# Overall validation ROC AUC. The last column of every prediction batch holds
# the similarity score (1 - distance) written by predict_step.
val_scores = np.concatenate([batch_pred[:, -1].numpy() for batch_pred in test_preds])
roc_auc_score(val_pairs['target'], val_scores)

In [None]:
# 