In [1]:
import gc
import os

import numpy as np
import pandas as pd
import random
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset

import pytorch_lightning as pl

import torch.nn.functional as F

from sklearn.metrics import roc_auc_score

In [2]:
torch.__version__, torch.cuda.is_available()

('2.0.1+cu118', True)

In [3]:
pl.seed_everything(56)

Global seed set to 56


56

In [4]:
columns = ['variantid', 'main_pic_embeddings_resnet_v1', 'name_bert_64']
train_data = pd.read_parquet('./datasets/train_data.parquet', columns=columns).set_index('variantid')
test_data = pd.read_parquet('./datasets/test_data.parquet', columns=columns).set_index('variantid')
train_data

Unnamed: 0_level_0,main_pic_embeddings_resnet_v1,name_bert_64
variantid,Unnamed: 1_level_1,Unnamed: 2_level_1
51195767,"[[0.04603629, 0.18839523, -0.09973055, -0.6636...","[-0.47045058, 0.67237014, 0.48984158, -0.54485..."
53565809,"[[1.1471839, -0.665361, 0.7745614, 0.26716197,...","[-0.6575592, 0.6522429, 0.5426037, -0.54347897..."
56763357,"[[-0.90570974, 1.0296293, 1.0769907, 0.27746, ...","[-0.7384308, 0.70784587, 0.3012653, -0.3583719..."
56961772,"[[0.13133773, -0.5577079, 0.32498044, 0.191717...","[-0.44812852, 0.5283565, 0.28981736, -0.506841..."
61054740,"[[0.21696381, 0.10989461, -0.08012986, 0.69186...","[-0.72692573, 0.75206333, 0.37740713, -0.52502..."
...,...,...
820128810,"[[-1.4492652, -0.80129164, -0.12344764, 0.7194...","[-0.8253241, 0.6785133, 0.53978086, -0.4888316..."
821135769,"[[0.012127608, -0.8534423, 0.5415518, -0.44912...","[-0.7413257, 0.46105132, 0.5639801, -0.5462132..."
822095690,"[[0.4248176, -0.15944786, -0.22844064, 0.42768...","[-0.49261805, 0.56726897, 0.7037877, -0.697246..."
822101044,"[[0.4248176, -0.15944786, -0.22844064, 0.42768...","[-0.44051006, 0.54029673, 0.63768685, -0.68040..."


In [5]:
train_pairs = pd.read_parquet('./datasets/train_pairs.parquet')
test_pairs = pd.read_parquet('./datasets/test_pairs_wo_target.parquet')
train_pairs['target'] = train_pairs['target'].astype(int)
train_pairs

Unnamed: 0,target,variantid1,variantid2
0,0,51197862,51198054
1,1,53062686,536165289
2,1,53602615,587809782
3,1,53888651,89598677
4,0,56930698,551526166
...,...,...,...
306535,0,817327230,822083612
306536,0,817560551,818069912
306537,0,817854719,817857267
306538,0,820036017,820037019


In [6]:
from sklearn.model_selection import train_test_split

train_pairs, val_pairs = train_test_split(train_pairs, test_size=1/3, random_state=56)

In [7]:
class Args:
    batch_size = 128
    
args = Args()

In [8]:
class ItemsDataset(Dataset):
    def __init__(self, pairs, data):
        super().__init__()
        self.pairs = pairs.values
        self.main_pic_embs = data['main_pic_embeddings_resnet_v1']
        self.name_embs = data['name_bert_64']
        self.pairs_len = len(self.pairs)
        
    def __len__(self):
        return self.pairs_len

    def __getitem__(self, idx):
        target, id1, id2 = self.pairs[idx, :]
        return (
            torch.tensor(np.concatenate([self.main_pic_embs[id1][0], self.name_embs[id1]])),
            torch.tensor(np.concatenate([self.main_pic_embs[id2][0], self.name_embs[id2]])),
            target
        )

In [9]:
def get_data_loader(pairs, data, batch_size, drop_last, shuffle):
    dataset = ItemsDataset(pairs, data)
    data_loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        num_workers=0,
        drop_last=drop_last,
        shuffle=shuffle,
        pin_memory=True
    )
    return data_loader

In [10]:
def get_loaders(args):
    train_loader = get_data_loader(
        pairs=train_pairs,
        data=train_data,
        batch_size=args.batch_size,
        drop_last=True,
        shuffle=True
    )
    
    val_loader = get_data_loader(
        pairs=val_pairs,
        data=train_data,
        batch_size=args.batch_size,
        drop_last=False,
        shuffle=False
    )
    return train_loader, val_loader

In [11]:
train_loader, val_loader = get_loaders(args)
len(train_loader), len(val_loader)

(1596, 799)

In [12]:
next(iter(train_loader))

[tensor([[ 0.1161, -1.0675,  0.3716,  ...,  0.2046,  0.3852, -0.5351],
         [-0.0278,  0.0919,  0.4331,  ...,  0.5052,  0.5721, -0.5052],
         [ 0.2479, -0.3031,  0.0908,  ...,  0.4779,  0.7032, -0.4154],
         ...,
         [-0.2100, -0.5099,  0.3891,  ...,  0.6622,  0.6652, -0.5941],
         [ 0.6944,  0.5330, -1.5235,  ...,  0.6289,  0.6613, -0.2806],
         [ 0.1003, -0.4157, -0.1716,  ...,  0.7656,  0.6723, -0.6463]]),
 tensor([[ 0.0578, -1.3894,  0.4709,  ...,  0.2104,  0.4125, -0.5081],
         [-0.0278,  0.0919,  0.4331,  ...,  0.5052,  0.5721, -0.5052],
         [ 0.2479, -0.3031,  0.0908,  ...,  0.4168,  0.6963, -0.5191],
         ...,
         [-0.1911, -0.5303,  0.3174,  ...,  0.6620,  0.6665, -0.7083],
         [ 0.7403,  0.3063, -1.2884,  ...,  0.6449,  0.6292, -0.1368],
         [ 0.1471, -0.1464, -0.2291,  ...,  0.7421,  0.6787, -0.5781]]),
 tensor([0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1,
         0, 0, 1, 0, 0, 0, 1, 0, 0,

In [16]:
class Net(pl.LightningModule):
    margin = 0.5
    
    def __init__(self):
        super(Net, self).__init__()

        features_num = 128 + 64
        embedding_size = 256


        self.layer_norm1 = nn.LayerNorm(features_num)
        self.linear1 = nn.Linear(features_num, embedding_size, bias=False)
        self.norm = nn.BatchNorm1d(embedding_size)

        self.linear2 = nn.Linear(embedding_size, embedding_size, bias=False)
        self.layer_norm2 = nn.LayerNorm(embedding_size)

        

        
    def forward(self, x):
        x = F.relu(self.linear1(self.layer_norm1(x)))
        x = self.layer_norm2(self.linear2(self.norm(x)))
        return x
    
    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(
            self.parameters(), lr=3e-4, betas=(0.9, 0.999), weight_decay=0.05
        )
        return (
            {
                "optimizer": optimizer,
            },
        )
    
    def training_step(self, batch, batch_idx):
        self.log('step', batch_idx, logger=True, on_epoch=True)
        x1, x2, labels = batch
        out1 = self.forward(x1)
        out2 = self.forward(x2)
        
        fnorm = torch.norm(out1, p=2, dim=1, keepdim=True)
        out1 = out1.div(fnorm.expand_as(out1))
        fnorm = torch.norm(out2, p=2, dim=1, keepdim=True)
        out2 = out2.div(fnorm.expand_as(out2))
        
        dists = nn.PairwiseDistance()(out1, out2)
        # loss = (1 - labels - dists).abs().mean()
        loss = (1 - labels) * torch.pow(dists, 2) + (labels) * torch.pow(torch.clamp(self.margin - dists, min=0.0), 2)
        loss = torch.mean(loss)
        self.log("train_loss", loss, logger=True, on_epoch=True)
        return loss
    
    def validation_step(self, batch, batch_idx):        
        x1, x2, labels = batch
        out1 = self.forward(x1)
        out2 = self.forward(x2)
        
        fnorm = torch.norm(out1, p=2, dim=1, keepdim=True)
        out1 = out1.div(fnorm.expand_as(out1))
        fnorm = torch.norm(out2, p=2, dim=1, keepdim=True)
        out2 = out2.div(fnorm.expand_as(out2))
        
        dists = nn.PairwiseDistance()(out1, out2)
        # loss = (1 - labels - dists).abs().mean()
        loss = (1 - labels) * torch.pow(dists, 2) + (labels) * torch.pow(torch.clamp(self.margin - dists, min=0.0), 2)
        loss = torch.mean(loss)
        self.log("val_loss", loss, logger=True, on_epoch=True, prog_bar=True)   

        auc = roc_auc_score(labels.detach().cpu(), dists.detach().cpu())

        self.log("val_auc", auc, logger=True, on_epoch=True, prog_bar=True)
        
    def train_dataloader(self):
        return train_loader

    def val_dataloader(self):
        return val_loader

In [17]:
model = Net()

In [18]:
import torchinfo

torchinfo.summary(model)

Layer (type:depth-idx)                   Param #
Net                                      --
├─LayerNorm: 1-1                         384
├─Linear: 1-2                            49,152
├─BatchNorm1d: 1-3                       512
├─Linear: 1-4                            65,536
├─LayerNorm: 1-5                         512
Total params: 116,096
Trainable params: 116,096
Non-trainable params: 0

In [19]:
trainer = pl.Trainer(
    accelerator='gpu', 
    devices=1,
    profiler='advanced',
    precision=16,
    check_val_every_n_epoch=1,
    max_epochs=10
)

trainer.fit(model)

  rank_zero_warn(
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type        | Params
--------------------------------------------
0 | layer_norm1 | LayerNorm   | 384   
1 | linear1     | Linear      | 49.2 K
2 | norm        | BatchNorm1d | 512   
3 | linear2     | Linear      | 65.5 K
4 | layer_norm2 | LayerNorm   | 512   
--------------------------------------------
116 K     Trainable params
0         Non-trainable params
116 K     Total params
0.464     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]



Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.
FIT Profiler Report
Profile stats for: [LightningModule]Net.configure_callbacks
         7 function calls in 0.000 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.000    0.000 contextlib.py:139(__exit__)
        1    0.000    0.000    0.000    0.000 {built-in method builtins.next}
        1    0.000    0.000    0.000    0.000 profiler.py:54(profile)
        1    0.000    0.000    0.000    0.000 advanced.py:66(stop)
        1    0.000    0.000    0.000    0.000 module.py:889(configure_callbacks)
        1    0.000    0.000    0.000    0.000 {method 'get' of 'dict' objects}
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}



Profile stats for: [LightningModule]Net.prepare_data
         7 function calls in 0.000 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall fi

In [20]:
dists, labels = [], []
for x1, x2, target in tqdm(val_loader):
    out1 = model.forward(x1).detach().cpu()
    out2 = model.forward(x2).detach().cpu()
    
    fnorm = torch.norm(out1, p=2, dim=1, keepdim=True)
    out1 = out1.div(fnorm.expand_as(out1))
    fnorm = torch.norm(out2, p=2, dim=1, keepdim=True)
    out2 = out2.div(fnorm.expand_as(out2))
        
    dists.append(
        nn.PairwiseDistance()(out1, out2).numpy()
    )
    labels.append(target.numpy())
dists = np.concatenate(dists)
labels = np.concatenate(labels)

  0%|          | 0/799 [00:00<?, ?it/s]

In [21]:
dists

array([0.06061221, 0.266888  , 0.14315976, ..., 0.25417012, 0.15262319,
       0.09018481], dtype=float32)

In [22]:
labels

array([0, 1, 0, ..., 1, 0, 1], dtype=int64)

In [23]:
roc_auc_score(labels, dists)

0.8277732715835977

In [24]:
train_dataset = ItemsDataset(train_pairs, train_data)
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=args.batch_size,
    num_workers=0,
    drop_last=False,
    shuffle=False,
    pin_memory=True
)

In [25]:
features, labels = [], []
for x1, x2, target in tqdm(train_loader):
    out1 = model.forward(x1).detach().cpu()
    out2 = model.forward(x2).detach().cpu()
    
    fnorm = torch.norm(out1, p=2, dim=1, keepdim=True)
    out1 = out1.div(fnorm.expand_as(out1))
    fnorm = torch.norm(out2, p=2, dim=1, keepdim=True)
    out2 = out2.div(fnorm.expand_as(out2))
    
    features.append(np.concatenate([out1, out2, nn.PairwiseDistance()(out1, out2).numpy().reshape((-1, 1))], axis=1))
    labels.append(target.numpy())
features = np.concatenate(features)
labels = np.concatenate(labels)    

  0%|          | 0/1597 [00:00<?, ?it/s]

In [26]:
val_features, val_labels = [], []
for x1, x2, target in tqdm(val_loader):
    out1 = model.forward(x1).detach().cpu()
    out2 = model.forward(x2).detach().cpu()
    
    fnorm = torch.norm(out1, p=2, dim=1, keepdim=True)
    out1 = out1.div(fnorm.expand_as(out1))
    fnorm = torch.norm(out2, p=2, dim=1, keepdim=True)
    out2 = out2.div(fnorm.expand_as(out2))
    
    val_features.append(np.concatenate([out1, out2, nn.PairwiseDistance()(out1, out2).numpy().reshape((-1, 1))], axis=1))
    val_labels.append(target.numpy())
val_features = np.concatenate(val_features)
val_labels = np.concatenate(val_labels)

  0%|          | 0/799 [00:00<?, ?it/s]

In [27]:
from catboost import CatBoostClassifier, Pool, cv

In [28]:
train_pool = Pool(
    data=features,
    label=labels,
)

val_pool = Pool(
    data=val_features,
    label=val_labels,
)

In [36]:
params = {
    'iterations' : 1777,
    'loss_function': 'CrossEntropy',
    'custom_metric': ['AUC'],
    'task_type': 'CPU',
}

In [37]:
model_cb = CatBoostClassifier(**params, random_seed=56)
model_cb.fit(train_pool, eval_set=val_pool, verbose=False, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x24e1ac57d60>

In [38]:
np.max(model_cb.get_evals_result()['validation']['AUC'])

0.8503593799412676