# Импорты и пути к моделям

In [2]:
import gc
import json
import warnings

import lightning.pytorch as pl
import numpy as np
import pandas as pd
import torch
from catboost import CatBoostClassifier, Pool
from lightning.pytorch.callbacks import ModelCheckpoint
from nltk.tokenize import RegexpTokenizer
from sklearn.metrics import roc_auc_score
from thefuzz import fuzz
from torch import nn
from torch.utils.data import Dataset
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModel
from transformers import DebertaV2Config, DebertaV2Model

from utils import get_name_labse_embs, text_preprocess

In [3]:
# Checkpoint file with the state dict of the fine-tuned LaBSE module.
PATH_TO_LABSE = "./models/LaBSE.pt"
# Checkpoint file with the state dict of the multimodal network.
PATH_TO_MULTIMODAL = "./models/Multi.pt"

In [None]:
pl.seed_everything(56, workers=True)

# Получаем эмбеддинги названий от LaBSE

In [4]:
named_data = pd.read_parquet('./datasets/test_data.parquet', columns=["variantid", "name"])

In [5]:
named_data.head()

Unnamed: 0,variantid,name
0,51195767,"Удлинитель Партнер-Электро ПВС 2х0,75 ГОСТ,6A,..."
1,53565809,Магнитный кабель USB 2.0 A (m) - USB Type-C (m...
2,56763357,"Набор микропрепаратов Konus 25: ""Клетки и ткан..."
3,56961772,"Мобильный телефон BQ 1848 Step, черный"
4,61054740,"Штатив трипод Tripod 330A для фотоаппаратов, в..."


In [None]:
# Embed every product name with the stock "cointegrated/LaBSE-en-ru" model through the
# project helper from `utils`; the device is hard-coded to CUDA.
name_labse_768 = get_name_labse_embs("cointegrated/LaBSE-en-ru", sentences=list(named_data["name"]), device=torch.device("cuda"))
# NOTE(review): the result is pickled here, but the boosting section later reads
# './datasets/name_labse_embs_test.parquet' — confirm which artifact is the intended one.
name_labse_768.to_pickle('./datasets/name_labse_embs_test.pickle')

# Получаем эмбеддинги названий от дообученной LaBSE

In [None]:
# Hyperparameter container for the fine-tuned LaBSE section.
# In the code visible here only `batch_size` (DataLoader) and `epochs` (Trainer) are read;
# the learning-rate fields are presumably leftovers from the training notebook.
class Args:
    batch_size = 96
    epochs = 5
    lr = 1e-5
    lr_warmup_epochs = 5
    lr_warmup_decay = 0.01
    lr_min = 1e-5

# Module-level instance used by the cells below.
args = Args()

In [None]:
class ItemsDataset(Dataset):
    """Pair dataset yielding the preprocessed names of both items plus the target.

    Rows of `pairs` are read positionally as (target, id1, id2); `data` must
    provide a `name` column and be indexed by the ids that appear in `pairs`.
    """

    def __init__(self, pairs, data):
        super().__init__()
        pair_rows = pairs.values
        self.pairs = pair_rows
        self.pairs_len = len(pair_rows)

        # Preprocess all names once up front so __getitem__ is a plain lookup.
        self.names = data['name'].apply(text_preprocess)

    def __len__(self):
        """Return the number of pairs."""
        return self.pairs_len

    def __getitem__(self, idx):
        """Return (name of first item, name of second item, target) for pair `idx`."""
        row = self.pairs[idx, :]
        target, first_id, second_id = row
        first_name = self.names[first_id]
        second_name = self.names[second_id]
        return first_name, second_name, target

In [None]:
class LaBSE(pl.LightningModule):
    """LaBSE sentence encoder followed by a linear projection, used here for inference.

    `forward` maps a batch of raw strings to 768-dimensional embeddings and
    `predict_step` embeds both sides of a pair and appends a similarity score.
    """

    # Not read anywhere in this class; presumably the contrastive-loss margin used
    # during training — kept so the class matches the training definition.
    margin = 0.75

    def __init__(self):
        super().__init__()

        self.tokenizer = AutoTokenizer.from_pretrained('cointegrated/LaBSE-en-ru')
        self.model = AutoModel.from_pretrained('cointegrated/LaBSE-en-ru')

        self.fc = nn.Linear(768, 768)

    def forward(self, x):
        """Embed a batch of strings.

        Args:
            x: batch of texts accepted by the Hugging Face tokenizer.

        Returns:
            Tensor with one 768-dimensional row per input text.
        """
        # Move the tokenized batch to wherever the module lives instead of a
        # hard-coded 'cuda', so the model also runs on CPU or a non-default GPU.
        encoded_input = self.tokenizer(
            x, padding=True, truncation=True, max_length=256, return_tensors='pt'
        ).to(self.device)
        model_output = self.model(**encoded_input)

        # L2-normalize the pooled output before the projection layer.
        embeddings = torch.nn.functional.normalize(model_output.pooler_output)
        embeddings = self.fc(embeddings)
        return embeddings

    def predict_step(self, batch, batch_idx):
        """Return `[emb1 | emb2 | 1 - distance]` per pair as a detached CPU tensor."""
        x1, x2, labels = batch
        out1 = self.forward(x1)
        out2 = self.forward(x2)

        dists = nn.PairwiseDistance()(out1, out2)
        return torch.cat([out1, out2, (1 - dists).unsqueeze(-1)], dim=1).detach().cpu()

In [None]:
model = LaBSE()

In [None]:
# Load the fine-tuned weights, mapping all tensors to the CPU first.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a trusted source.
model.load_state_dict(torch.load(PATH_TO_LABSE, map_location=torch.device('cpu')))

In [None]:
# Trainer used only for `trainer.predict` in this section: single GPU (device 0),
# mixed 16-bit precision, no logger and no checkpointing.
# NOTE(review): `profiler`, `check_val_every_n_epoch` and `max_epochs` look like leftovers
# from the training setup — confirm they are wanted for inference.
trainer = pl.Trainer(
    logger=False, # CSVLogger('./'),
    enable_checkpointing=False,

    accelerator='gpu',
    devices=[0],
    profiler='advanced',
    precision="16-mixed",
    check_val_every_n_epoch=1,
    max_epochs=args.epochs
)

In [None]:
# Pairs to score (no target column) and the product names indexed by variantid.
test_pairs = pd.read_parquet('./datasets/test_pairs_wo_target.parquet')
test_data = pd.read_parquet('./datasets/test_data.parquet', columns=['variantid', 'name']).set_index('variantid')

In [None]:
# ItemsDataset unpacks each row positionally as (target, id1, id2), so add a dummy
# target and put the columns in exactly that order.
test_pairs['target'] = -1
test_pairs = test_pairs[['target', 'variantid1', 'variantid2']]
test_dataset = ItemsDataset(test_pairs, test_data)
# No shuffling and no dropped tail batch: predictions must line up row-for-row with test_pairs.
test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=args.batch_size,
    num_workers=17,
    drop_last=False,
    shuffle=False,
    pin_memory=True
)

In [None]:
# Stack the per-batch outputs of predict_step ([emb1 | emb2 | 1 - distance]) into one array.
test_features = np.concatenate([pred.numpy() for pred in trainer.predict(model, test_loader)])

In [None]:
# One object slot per product in test_data; filled by variantid label from the pair predictions.
labse_tuned_768 = pd.Series(index=test_data.index, dtype='object', name='labse_tuned_768')
# Columns [0, 768) hold the first item's embedding, [768, 1536) the second item's.
# NOTE(review): products that occur in no pair keep NaN, and an id occurring in several
# pairs is assigned more than once — verify the resulting value is the intended one.
labse_tuned_768[test_pairs.variantid1] = list(test_features[:, :768])
labse_tuned_768[test_pairs.variantid2] = list(test_features[:, 768:768*2])
labse_tuned_768.to_pickle('./datasets/labse_tuned_test.pickle')

# Получаем эмбеддинги от мультимодальной сети

In [None]:
# Reload the full test table (all columns) keyed by variantid.
test_data = pd.read_parquet('./datasets/test_data.parquet')
test_data = test_data.set_index('variantid')
# `categories` is stored as a JSON string; decode it into a dict.
test_data['categories'] = test_data['categories'].map(json.loads)
# Keep only the first element of each main-picture embedding entry.
test_data['main_pic_embeddings_resnet_v1'] = test_data['main_pic_embeddings_resnet_v1'].map(
    lambda embs: embs[0]
)
test_data

In [None]:
test_pairs = pd.read_parquet('./datasets/test_pairs_wo_target.parquet')
test_pairs

In [None]:
# Distinct level-3 categories present in the test data.
test_cat3 = {item_categories['3'] for item_categories in test_data.categories}

In [None]:
colors_mapper = {
 'ярко-синий': 'ярко-синий',
 'ярко-розовый': 'ярко-розовый',
 'ярко-зеленый': 'ярко-зеленый',
 'ярко-желтый': 'ярко-желтый',
 'янтарный': 'янтарный',
 'электрик': 'электрик',
 'экрю': 'экрю',
 'шоколадный': 'шоколадный',
 'черный': 'черный',
 'черно-синий': 'черно-синий',
 'черно-серый': 'черно-серый',
 'черно-красный': 'черно-красный',
 'черно-зеленый': 'черно-зеленый',
 'черн': 'черный',
 'чер': 'черный',
 'циан': 'бирюзовый',
 'цементный': 'цементный',
 'хаки': 'хаки',
 'фуксия': 'фуксия',
 'фисташковый': 'фисташковый',
 'фиолетовый': 'фиолетовый',
 'фиолетово-синий': 'фиолетово-синий',
 'фиолет': 'фиолетовый',
 'фиол': 'фиолетовый',
 'фиалковый': 'фиалковый',
 'тыквенный': 'тыквенный',
 'тыква': 'тыквенный',
 'травяной': 'травяной',
 'томатный': 'томатный',
 'тиффани': 'тиффани',
 'терракотовый': 'терракотовый',
 'терракота': 'терракотовый',
 'темно-фиолетовый': 'темно-фиолетовый',
 'темно-синий': 'темно-синий',
 'темно-серый': 'темно-серый',
 'темно-розовый': 'темно-розовый',
 'темно-оранжевый': 'темно-оранжевый',
 'темно-оливковый': 'темно-оливковый',
 'темно-красный': 'темно-красный',
 'темно-коричневый': 'темно-коричневый',
 'темно-зеленый': 'темно-зеленый',
 'темно-голубой': 'темно-голубой',
 'темно-бирюзовый': 'темно-бирюзовый',
 'темно-бежевый': 'темно-бежевый',
 'сливовый': 'сливовый',
 'сиреневый': 'сиреневый',
 'синий': 'синий',
 'сине-зеленый': 'сине-зеленый',
 'син': 'синий',
 'серый': 'серый',
 'серовато-зеленый': 'серовато-зеленый',
 'серо-коричневый': 'серо-коричневый',
 'серо-зеленый': 'серо-зеленый',
 'серо-голубой': 'серо-голубой',
 'серо-бежевый': 'серо-бежевый',
 'серебряный': 'серебряный',
 'серебристый': 'серебристый',
 'серебристо-серый': 'серебристо-серый',
 'сер': 'серый',
 'сепия': 'сепия',
 'светло-фиолетовый': 'светло-фиолетовый',
 'светло-синий': 'светло-синий',
 'светло-серый': 'светло-серый',
 'светло-розовый': 'светло-розовый',
 'светло-пурпурный': 'светло-пурпурный',
 'светло-коричневый': 'светло-коричневый',
 'светло-золотистый': 'светло-золотистый',
 'светло-зеленый': 'светло-зеленый',
 'светло-желтый': 'светло-желтый',
 'светло-голубой': 'светло-голубой',
 'светло-бирюзовый': 'светло-бирюзовый',
 'светло-бежевый': 'светло-бежевый',
 'сапфировый': 'сапфировый',
 'салатовый': 'салатовый',
 'рыжий': 'рыжий',
 'розовый': 'розовый',
 'розово-фиолетовый': 'розово-фиолетовый',
 'розово-золотой': 'розово-золотой',
 'разноцветный': 'разноцветный',
 'пурпурный': 'пурпурный',
 'пурпурно-фиолетовый': 'пурпурно-фиолетовый',
 'песочный': 'песочный',
 'перу': 'перу',
 'персиковый': 'персиковый',
 'охра': 'охра',
 'орхидея': 'орхидея',
 'оранжевый': 'оранжевый',
 'оранжево-розовый': 'оранжево-розовый',
 'оливковый': 'оливковый',
 'огненно-красный': 'огненно-красный',
 'нефритовый': 'нефритовый',
 'небесный': 'небесный',
 'мятный': 'мятный',
 'мятно-зеленый': 'мятно-зеленый',
 'мята': 'мятный',
 'мультиколор': 'мультиколор',
 'морковный': 'морковный',
 'молочный': 'молочный',
 'многоцветный': 'многоцветный',
 'медный': 'медный',
 'марсала': 'марсала',
 'малиновый': 'малиновый',
 'малиново-красный': 'малиново-красный',
 'малахитовый': 'малахитовый',
 'льняной': 'льняной',
 'лимонный': 'лимонный',
 'лиловый': 'лиловый',
 'латунный': 'латунный',
 'лаймовый': 'лаймовый',
 'лайм': 'лаймовый',
 'лазурный': 'лазурный',
 'лавандовый': 'лавандовый',
 'лаванда': 'лавандовый',
 'кремовый': 'кремовый',
 'красный': 'красный',
 'красновато-коричневый': 'красновато-коричневый',
 'красно-оранжевый': 'красно-оранжевый',
 'красно-коричневый': 'красно-коричневый',
 'красн': 'красный',
 'крас': 'красный',
 'кофейный': 'кофейный',
 'космос': 'космос',
 'коричневый': 'коричневый',
 'коричнево-красный': 'коричнево-красный',
 'коричнево-бежевый': 'коричнево-бежевый',
 'коралловый': 'коралловый',
 'кораллово-красный': 'кораллово-красный',
 'кобальтовый': 'кобальтовый',
 'кирпичный': 'кирпичный',
 'кирпично-красный': 'кирпично-красный',
 'кварцевый': 'кварцевый',
 'кардинал': 'кардинал',
 'канареечный': 'канареечный',
 'камуфляжный': 'камуфляжный',
 'индиго': 'индиго',
 'изумрудный': 'изумрудный',
 'изумрудно-зеленый': 'изумрудно-зеленый',
 'изумруд': 'изумрудный',
 'золотой': 'золотой',
 'золотистый': 'золотистый',
 'зеленый': 'зеленый',
 'зелено-серый': 'зелено-серый',
 'зел': 'зеленый',
 'жемчужно-белый': 'жемчужно-белый',
 'желтый': 'желтый',
 'желто-розовый': 'желто-розовый',
 'желто-зеленый': 'желто-зеленый',
 'желт': 'желтый',
 'гусеница': 'гусеница',
 'грушевый': 'грушевый',
 'графит': 'графит',
 'гранитный': 'гранитный',
 'гранатовый': 'гранатовый',
 'горчичный': 'горчичный',
 'голубой': 'голубой',
 'голуб': 'голубой',
 'глициния': 'глициния',
 'вишня': 'вишневый',
 'вишневый': 'вишневый',
 'васильковый': 'васильковый',
 'ванильный': 'ванильный',
 'бурый': 'бурый',
 'бронзовый': 'бронзовый',
 'бордовый': 'бордовый',
 'бордо': 'бордовый',
 'болотный': 'болотный',
 'бледно-розовый': 'бледно-розовый',
 'бледно-пурпурный': 'бледно-пурпурный',
 'бледно-желтый': 'бледно-желтый',
 'бирюзовый': 'бирюзовый',
 'бирюзово-зеленый': 'бирюзово-зеленый',
 'белый': 'белый',
 'белоснежный': 'белоснежный',
 'бело-зеленый': 'бело-зеленый',
 'бел': 'белый',
 'бежевый': 'бежевый',
 'бежево-серый': 'бежево-серый',
 'бежево-розовый': 'бежево-розовый',
 'баклажановый': 'баклажановый',
 'антрацитовый': 'антрацитовый',
 'аметистовый': 'аметистовый',
 'алый': 'алый',
 'аквамариновый': 'аквамариновый',
 'аква': 'аква',
 'абрикосовый': 'абрикосовый',
 'yellow': 'желтый',
 'wine': 'wine',
 'white': 'белый',
 'violet': 'фиолетовый',
 'vanilla': 'ванильный',
 'ultramarine': 'ultramarine',
 'turquoise': 'бирюзовый',
 'tomato': 'томатный',
 'teal': 'teal',
 'tan': 'tan',
 'snow': 'snow',
 'silver': 'серебряный',
 'sapphire': 'сапфировый',
 'red': 'красный',
 'purple': 'фиолетовый',
 'pink': 'розовый',
 'peru': 'перу',
 'pear': 'грушевый',
 'peach': 'персиковый',
 'orchid': 'орхидея',
 'orange': 'оранжевый',
 'olive': 'оливковый',
 'navy': 'navy',
 'magenta': 'пурпурный',
 'linen': 'linen',
 'lime': 'лаймовый',
 'lilac': 'сиреневый',
 'lemon': 'lemon',
 'lavender': 'лавандовый',
 'khaki': 'хаки',
 'jade': 'нефритовый',
 'ivory': 'ivory',
 'indigo': 'индиго',
 'grey': 'серый',
 'green': 'зеленый',
 'gray': 'серый',
 'gold': 'золотой',
 'fuchsia': 'фуксия',
 'flax': 'flax',
 'emerald': 'emerald',
 'denim': 'denim',
 'cyan': 'бирюзовый',
 'cream': 'кремовый',
 'corn': 'corn',
 'coral': 'коралловый',
 'copper': 'медный',
 'cobalt': 'кобальтовый',
 'chocolate': 'шоколадный',
 'burgundy': 'бордовый',
 'buff': 'buff',
 'brown': 'коричневый',
 'bronze': 'бронзовый',
 'brass': 'латунный',
 'blue': 'голубой',
 'blond': 'blond',
 'black': 'черный',
 'beige': 'бежевый',
 'azure': 'лазурный',
 'aquamarine': 'аквамариновый',
 'aqua': 'аквамариновый',
 'amethyst': 'аметистовый',
 'amber': 'янтарный'
}

In [None]:
# Map every canonical colour name to an integer id (0 is left free for padding).
# NOTE(review): a canonical colour that occurs more than once among the mapper values is
# re-assigned to the *current* len + 1 each time, so ids are neither contiguous nor unique
# (e.g. the colours hit by the last alias entries all end up with the same id). Do not
# "fix" this here: presumably the checkpoint was trained with exactly these ids — confirm.
color_vocab = {}
for canonical_color in colors_mapper.values():
    color_vocab[canonical_color] = len(color_vocab) + 1

In [None]:
# Hyperparameter container for the multimodal section (rebinds the earlier `Args`/`args`).
# In the code visible here only `batch_size` (DataLoader) and `epochs` (Trainer) are read.
class Args:
    batch_size = 96
    epochs = 10
    lr = 1e-5

# Module-level instance used by the cells below.
args = Args()

In [None]:
class ItemsDataset(Dataset):
    """Pair dataset for the multimodal network.

    Rows of `pairs` are read positionally as (target, id1, id2). `data` must be
    indexed by those ids and provide the columns `main_pic_embeddings_resnet_v1`,
    `categories`, `color_parsed`, `name` and `name_bert_64`.

    Each sample is an 11-tuple: (category id, padded colour ids, name, picture
    embedding, name_bert embedding) for the first item, the same five fields for
    the second item, then the target.
    """

    # Every colour list is truncated / zero-padded to this length.
    MAX_COLORS = 17

    def __init__(self, pairs, data):
        super().__init__()
        self.pairs = pairs.values
        self.pairs_len = len(self.pairs)

        self.main_pic_embs = data['main_pic_embeddings_resnet_v1']

        # Level-3 category -> integer id; categories unknown to the map collapse to 'rest'.
        categories = data['categories'].copy().apply(lambda x: x['3'])
        categories[~categories.isin(categories_map)] = 'rest'
        self.categories = categories.apply(lambda v: categories_map[v])

        def encode_colors(colors):
            """Map raw colour names to vocabulary ids, dropping repeats but keeping order."""
            if colors is None:
                return []
            ids = [color_vocab[colors_mapper[color]] for color in colors]
            # dict.fromkeys keeps first occurrences in order (linear-time de-duplication).
            return list(dict.fromkeys(ids))

        def pad_colors(colors):
            """Truncate or right-pad with 0 (the padding id) to MAX_COLORS entries."""
            max_colors = self.MAX_COLORS
            if len(colors) > max_colors:
                return colors[:max_colors]
            return colors + [0] * (max_colors - len(colors))

        colors = data['color_parsed'].copy().apply(encode_colors)
        self.colors = colors.apply(pad_colors)

        self.names = data['name'].apply(text_preprocess)

        self.name_bert_embs = data['name_bert_64']

    def __len__(self):
        """Return the number of pairs."""
        return self.pairs_len

    def __getitem__(self, idx):
        """Return the feature tuple of both items of pair `idx` followed by the target."""
        target, id1, id2 = self.pairs[idx, :]
        return (
            self.categories[id1],
            torch.tensor(self.colors[id1]),
            self.names[id1],
            torch.tensor(self.main_pic_embs[id1]),
            torch.tensor(self.name_bert_embs[id1]),

            self.categories[id2],
            torch.tensor(self.colors[id2]),
            self.names[id2],
            # Converted to a tensor like the first item's embedding (it used to be
            # returned raw, making the two sides of the sample inconsistent).
            torch.tensor(self.main_pic_embs[id2]),
            torch.tensor(self.name_bert_embs[id2]),

            target
        )

In [None]:
class MultiModalNet(pl.LightningModule):
    """Multimodal product encoder used here for inference.

    Combines a category embedding, a BiLSTM over colour ids, a LaBSE name
    embedding, the picture embedding and the precomputed name_bert embedding,
    passes the concatenation through one DeBERTa-v2 encoder layer and a small MLP
    neck, and returns an L2-normalized 768-dimensional vector per item.
    """

    # Not read anywhere in this class; presumably the contrastive-loss margin used
    # during training — kept so the class matches the training definition.
    margin = 0.75

    def __init__(self):
        super().__init__()

        # attrs
        self.category_embedding = nn.Embedding(
            num_embeddings=len(categories_map),
            embedding_dim=len(categories_map) // 2,
            padding_idx=None
        )

        # Colour ids go up to len(color_vocab) + 1 and id 0 is padding, hence "+ 2" rows.
        self.color_embedding = nn.Embedding(
            num_embeddings=len(color_vocab) + 2,
            embedding_dim=(len(color_vocab) + 2) // 2,
            padding_idx=0
        )
        self.color_lstm_hidden = 64
        self.color_lstm = nn.LSTM(
            input_size=(len(color_vocab) + 2) // 2,
            hidden_size=self.color_lstm_hidden,
            num_layers=1,
            batch_first=True,
            bidirectional=True
        )

        # name
        self.LaBSE_tokenizer = AutoTokenizer.from_pretrained('cointegrated/LaBSE-en-ru')
        self.LaBSE_model = AutoModel.from_pretrained('cointegrated/LaBSE-en-ru')
        self.LaBSE_fc = nn.Linear(768, 768)

        # net
        # category emb + both LSTM directions + LaBSE name (768)
        # + 128 and 64 (presumably the picture and name_bert embedding sizes — confirm).
        input_size = len(categories_map) // 2 + 2*self.color_lstm_hidden + 768 + 128 + 64
        output_size = 768
        self.bn = nn.BatchNorm1d(input_size)
        self.embedding_dropout = nn.Dropout(p=0.05)

        deberta_cfg = DebertaV2Config(
            hidden_size=input_size,
            num_hidden_layers=1,
            num_attention_heads=1,
            intermediate_size=1024,
        )
        # Only the transformer encoder stack is used; token embeddings are not needed.
        self.deberta = DebertaV2Model(deberta_cfg, ).encoder

        # The neck consumes mean- and max-pooled encoder states concatenated together.
        features_num = 2 * input_size
        embedding_size = (features_num + output_size) // 2
        self.neck = nn.Sequential(
            nn.BatchNorm1d(features_num),
            nn.Linear(features_num, embedding_size, bias=False),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(embedding_size),
            nn.Linear(embedding_size, embedding_size, bias=False),
            nn.BatchNorm1d(embedding_size),
        )

        self.output_layer = nn.Linear(embedding_size, output_size)

    def forward(self, categories, colors, names, pic_embs, name_bert_embs):
        """Encode a batch of items.

        Args:
            categories: integer category ids, one per item.
            colors: padded integer colour ids per item (0 = padding).
            names: batch of item names accepted by the Hugging Face tokenizer.
            pic_embs: picture embeddings, one row per item.
            name_bert_embs: precomputed name embeddings, one row per item.

        Returns:
            L2-normalized tensor with one 768-dimensional row per item.
        """
        categories_output = self.category_embedding(categories)

        colors_emb = self.color_embedding(colors)
        output, (ht, ct) = self.color_lstm(colors_emb)
        # Forward direction read at the last timestep, reverse direction at the first.
        out_forward = output[:, -1, :self.color_lstm_hidden]
        out_reverse = output[:, 0, self.color_lstm_hidden:]
        colors_output = torch.cat([out_forward, out_reverse], 1)

        # Move the tokenized batch to the module's device instead of a hard-coded
        # 'cuda', so the model also runs on CPU or a non-default GPU.
        encoded_input = self.LaBSE_tokenizer(
            names, padding=True, truncation=True, max_length=256, return_tensors='pt'
        ).to(self.device)
        model_output = self.LaBSE_model(**encoded_input)
        embeddings = torch.nn.functional.normalize(model_output.pooler_output)
        names_output = self.LaBSE_fc(embeddings)

        pics_output = torch.nn.functional.normalize(pic_embs)

        names_bert_output = torch.nn.functional.normalize(name_bert_embs)

        x = torch.cat([categories_output, colors_output, names_output, pics_output, names_bert_output], dim=1)
        x = self.bn(x)
        x = self.embedding_dropout(x)
        # Treat each item as a sequence of length 1 for the encoder.
        x = x.unsqueeze(1)
        # The mask is created on the same device as the features.
        attention_mask = torch.ones((x.shape[0], 1), device=x.device)
        last_hidden = self.deberta(x, attention_mask)
        last_hidden = torch.concat([last_hidden[0].mean(1), last_hidden[0].max(1)[0]], -1)
        outputs = self.neck(last_hidden)
        outputs = self.output_layer(outputs)
        outputs = torch.nn.functional.normalize(outputs)
        return outputs

    def predict_step(self, batch, batch_idx):
        """Return `[emb1 | emb2 | 1 - distance]` per pair as a detached CPU tensor."""
        categories1, colors1, names1, pic_embs1, name_bert_embs1,\
        categories2, colors2, names2, pic_embs2, name_bert_embs2,\
        labels = batch
        out1 = self.forward(categories1, colors1, names1, pic_embs1, name_bert_embs1)
        out2 = self.forward(categories2, colors2, names2, pic_embs2, name_bert_embs2)

        dists = nn.PairwiseDistance()(out1, out2)
        return torch.cat([out1, out2, (1 - dists).unsqueeze(-1)], dim=1).detach().cpu()

In [None]:
model = MultiModalNet()

In [None]:
# Load the trained multimodal weights, mapping all tensors to the CPU first.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a trusted source.
model.load_state_dict(torch.load(PATH_TO_MULTIMODAL, map_location=torch.device('cpu')))

In [None]:
# Checkpoint callback that tracks the best `val_auc`.
# NOTE(review): only `trainer.predict` is called in the code visible here, so this callback
# (and the validation/epoch settings below) look like leftovers from training — confirm.
checkpoint_cb = ModelCheckpoint(
    dirpath='./MultiModal/', filename='products-{epoch:02d}-{val_auc:.4f}-normalize', monitor='val_auc', mode='max'
)

# Single-GPU (device 0) trainer with mixed 16-bit precision and no logger.
trainer = pl.Trainer(
    logger=False, # CSVLogger('./'),
    enable_checkpointing=True,
    callbacks=[checkpoint_cb],
    accelerator='gpu',
    devices=[0],
    profiler='advanced',
    precision="16-mixed",
    check_val_every_n_epoch=1,
    max_epochs=args.epochs
)

In [None]:
# ItemsDataset unpacks each row positionally as (target, id1, id2), so add a dummy
# target and put the columns in exactly that order.
test_pairs['target'] = -1
test_pairs = test_pairs[['target', 'variantid1', 'variantid2']]
test_dataset = ItemsDataset(test_pairs, test_data)
# No shuffling and no dropped tail batch: predictions must line up row-for-row with test_pairs.
# Batches are loaded in the main process (num_workers=0).
test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=args.batch_size,
    num_workers=0,
    drop_last=False,
    shuffle=False,
    pin_memory=True
)

In [None]:
test_features = np.concatenate([pred.numpy() for pred in trainer.predict(model, test_loader)])

In [None]:
# One object slot per product in test_data; filled by variantid label from the pair predictions.
multimodal_tuned_768 = pd.Series(index=test_data.index, dtype='object', name='multimodal_tuned_768')
# Columns [0, 768) hold the first item's embedding, [768, 1536) the second item's.
# NOTE(review): products that occur in no pair keep NaN, and an id occurring in several
# pairs is assigned more than once — verify the resulting value is the intended one.
multimodal_tuned_768[test_pairs.variantid1] = list(test_features[:, :768])
multimodal_tuned_768[test_pairs.variantid2] = list(test_features[:, 768:768*2])
multimodal_tuned_768.to_pickle('./datasets/multimodal_tuned_test.pickle')

# Boosting part

In [143]:
pd.set_option('display.max_columns', 256)
warnings.filterwarnings('ignore')

In [144]:
test_pairs = pd.read_parquet('./datasets/test_pairs_wo_target.parquet')
# Mirrored copy of the pairs: same rows with the two id columns swapped.
# The swap is done by renaming and then restoring the original column order, so it
# does not depend on whether `frame.col = ...` writes into arrays that the
# right-hand-side column objects may still be viewing.
rtest_pairs = test_pairs.rename(
    columns={'variantid1': 'variantid2', 'variantid2': 'variantid1'}
)[list(test_pairs.columns)]
test_pairs

Unnamed: 0,variantid1,variantid2
0,52076340,290590137
1,64525522,204128919
2,77243372,479860557
3,86065820,540678372
4,91566575,258840506
...,...,...
18079,666998614,667074522
18080,670036240,670048449
18081,670284509,684323809
18082,692172005,704805270


In [145]:
# Training table keyed by variantid, with JSON columns decoded into dicts.
train_data = pd.read_parquet('./datasets/train_data.parquet')
train_data = train_data.set_index('variantid')
# Missing attribute mappings are treated as an empty JSON object.
attrs_json = train_data['characteristic_attributes_mapping'].fillna('{}')
train_data['characteristic_attributes_mapping'] = attrs_json.map(json.loads)
train_data['categories'] = train_data['categories'].map(json.loads)
# Keep only the first element of each main-picture embedding entry.
train_data['main_pic_embeddings_resnet_v1'] = train_data['main_pic_embeddings_resnet_v1'].map(
    lambda embs: embs[0]
)
train_data

Unnamed: 0_level_0,name,categories,color_parsed,pic_embeddings_resnet_v1,main_pic_embeddings_resnet_v1,name_bert_64,characteristic_attributes_mapping
variantid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
51195767,"Удлинитель Партнер-Электро ПВС 2х0,75 ГОСТ,6A,...","{'1': 'EPG', '2': 'Электроника', '3': 'Сетевые...",[оранжевый],,"[0.04603629, 0.18839523, -0.09973055, -0.66368...","[-0.47045058, 0.67237014, 0.48984158, -0.54485...","{'Номинальный ток, А': ['10'], 'Цвет товара': ..."
53565809,Магнитный кабель USB 2.0 A (m) - USB Type-C (m...,"{'1': 'EPG', '2': 'Электроника', '3': 'Кабели ...",[красный],"[[0.26863545, -0.3130674, 0.29023397, 0.073978...","[1.1471839, -0.665361, 0.7745614, 0.26716197, ...","[-0.6575592, 0.6522429, 0.5426037, -0.54347897...",{'Конструктивные особенности': ['Магнитная кон...
56763357,"Набор микропрепаратов Konus 25: ""Клетки и ткан...","{'1': 'EPG', '2': 'Электроника', '3': 'Оптичес...",,"[[0.66954195, 1.0643557, 0.78324044, -0.338267...","[-0.90570974, 1.0296293, 1.0769907, 0.27746, -...","[-0.7384308, 0.70784587, 0.3012653, -0.3583719...","{'Тип аксессуара': ['Набор микропрепаратов'], ..."
56961772,"Мобильный телефон BQ 1848 Step, черный","{'1': 'EPG', '2': 'Электроника', '3': 'Смартфо...",[черный],"[[0.6580482, -0.35763323, -0.16939065, -0.4249...","[0.13133773, -0.5577079, 0.32498044, 0.1917174...","[-0.44812852, 0.5283565, 0.28981736, -0.506841...","{'Тип карты памяти': ['microSD'], 'Число SIM-к..."
61054740,"Штатив трипод Tripod 330A для фотоаппаратов, в...","{'1': 'EPG', '2': 'Электроника', '3': 'Штативы...",[черный],"[[-0.10406649, 0.080646515, -0.28668788, 0.739...","[0.21696381, 0.10989461, -0.08012986, 0.691861...","[-0.72692573, 0.75206333, 0.37740713, -0.52502...","{'Материал': ['Металл'], 'Количество секций, ш..."
...,...,...,...,...,...,...,...
820128810,"Комплект 2 шт, Чернила Cactus CS-EPT6733B пурп...","{'1': 'EPG', '2': 'Электроника', '3': 'Расходн...",[пурпурный],,"[-1.4492652, -0.80129164, -0.12344764, 0.71945...","[-0.8253241, 0.6785133, 0.53978086, -0.4888316...","{'Тип': ['Чернила для принтера'], 'Бренд печат..."
821135769,"Защитное стекло закаленное Xiaomi Redmi 7, Y3 ...","{'1': 'EPG', '2': 'Электроника', '3': 'Защитны...",[черный],"[[0.09564891, 0.27437285, -0.19054827, -0.7992...","[0.012127608, -0.8534423, 0.5415518, -0.449125...","[-0.7413257, 0.46105132, 0.5639801, -0.5462132...","{'Вид стекла': ['3D'], 'Тип': ['Защитное стекл..."
822095690,Системный блок ЮКОМС 9400-268 (AMD A6-9400 (3....,"{'1': 'EPG', '2': 'Электроника', '3': 'Компьют...",[черный],,"[0.4248176, -0.15944786, -0.22844064, 0.427686...","[-0.49261805, 0.56726897, 0.7037877, -0.697246...","{'Общий объем HDD, ГБ': ['10000'], 'Видеокарта..."
822101044,Системный блок ЮКОМС 9400-9 (AMD A6-9400 (3.4 ...,"{'1': 'EPG', '2': 'Электроника', '3': 'Компьют...",[черный],,"[0.4248176, -0.15944786, -0.22844064, 0.427686...","[-0.44051006, 0.54029673, 0.63768685, -0.68040...","{'Общий объем HDD, ГБ': ['8000'], 'Видеокарта'..."


In [146]:
# Test table keyed by variantid, with JSON columns decoded into dicts.
test_data = pd.read_parquet('./datasets/test_data.parquet').set_index('variantid')
# Missing attribute mappings are treated as an empty JSON object.
test_data['characteristic_attributes_mapping'] = test_data['characteristic_attributes_mapping'].fillna('{}').apply(lambda x: json.loads(x))
test_data['categories'] = test_data['categories'].apply(lambda x: json.loads(x))
# Keep only the first element of each main-picture embedding entry.
test_data['main_pic_embeddings_resnet_v1'] = test_data['main_pic_embeddings_resnet_v1'].apply(lambda x: x[0])
# Attach the three precomputed name/multimodal embedding columns, aligned on the variantid index.
# NOTE(review): this reads 'name_labse_embs_test.parquet', while the LaBSE section above
# writes 'name_labse_embs_test.pickle' — confirm where the parquet file is produced.
test_data = pd.concat([test_data, pd.read_parquet('./datasets/name_labse_embs_test.parquet').set_index('variantid')], axis=1)
test_data = pd.concat([test_data, pd.read_pickle('./datasets/labse_tuned_test.pickle')], axis=1)
test_data = pd.concat([test_data, pd.read_pickle('./datasets/multimodal_tuned_test.pickle')], axis=1)
test_data

Unnamed: 0_level_0,name,categories,color_parsed,pic_embeddings_resnet_v1,main_pic_embeddings_resnet_v1,name_bert_64,characteristic_attributes_mapping,name_labse_768,labse_tuned_768,multimodal_tuned_768
variantid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
51201254,Колодка TDM Electric четырехместная без заземл...,"{'1': 'EPG', '2': 'Электроника', '3': 'Сетевые...",[белый],"[[0.34383398, -0.2962618, 0.07987049, -0.08257...","[0.38310742, -0.7876679, 0.5018278, 0.20900711...","[-0.5060825, 0.5773388, 0.59435517, -0.4958292...","{'Страна-изготовитель': ['Китай'], 'Бренд': ['...","[-0.0058242553, 0.0010011946, 0.015051351, 0.0...","[-0.0036340018, 0.05191699, -0.019618921, 0.03...","[0.022621285, 0.0024626048, -0.023789462, 0.00..."
77151532,Клавиатура черная с черной рамкой для 25-011879,"{'1': 'EPG', '2': 'Электроника', '3': 'Запчаст...",[черный],,"[0.50964713, 0.7958329, -1.4113188, 0.19993813...","[-0.43467724, 0.6614495, 0.48050267, -0.588880...","{'Страна-изготовитель': ['Китай'], 'Комплектац...","[0.0088402, -0.0050699823, 0.026550002, -0.015...","[0.020562124, -0.027739627, 0.003007357, 0.033...","[-0.00923243, -0.0176388, -0.04956357, -0.0119..."
89664856,"15.6"" Игровой ноутбук Acer Predator Helios 300...","{'1': 'EPG', '2': 'Электроника', '3': 'Компьют...",[черный],"[[0.7804302, -0.245446, -0.67754817, -0.614691...","[0.9958085, -0.113175124, -0.7623152, -0.91648...","[-0.70010763, 0.48152006, 0.47597092, -0.51727...",{'Видеокарта': ['NVIDIA GeForce RTX 2070 (8 Гб...,"[-0.026623247, -0.018851712, 0.011397564, -0.0...","[0.008961296, 0.0074943216, -0.0042574126, 0.0...","[-0.058446273, 0.047987167, -0.0066078017, -0...."
90701982,Портативная колонка Borofone BR7 Empyreal Spor...,"{'1': 'EPG', '2': 'Электроника', '3': 'Акустик...","[red, красный]","[[-0.24636984, -1.0719914, -0.49986655, 0.3423...","[-0.26596686, -1.143009, -0.5289628, 0.4285588...","[-0.73135185, -0.039796613, 0.38907066, -0.496...","{'Основной материал корпуса': ['Металл'], 'Мак...","[0.016539363, 0.03778109, 0.025718935, 0.08805...","[0.0002899492, 0.00082552544, -0.0036707376, 0...","[0.036278114, -0.025334716, 0.014591197, 0.025..."
92484118,Аккумулятор для Meizu BA712 ( M6s ),"{'1': 'EPG', '2': 'Электроника', '3': 'Батарей...",,,"[0.42047608, 0.75828516, 0.5440093, -0.0068945...","[-0.600158, 0.13944691, 0.48706242, -0.5050975...","{'Рекомендовано для': ['Meizu'], 'Бренд': ['Me...","[-0.0024493372, 0.02346121, 0.068452105, 0.023...","[0.0523341, -0.028439298, -0.0205805, 0.019437...","[0.05139726, -0.008842086, 0.020835716, 0.0234..."
...,...,...,...,...,...,...,...,...,...,...
702785891,Кабель USB - Lightning HOCO X21 PLUS (черно-бе...,"{'1': 'EPG', '2': 'Электроника', '3': 'Кабели ...",[черный],"[[1.1820095, -0.16312826, 1.4916217, 0.0288323...","[0.3297959, -0.16444838, 0.9350716, 0.34787956...","[-0.66597974, 0.7140731, 0.43572947, -0.445908...","{'Бренд': ['hoco'], 'Тип': ['Кабель'], 'Цвет т...","[-0.031527344, -0.06875799, 0.03187686, -0.004...","[0.015837612, -0.020126382, 0.01076397, 0.0433...","[0.022473004, -0.049976066, 0.0054458496, 0.03..."
704096517,Блок питания для ноутбука Asus f5gl (19V 90W 4...,"{'1': 'EPG', '2': 'Электроника', '3': 'Зарядны...",[черный],"[[-0.013610864, -0.68512607, 0.77639246, -1.04...","[0.2785852, -0.16053033, 1.1653559, 1.0619084,...","[-0.7575411, 0.4196694, 0.46428213, -0.4916808...","{'Комплектация': ['Зарядное устройство, сетево...","[-0.023706086, -0.012301952, -0.014316322, -0....","[-0.0046698838, -0.00089860754, -0.025120754, ...","[0.031209752, -0.024088062, 0.011526196, 0.009..."
705874953,Оперативная память HyperX FURY Black DDR4 2666...,"{'1': 'EPG', '2': 'Электроника', '3': 'Операти...",[black],"[[0.34073856, 0.65070343, 0.31146732, 1.261663...","[0.31382418, 0.60041714, 0.3067428, 1.1233345,...","[-0.60506856, 0.4477128, 0.62255704, -0.720129...","{'Тайминги': ['16-18-18-29'], 'Пропускная спос...","[-0.028754005, -0.025122717, 0.048854, -0.0297...","[-0.012161249, 0.049054068, -0.02890747, 0.063...","[-0.0119654825, -0.0031715138, -0.056217935, -..."
706965102,8 ТБ Внутренний жесткий диск Toshiba TOSHIBA N...,"{'1': 'EPG', '2': 'Электроника', '3': 'Жесткие...",,"[[-0.9360045, -0.43083164, -1.1651772, 1.23836...","[0.404035, -0.20071658, -0.44533533, 0.2038879...","[-0.62029105, 0.45747545, 0.6659858, -0.671704...","{'Комплектация': ['HDWG480UZSVA'], 'Форм-факто...","[-0.026827315, 0.032079216, 0.040149417, -0.01...","[-0.021119665, 0.045512594, -0.011890485, 0.03...","[-0.07998978, 0.016373454, -0.013044185, 0.003..."


In [147]:
# Lookup artifacts stored as pickled object arrays and converted back to plain Python objects.
# NOTE(review): allow_pickle=True executes pickle on load — only use trusted files.
both_cat3 = np.load('./models/both_cat3.npy', allow_pickle=True).tolist()
both_cat4 = np.load('./models/both_cat4.npy', allow_pickle=True).tolist()
# Rebinds the `colors_mapper` dict literal defined earlier in the notebook.
colors_mapper = np.load('./models/colors_mapper.npy', allow_pickle=True).tolist()
imp_attrs = np.load('./models/imp_attrs.npy', allow_pickle=True).tolist()
num_attrs = np.load('./models/num_attrs.npy', allow_pickle=True).tolist()
best_num_attrs = np.load('./models/best_num_attrs.npy', allow_pickle=True).tolist()
best_attrs = np.load('./models/best_attrs.npy', allow_pickle=True).tolist()

In [148]:
# Numbers only: decimals written with a dot, otherwise plain digit runs.
nums_tokenizer = RegexpTokenizer(r'\d+[.]\d+|\d+')
# General tokens, tried in this order: three or two space-separated digit groups,
# Latin words joined by dots, an uppercase run followed by a lowercase run,
# digit groups joined by . , : + or -, and finally any run of word characters.
tokenizer = RegexpTokenizer(r'\d+[ ]+\d+[ ]+\d+|\d+[ ]+\d+|[a-zA-Z]+[.]+[a-zA-Z]+|[A-Z]+[a-z]+|\d+[.,:+-]+\d+|\w+')
# Non-nested parenthesized fragments, parentheses included.
clar_tokenizer = RegexpTokenizer(r"\([^()]+\)")

In [149]:
def calc_dists(df, prefix, embs_1, embs_2):
    """Add pairwise L1, L2 and cosine columns for two aligned embedding sequences.

    Writes `{prefix}_l1_dist`, `{prefix}_l2_dist` and `{prefix}_cos_dist` into `df`
    in place and returns None. The "cos" column holds the cosine of the angle
    between the vectors (dot product divided by both norms).
    """
    metrics = {'l1': [], 'l2': [], 'cos': []}
    for vec_a, vec_b in zip(embs_1, embs_2):
        delta = vec_a - vec_b
        norm_a = (vec_a**2).sum()**0.5
        norm_b = (vec_b**2).sum()**0.5
        metrics['l1'].append(np.abs(delta).sum())
        metrics['l2'].append((delta**2).sum()**0.5)
        metrics['cos'].append((vec_a @ vec_b) / norm_a / norm_b)
    # Dict order (l1, l2, cos) fixes the order in which the columns are added.
    for kind, values in metrics.items():
        df[f'{prefix}_{kind}_dist'] = values

In [150]:
def make_categories_features(pairs, data):
    """Build category features for every pair.

    Returns a copy of `pairs` without the id columns and with:
    `cat3` / `cat4` — the first item's level-3 / level-4 category, replaced by
    'rest' when not listed in `both_cat3` / `both_cat4`;
    `is_eq_cat4` — whether both items share the same (raw) level-4 category.
    """
    gc.collect()
    features = pairs.copy()
    left_cats = data.loc[pairs.variantid1, 'categories']
    right_cats = data.loc[pairs.variantid2, 'categories']

    def level(cats, key):
        # Raw category names of one level as a plain array (positional, not index-aligned).
        return cats.apply(lambda item: item[key]).values

    left_cat4 = level(left_cats, '4')

    features['cat3'] = level(left_cats, '3')
    rare_cat3 = ~features.cat3.isin(both_cat3)
    features.loc[rare_cat3, 'cat3'] = 'rest'

    features['cat4'] = left_cat4
    rare_cat4 = ~features.cat4.isin(both_cat4)
    features.loc[rare_cat4, 'cat4'] = 'rest'

    # Compared on the raw names, i.e. before the 'rest' replacement.
    features['is_eq_cat4'] = left_cat4 == level(right_cats, '4')
    return features.drop(columns=['variantid1', 'variantid2'])

In [151]:
def make_colors_features(pairs, data):
    """Build colour-overlap features for every pair.

    Colours are canonicalized through `colors_mapper` and de-duplicated per item.
    Returns a copy of `pairs` without the id columns and with `same_colors`
    (shared colours), `all_colors` (union size, or the size of the only known
    side), `iou_colors` (their ratio, 0 when `all_colors` is 0) and
    `not_same_colors` (`all_colors - same_colors`).
    """
    gc.collect()
    features = pairs.copy()
    left_raw = data.loc[pairs.variantid1, 'color_parsed']
    right_raw = data.loc[pairs.variantid2, 'color_parsed']

    def canonical(colors):
        # None means "colour unknown" and is kept distinct from an empty set.
        if colors is None:
            return None
        return {colors_mapper[c] for c in colors}

    shared_counts = []
    total_counts = []
    for raw_left, raw_right in zip(left_raw, right_raw):
        left_set = canonical(raw_left)
        right_set = canonical(raw_right)
        if left_set is not None and right_set is not None:
            shared_counts.append(len(left_set & right_set))
            total_counts.append(len(left_set | right_set))
            continue
        # At least one side is unknown: nothing can be shared.
        shared_counts.append(0)
        known_side = left_set if left_set is not None else right_set
        total_counts.append(0 if known_side is None else len(known_side))

    features['same_colors'] = shared_counts
    features['all_colors'] = total_counts
    features['iou_colors'] = features['same_colors'] / features['all_colors']
    # 0 / 0 produces NaN above; define the IoU of "no colours at all" as 0.
    features.loc[features['all_colors'] == 0, 'iou_colors'] = 0
    features['not_same_colors'] = features['all_colors'] - features['same_colors']
    return features.drop(columns=['variantid1', 'variantid2'])

In [152]:
def make_pictures_features(pairs, data):
    """Build picture-embedding features for every product pair.

    The main-picture embeddings are first passed to ``calc_dists`` (prefix
    ``main_pic``). Then, for each pair, the main picture plus any additional
    pictures of both products are compared with cosine similarity; the
    resulting columns keep the historical ``*_dist`` naming.

    Args:
        pairs: DataFrame with ``variantid1`` and ``variantid2`` columns.
        data: DataFrame indexed by variant id with the columns
            ``main_pic_embeddings_resnet_v1`` and ``pic_embeddings_resnet_v1``
            (the latter may be ``None`` for a product).

    Returns:
        A copy of ``pairs`` without the id columns and with the picture
        features appended.
    """
    gc.collect()

    def cosine(u, v):
        # Dot product divided by both Euclidean norms.
        return (u @ v) / (u**2).sum()**0.5 / (v**2).sum()**0.5

    df = pairs.copy()
    main_pics_1 = data.loc[df.variantid1, 'main_pic_embeddings_resnet_v1'].values
    main_pics_2 = data.loc[df.variantid2, 'main_pic_embeddings_resnet_v1'].values
    calc_dists(df, 'main_pic', main_pics_1, main_pics_2)

    extras_1 = data.loc[df.variantid1, 'pic_embeddings_resnet_v1'].values
    extras_2 = data.loc[df.variantid2, 'pic_embeddings_resnet_v1'].values

    pic_cnts_1, pic_cnts_2 = [], []
    min_sims, mean_sims, max_sims, std_sims = [], [], [], []
    mean_to_main_1, mean_to_main_2 = [], []
    rows = zip(main_pics_1, main_pics_2, extras_1, extras_2)
    for main_1, main_2, extra_1, extra_2 in tqdm(rows, total=len(df)):
        # The picture counts exclude the main picture.
        pic_cnts_1.append(0 if extra_1 is None else len(extra_1))
        pic_cnts_2.append(0 if extra_2 is None else len(extra_2))
        gallery_1 = [main_1] + ([] if extra_1 is None else list(extra_1))
        gallery_2 = [main_2] + ([] if extra_2 is None else list(extra_2))

        # Every picture of product 1 against every picture of product 2.
        pairwise = [cosine(pic_1, pic_2) for pic_1 in gallery_1 for pic_2 in gallery_2]
        min_sims.append(np.min(pairwise))
        mean_sims.append(np.mean(pairwise))
        max_sims.append(np.max(pairwise))
        std_sims.append(np.std(pairwise))

        # Main picture of one product against all pictures of the other.
        mean_to_main_1.append(np.mean([cosine(main_1, pic_2) for pic_2 in gallery_2]))
        mean_to_main_2.append(np.mean([cosine(pic_1, main_2) for pic_1 in gallery_1]))

    df['pic_cnt_1'] = pic_cnts_1
    df['pic_cnt_2'] = pic_cnts_2
    df['pic_cnt_diff'] = np.abs(df['pic_cnt_1'] - df['pic_cnt_2'])
    df['pics_min_dist'] = min_sims
    df['pics_mean_dist'] = mean_sims
    df['pics_max_dist'] = max_sims
    df['pics_std_dist'] = std_sims
    df['pics_diff_dist'] = df['pics_max_dist'] - df['pics_min_dist']
    df['mean_dist_to_main_1'] = mean_to_main_1
    df['mean_dist_to_main_2'] = mean_to_main_2
    df['mean_dist_to_main_diff'] = np.abs(df['mean_dist_to_main_1'] - df['mean_dist_to_main_2'])
    return df.drop(['variantid1', 'variantid2'], axis=1)

In [153]:
def make_names_features(pairs, data):
    """Build product-name features for every pair.

    Combines embedding distances (via ``calc_dists``), ``thefuzz`` string
    scores, word / number token overlaps and simple per-name character
    statistics.

    Args:
        pairs: DataFrame with ``variantid1`` and ``variantid2`` columns.
        data: DataFrame indexed by variant id with the columns ``name``,
            ``name_bert_64`` and ``name_labse_768``.

    Returns:
        A copy of ``pairs`` without the id columns and with the name
        features appended.
    """
    gc.collect()
    df = pairs.copy()
    for prefix, column in (('name_bert', 'name_bert_64'), ('name_labse', 'name_labse_768')):
        calc_dists(
            df, prefix,
            data.loc[df.variantid1, column],
            data.loc[df.variantid2, column]
        )

    names_1 = data.loc[pairs.variantid1, 'name']
    names_2 = data.loc[pairs.variantid2, 'name']
    name_pairs = list(zip(names_1, names_2))

    # Fuzzy string similarity, one column per scorer.
    fuzzy_scorers = (
        ('name_dist', fuzz.ratio),
        ('name_partial_dist', fuzz.partial_ratio),
        ('name_token_sort_dist', fuzz.token_sort_ratio),
        ('name_token_set_dist', fuzz.token_set_ratio),
    )
    for column, scorer in fuzzy_scorers:
        df[column] = [scorer(first, second) for first, second in name_pairs]

    # Overlap of whitespace-separated words.
    word_sets = [(set(first.split()), set(second.split())) for first, second in name_pairs]
    df['same_words'] = [len(left & right) for left, right in word_sets]
    df['all_words'] = [len(left | right) for left, right in word_sets]
    df['iou_words'] = df['same_words'] / df['all_words']
    df.loc[df['all_words'] == 0, 'iou_words'] = 0
    df['not_same_words'] = df['all_words'] - df['same_words']

    # Overlap of the tokens produced by the module-level nums_tokenizer.
    num_sets = [
        (set(nums_tokenizer.tokenize(first)), set(nums_tokenizer.tokenize(second)))
        for first, second in name_pairs
    ]
    df['same_nums'] = [len(left & right) for left, right in num_sets]
    df['all_nums'] = [len(left | right) for left, right in num_sets]
    df['iou_nums'] = df['same_nums'] / df['all_nums']
    # Unlike the word IoU, two names without any number tokens score 1.
    df.loc[df['all_nums'] == 0, 'iou_nums'] = 1
    df['not_same_nums'] = df['all_nums'] - df['same_nums']

    # Per-name statistics; the key is the middle part of the column name.
    name_stats = (
        ('len', lambda text: len(text)),
        ('words', lambda text: len(text.split())),
        ('digit_cnt', lambda text: np.sum(['0' <= letter <= '9' for letter in text])),
        ('eng_cnt', lambda text: np.sum(['a' <= letter <= 'z' for letter in text.lower()])),
        ('rus_cnt', lambda text: np.sum(['а' <= letter <= 'я' or letter=='ё' for letter in text.lower()])),
        ('upper_cnt', lambda text: np.sum([letter.isupper() for letter in text])),
    )
    for feature, stat in name_stats:
        df[f'name_{feature}_1'] = names_1.apply(stat).values
        df[f'name_{feature}_2'] = names_2.apply(stat).values
    # The absolute differences are appended after all the raw statistics.
    for feature, _ in name_stats:
        df[f'name_{feature}_diff'] = np.abs(df[f'name_{feature}_1'] - df[f'name_{feature}_2'])
    return df.drop(['variantid1', 'variantid2'], axis=1)

In [154]:
def make_attributes_features(pairs, data):
    """Build features from the products' characteristic attributes.

    Each entry of ``characteristic_attributes_mapping`` is used as a mapping
    from attribute name to a list of strings (values are joined with ``' '``
    and indexed with ``[0]`` below).

    Relies on module-level names: ``imp_attrs``, ``num_attrs``,
    ``best_num_attrs``, ``best_attrs``, ``nums_tokenizer``, ``tokenizer``,
    ``train_data`` and ``test_data``.

    Args:
        pairs: DataFrame with ``variantid1`` and ``variantid2`` columns.
        data: DataFrame indexed by variant id with a
            ``characteristic_attributes_mapping`` column.

    Returns:
        A copy of ``pairs`` without the id columns and with the attribute
        features appended.
    """
    gc.collect()
    df = pairs.copy()
    attrs_1 = data.loc[pairs.variantid1, 'characteristic_attributes_mapping']
    attrs_2 = data.loc[pairs.variantid2, 'characteristic_attributes_mapping']

    def attr_pairs():
        # One progress-tracked pass over the aligned attribute mappings.
        return tqdm(zip(attrs_1, attrs_2), total=len(df))

    def add_ratio(column, numerator, denominator, empty_value):
        # numerator / denominator, with a fixed value where the denominator is 0.
        df[column] = df[numerator] / df[denominator]
        df.loc[df[denominator] == 0, column] = empty_value

    def first_number(values):
        # First numeric token of the first value of an attribute.
        return float(nums_tokenizer.tokenize(values[0])[0])

    def token_overlap(split_tokens):
        # Per pair: summed intersection / union sizes of the token sets of
        # every attribute the two products share.
        same_totals, all_totals = [], []
        for left, right in attr_pairs():
            same_total, all_total = 0, 0
            for key in left.keys() & right.keys():
                tokens_1 = set(split_tokens(' '.join(left[key])))
                tokens_2 = set(split_tokens(' '.join(right[key])))
                same_total += len(tokens_1 & tokens_2)
                all_total += len(tokens_1 | tokens_2)
            same_totals.append(same_total)
            all_totals.append(all_total)
        return same_totals, all_totals

    def normalised_text(values):
        # Join, re-tokenize with the module-level tokenizer, lower-case.
        return ' '.join(tokenizer.tokenize(' '.join(values))).lower()

    def first_value(attrs, feature, fill_value):
        # Lower-cased first value of an attribute, or the fill value.
        return attrs.get(feature, [fill_value])[0].lower()

    # --- shared keys and how many of their values agree -------------------
    same_keys, all_keys = [], []
    same_values, same_values_dist = [], []
    for left, right in attr_pairs():
        shared = left.keys() & right.keys()
        same_keys.append(len(shared))
        all_keys.append(len(left.keys() | right.keys()))
        same_values.append(sum(left[key] == right[key] for key in shared))
        same_values_dist.append(
            sum(fuzz.ratio(left[key], right[key]) for key in shared) / 100.
        )
    df['same_keys'] = same_keys
    df['all_keys'] = all_keys
    add_ratio('iou_keys', 'same_keys', 'all_keys', 0)
    df['not_same_keys'] = df['all_keys'] - df['same_keys']
    df['same_values'] = same_values
    add_ratio('same_values_ratio', 'same_values', 'same_keys', 0)
    df['same_values_dist'] = same_values_dist
    add_ratio('same_values_dist_ratio', 'same_values_dist', 'same_keys', 0)

    # --- disagreement on the attributes listed in imp_attrs ---------------
    imp_neq_cnt, imp_cnt = [], []
    for left, right in attr_pairs():
        important = left.keys() & right.keys() & imp_attrs
        # Only attributes without a single common value contribute.
        imp_neq_cnt.append(sum(
            100 - fuzz.token_set_ratio(left[key], right[key])
            for key in important
            if not (set(left[key]) & set(right[key]))
        ))
        imp_cnt.append(len(important))
    df['imp_neq_cnt'] = imp_neq_cnt
    df['imp_cnt'] = imp_cnt
    df['imp_eq_cnt'] = df['imp_cnt'] * 100 - df['imp_neq_cnt']
    add_ratio('neq/imp_cnt', 'imp_neq_cnt', 'imp_cnt', 0)

    # --- number-token overlap across shared attributes --------------------
    df['attr_same_nums'], df['attr_all_nums'] = token_overlap(nums_tokenizer.tokenize)
    add_ratio('attr_iou_nums', 'attr_same_nums', 'attr_all_nums', 1)
    df['attr_not_same_nums'] = df['attr_all_nums'] - df['attr_same_nums']

    # --- word overlap across shared attributes ----------------------------
    df['attr_same_words'], df['attr_all_words'] = token_overlap(str.split)
    add_ratio('attr_iou_words', 'attr_same_words', 'attr_all_words', 1)
    df['attr_not_same_words'] = df['attr_all_words'] - df['attr_same_words']

    # --- symmetric relative error on the attributes listed in num_attrs ---
    num_attrs_smape_sum = []
    num_attrs_total = []
    for left, right in attr_pairs():
        numeric_keys = left.keys() & right.keys() & num_attrs
        smape = 0
        for key in numeric_keys:
            num_1 = first_number(left[key])
            num_2 = first_number(right[key])
            # The small epsilon guards against a zero denominator.
            smape += 2 * np.abs(num_1 - num_2) / (num_1 + num_2 + 1e-9)
        num_attrs_smape_sum.append(smape)
        num_attrs_total.append(len(numeric_keys))
    df['num_attrs_smape_sum'] = num_attrs_smape_sum
    df['num_attrs_total'] = num_attrs_total
    add_ratio('num_attrs_smape_mean', 'num_attrs_smape_sum', 'num_attrs_total', 2)

    # --- absolute difference per attribute of best_num_attrs --------------
    for attr in tqdm(best_num_attrs):
        diffs = []
        for left, right in zip(attrs_1, attrs_2):
            absent = (attr not in left) + (attr not in right)
            if absent:
                # -1 / -2: the attribute is missing on one / both sides.
                diffs.append(-absent)
            else:
                diffs.append(np.abs(first_number(left[attr]) - first_number(right[attr])))
        df[f'diff_{attr}'] = diffs

    # --- fuzzy text score per attribute of best_attrs ---------------------
    for attr in tqdm(best_attrs):
        scores = []
        for left, right in zip(attrs_1, attrs_2):
            absent = (attr not in left) + (attr not in right)
            if absent:
                scores.append(-absent)
            else:
                scores.append(fuzz.token_set_ratio(
                    normalised_text(left[attr]), normalised_text(right[attr])
                ))
        df[f'fuzz_{attr}'] = scores

    # --- raw categorical attributes of both products ----------------------
    # NOTE(review): the original code zipped these five features with a
    # six-element fill list ['не указана', '', '', '', 'нет', ''], so the last
    # fill value was silently dropped. The effective pairing is kept as is;
    # 'нет' landing on 'Назначение' looks like it was meant for the Bluetooth
    # attribute - confirm against the training notebook before changing it,
    # since the loaded models were presumably trained on this pairing.
    cat_fill_values = (
        ('Страна-изготовитель', 'не указана'),
        ('Оперативная память', ''),
        ('Бренд процессора', ''),
        ('Модуль связи Bluetooth', ''),
        ('Назначение', 'нет'),
    )
    for feature, fill_value in tqdm(cat_fill_values, total=len(cat_fill_values)):
        # The vocabulary comes from the module-level train_data / test_data,
        # not from the `data` argument.
        train_values = {
            first_value(attrs, feature, fill_value)
            for attrs in train_data.characteristic_attributes_mapping
        }
        test_values = {
            first_value(attrs, feature, fill_value)
            for attrs in test_data.characteristic_attributes_mapping
        }
        both_values = train_values & test_values
        for side, side_attrs in (('1', attrs_1), ('2', attrs_2)):
            side_values = (first_value(attrs, feature, fill_value) for attrs in side_attrs)
            # Values not present in both vocabularies become 'другое'.
            df[f'{feature}_{side}'] = [
                value if value in both_values else 'другое' for value in side_values
            ]

    return df.drop(['variantid1', 'variantid2'], axis=1)

In [155]:
def make_embeddings_features(pairs, data):
    """Attach the raw embeddings of both products to every pair.

    Args:
        pairs: DataFrame with ``variantid1`` and ``variantid2`` columns.
        data: DataFrame indexed by variant id with the columns
            ``main_pic_embeddings_resnet_v1``, ``name_bert_64`` and
            ``name_labse_768``.

    Returns:
        A copy of ``pairs`` without the id columns and with
        ``<prefix>_1`` / ``<prefix>_2`` columns for the prefixes
        ``main_pic``, ``name_bert`` and ``name_labse``.
    """
    gc.collect()
    df = pairs.copy()
    # Output prefix -> source column in `data`.
    sources = (
        ('main_pic', 'main_pic_embeddings_resnet_v1'),
        ('name_bert', 'name_bert_64'),
        ('name_labse', 'name_labse_768'),
    )
    for prefix, column in sources:
        for side, variant_ids in (('1', pairs.variantid1), ('2', pairs.variantid2)):
            df[f'{prefix}_{side}'] = list(data.loc[variant_ids, column])
    return df.drop(['variantid1', 'variantid2'], axis=1)

In [156]:
# Run every feature builder on the test pairs, in the order in which the
# resulting frames are concatenated column-wise.
test_feature_frames = [
    builder(test_pairs, test_data)
    for builder in (
        make_categories_features,
        make_colors_features,
        make_pictures_features,
        make_names_features,
        make_attributes_features,
        make_embeddings_features,
    )
]
# Keep the individual frames available under their own names.
(
    test_df_categories,
    test_df_color,
    test_df_pictures,
    test_df_names,
    test_df_attributes,
    test_df_embeddings,
) = test_feature_frames

test_df = pd.concat(test_feature_frames, axis=1)

  0%|          | 0/18084 [00:00<?, ?it/s]

  0%|          | 0/18084 [00:00<?, ?it/s]

  0%|          | 0/18084 [00:00<?, ?it/s]

  0%|          | 0/18084 [00:00<?, ?it/s]

  0%|          | 0/18084 [00:00<?, ?it/s]

  0%|          | 0/18084 [00:00<?, ?it/s]

  0%|          | 0/143 [00:00<?, ?it/s]

  0%|          | 0/457 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

In [157]:
# Same feature pipeline as for test_pairs, applied to rtest_pairs.
rtest_feature_frames = [
    builder(rtest_pairs, test_data)
    for builder in (
        make_categories_features,
        make_colors_features,
        make_pictures_features,
        make_names_features,
        make_attributes_features,
        make_embeddings_features,
    )
]
# Keep the individual frames available under their own names.
(
    rtest_df_categories,
    rtest_df_color,
    rtest_df_pictures,
    rtest_df_names,
    rtest_df_attributes,
    rtest_df_embeddings,
) = rtest_feature_frames

rtest_df = pd.concat(rtest_feature_frames, axis=1)

  0%|          | 0/18084 [00:00<?, ?it/s]

  0%|          | 0/18084 [00:00<?, ?it/s]

  0%|          | 0/18084 [00:00<?, ?it/s]

  0%|          | 0/18084 [00:00<?, ?it/s]

  0%|          | 0/18084 [00:00<?, ?it/s]

  0%|          | 0/18084 [00:00<?, ?it/s]

  0%|          | 0/143 [00:00<?, ?it/s]

  0%|          | 0/457 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

In [158]:
# Categorical columns: two category levels plus, for each raw attribute,
# one column per product of the pair.
cat_features = ['cat3', 'cat4'] + [
    f'{attribute}_{side}'
    for attribute in (
        'Страна-изготовитель',
        'Оперативная память',
        'Бренд процессора',
        'Модуль связи Bluetooth',
        'Назначение',
    )
    for side in (1, 2)
]
# Columns that hold raw embedding vectors.
embedding_features = [
    f'{prefix}_{side}'
    for prefix in ('main_pic', 'name_bert', 'name_labse')
    for side in (1, 2)
]

In [159]:
# Restore the saved CatBoost classifier from disk.
model_cb_pseudo = CatBoostClassifier().load_model(
    fname='./models/golden_model_cb_pseudo.cbm',
)

In [160]:
# This model is fed without the raw embedding columns; the remaining columns
# are selected by the feature names stored in the model.
pseudo_feature_names = model_cb_pseudo.feature_names_
test_pool = Pool(
    data=test_df.drop(columns=embedding_features)[pseudo_feature_names],
    cat_features=cat_features,
)
rtest_pool = Pool(
    data=rtest_df.drop(columns=embedding_features)[pseudo_feature_names],
    cat_features=cat_features,
)

In [161]:
# Average the positive-class probability over both pools and store the same
# values in both pair frames.
test_pseudo_proba = model_cb_pseudo.predict_proba(test_pool)[:, 1]
rtest_pseudo_proba = model_cb_pseudo.predict_proba(rtest_pool)[:, 1]
test_pairs['cb_pseudo_pred'] = rtest_pairs['cb_pseudo_pred'] = (
    (test_pseudo_proba + rtest_pseudo_proba) / 2.
)

In [162]:
# Restore the saved CatBoost classifier from disk.
model_cb = CatBoostClassifier().load_model(
    fname='./models/golden_model_cb.cbm',
)

In [163]:
# Pools for model_cb: same categorical columns, plus the raw embedding
# columns declared as embedding features.
cb_pool_options = dict(
    cat_features=cat_features,
    embedding_features=embedding_features,
)
test_pool = Pool(data=test_df[model_cb.feature_names_], **cb_pool_options)
rtest_pool = Pool(data=rtest_df[model_cb.feature_names_], **cb_pool_options)

In [164]:
# Average the positive-class probability over both pools and store the same
# values in both pair frames.
test_cb_proba = model_cb.predict_proba(test_pool)[:, 1]
rtest_cb_proba = model_cb.predict_proba(rtest_pool)[:, 1]
test_pairs['cb_pred'] = rtest_pairs['cb_pred'] = (
    (test_cb_proba + rtest_cb_proba) / 2.
)

In [165]:
# Keep every feature column that precedes the first raw-embedding column
# (the first name starting with 'main_pic_1'); all columns if there is none.
all_feature_columns = list(test_df.columns)
first_embedding_position = next(
    (
        position
        for position, column_name in enumerate(all_feature_columns)
        if column_name.startswith('main_pic_1')
    ),
    len(all_feature_columns),
)
prev_cols = all_feature_columns[:first_embedding_position]
len(prev_cols)

690

In [166]:
# Carry the first-stage features over to the pair frames.
# NOTE: re-running this cell appends the same columns again.
carried_test_features = test_df[prev_cols]
carried_rtest_features = rtest_df[prev_cols]
test_pairs = pd.concat([test_pairs, carried_test_features], axis='columns')
rtest_pairs = pd.concat([rtest_pairs, carried_rtest_features], axis='columns')

In [167]:
def make_features(pairs, data):
    """Build the second-stage feature frame for the ensemble model.

    Adds the mean of the two first-stage predictions, distances between the
    tuned embeddings (via ``calc_dists``), recomputed colour-overlap features
    that also look for colour names inside the product names, a fuzzy score
    over the ``clar_tokenizer`` tokens of the names, and the raw tuned
    embeddings.

    Relies on module-level names: ``calc_dists``, ``colors_mapper``,
    ``clar_tokenizer``, ``fuzz`` and ``tqdm``.

    Args:
        pairs: DataFrame with ``variantid1``, ``variantid2``,
            ``cb_pseudo_pred`` and ``cb_pred`` columns.
        data: DataFrame indexed by variant id with the columns ``name``,
            ``color_parsed``, ``multimodal_tuned_768`` and ``labse_tuned_768``.

    Returns:
        A copy of ``pairs`` without the id columns and with the new or
        overwritten feature columns.
    """
    gc.collect()

    def collect_colors(parsed, name):
        # Normalised parsed colours plus every colour whose raw name appears
        # in the product name next to a space.
        found = set() if parsed is None else set([colors_mapper[c] for c in parsed])
        for color in colors_mapper:
            if color + ' ' in name or ' ' + color in name:
                found.add(colors_mapper[color])
        return found

    def clar_score(name_1, name_2):
        # 200 / 300 when one / both names yield no clar_tokenizer tokens,
        # otherwise the fuzzy score of the tokens with their first and last
        # characters stripped.
        clars_1 = clar_tokenizer.tokenize(name_1)
        clars_2 = clar_tokenizer.tokenize(name_2)
        empty_sides = (len(clars_1) == 0) + (len(clars_2) == 0)
        if empty_sides:
            return 100 * (1 + empty_sides)
        return fuzz.token_set_ratio(
            [v[1:-1] for v in clars_1],
            [v[1:-1] for v in clars_2]
        )

    df = pairs.copy()
    df['cb_mean_pred'] = (df['cb_pseudo_pred'] + df['cb_pred']) / 2

    # Distances between the tuned multimodal and LaBSE embeddings.
    for prefix in ('multimodal_tuned', 'labse_tuned'):
        source = f'{prefix}_768'
        calc_dists(
            df, prefix,
            data.loc[df.variantid1, source].values,
            data.loc[df.variantid2, source].values
        )

    names_1 = data.loc[pairs.variantid1, 'name']
    names_2 = data.loc[pairs.variantid2, 'name']
    colors_1 = data.loc[pairs.variantid1, 'color_parsed']
    colors_2 = data.loc[pairs.variantid2, 'color_parsed']

    # Colour overlap; these assignments replace any same-named columns that
    # `pairs` already carries.
    same_colors, all_colors = [], []
    rows = zip(colors_1, colors_2, names_1, names_2)
    for color_1, color_2, name_1, name_2 in tqdm(rows, total=len(df)):
        found_1 = collect_colors(color_1, name_1)
        found_2 = collect_colors(color_2, name_2)
        same_colors.append(len(found_1 & found_2))
        all_colors.append(len(found_1 | found_2))
    df['same_colors'] = same_colors
    df['all_colors'] = all_colors
    df['iou_colors'] = df['same_colors'] / df['all_colors']
    df.loc[df['all_colors'] == 0, 'iou_colors'] = 0
    df['not_same_colors'] = df['all_colors'] - df['same_colors']

    df['clars_set_dist'] = [
        clar_score(name_1, name_2) for name_1, name_2 in zip(names_1, names_2)
    ]

    # Raw tuned embeddings; the doubled 'tuned' in the LaBSE column names is
    # kept because the columns are later selected by name.
    raw_embeddings = (
        ('multimodal_tuned', 'multimodal_tuned_768'),
        ('labse_tuned_tuned', 'labse_tuned_768'),
    )
    for column_prefix, source in raw_embeddings:
        df[f'{column_prefix}_1'] = list(data.loc[pairs.variantid1, source])
        df[f'{column_prefix}_2'] = list(data.loc[pairs.variantid2, source])

    return df.drop(['variantid1', 'variantid2'], axis=1)

In [168]:
# Second-stage features for both pair frames (test pairs first).
test_df, rtest_df = [
    make_features(pair_frame, test_data)
    for pair_frame in (test_pairs, rtest_pairs)
]

  0%|          | 0/18084 [00:00<?, ?it/s]

  0%|          | 0/18084 [00:00<?, ?it/s]

In [171]:
# Categorical columns: two category levels plus, for each raw attribute,
# one column per product of the pair.
cat_features = ['cat3', 'cat4'] + [
    f'{attribute}_{side}'
    for attribute in (
        'Страна-изготовитель',
        'Оперативная память',
        'Бренд процессора',
        'Модуль связи Bluetooth',
        'Назначение',
    )
    for side in (1, 2)
]
# Raw tuned-embedding columns produced by make_features.
embedding_features = [
    f'{prefix}_{side}'
    for prefix in ('multimodal_tuned', 'labse_tuned_tuned')
    for side in (1, 2)
]

In [175]:
# Rebind model_cb_pseudo to the saved ensemble classifier.
model_cb_pseudo = CatBoostClassifier().load_model(
    fname='./models/golden_model_cb_pseudo_ens.cbm',
)

In [177]:
# Pools for the ensemble model; the first-stage cb_pseudo_pred column is
# passed as the baseline of each pool.
ens_pool_options = dict(
    cat_features=cat_features,
    embedding_features=embedding_features,
)

test_pool = Pool(
    data=test_df[model_cb_pseudo.feature_names_],
    baseline=test_df['cb_pseudo_pred'],
    **ens_pool_options,
)

rtest_pool = Pool(
    data=rtest_df[model_cb_pseudo.feature_names_],
    baseline=rtest_df['cb_pseudo_pred'],
    **ens_pool_options,
)

In [178]:
# Positive-class probabilities for the test pool (second column).
test_class_proba = model_cb_pseudo.predict_proba(test_pool)
preds = test_class_proba[:, 1]
preds

array([0.08422337, 0.15495005, 0.20565543, ..., 0.31311329, 0.8695845 ,
       0.53189708])

In [179]:
# Positive-class probabilities for the rtest pool (second column).
rtest_class_proba = model_cb_pseudo.predict_proba(rtest_pool)
rpreds = rtest_class_proba[:, 1]
rpreds

array([0.08532164, 0.15181598, 0.18964924, ..., 0.30818035, 0.86922222,
       0.51878388])

In [180]:
# Final score: element-wise mean of the two prediction vectors
# (multiplying by 0.5 is exactly the same as dividing by 2).
final_preds = 0.5 * (preds + rpreds)
final_preds

array([0.0847725 , 0.15338302, 0.19765234, ..., 0.31064682, 0.86940336,
       0.52534048])

In [186]:
# Write the submission file, then remove the temporary 'target' column so
# test_pairs is left without it.
test_pairs['target'] = final_preds
submission_columns = ['variantid1', 'variantid2', 'target']
test_pairs[submission_columns].to_csv('./golden_submit_ens_pseudo.csv', index=False)
test_pairs.drop(columns='target', inplace=True)