In [24]:
import gc
import os

import numpy as np
import pandas as pd
import random
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset

import pytorch_lightning as pl
from pytorch_lightning.loggers import CSVLogger, WandbLogger
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint

In [25]:
torch.__version__, torch.cuda.is_available()

('2.0.1+cu117', True)

In [26]:
pl.seed_everything(56, workers=True)

56

In [27]:
# Read only the columns used later in the notebook and index both frames by
# `variantid`, so that per-item values can be looked up by id.
columns = ['variantid', 'main_pic_embeddings_resnet_v1', 'name_bert_64', 'color_parsed']
train_data = pd.read_parquet('./datasets/train_data.parquet', columns=columns).set_index('variantid')
test_data = pd.read_parquet('./datasets/test_data.parquet', columns=columns).set_index('variantid')
train_data

Unnamed: 0_level_0,main_pic_embeddings_resnet_v1,name_bert_64,color_parsed
variantid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
51195767,"[[0.04603629, 0.18839523, -0.09973055, -0.6636...","[-0.47045058, 0.67237014, 0.48984158, -0.54485...",[оранжевый]
53565809,"[[1.1471839, -0.665361, 0.7745614, 0.26716197,...","[-0.6575592, 0.6522429, 0.5426037, -0.54347897...",[красный]
56763357,"[[-0.90570974, 1.0296293, 1.0769907, 0.27746, ...","[-0.7384308, 0.70784587, 0.3012653, -0.3583719...",
56961772,"[[0.13133773, -0.5577079, 0.32498044, 0.191717...","[-0.44812852, 0.5283565, 0.28981736, -0.506841...",[черный]
61054740,"[[0.21696381, 0.10989461, -0.08012986, 0.69186...","[-0.72692573, 0.75206333, 0.37740713, -0.52502...",[черный]
...,...,...,...
820128810,"[[-1.4492652, -0.80129164, -0.12344764, 0.7194...","[-0.8253241, 0.6785133, 0.53978086, -0.4888316...",[пурпурный]
821135769,"[[0.012127608, -0.8534423, 0.5415518, -0.44912...","[-0.7413257, 0.46105132, 0.5639801, -0.5462132...",[черный]
822095690,"[[0.4248176, -0.15944786, -0.22844064, 0.42768...","[-0.49261805, 0.56726897, 0.7037877, -0.697246...",[черный]
822101044,"[[0.4248176, -0.15944786, -0.22844064, 0.42768...","[-0.44051006, 0.54029673, 0.63768685, -0.68040...",[черный]


In [28]:
# Item pairs: the training file carries a `target` column, the test file (per its name) does not.
train_pairs = pd.read_parquet('./datasets/train_pairs_w_target.parquet')
test_pairs = pd.read_parquet('./datasets/test_pairs_wo_target.parquet')
# Cast the target to int so it can be used directly as a numeric label.
train_pairs['target'] = train_pairs['target'].astype(int)
train_pairs

Unnamed: 0,target,variantid1,variantid2
0,0,51197862,51198054
1,1,53062686,536165289
2,1,53602615,587809782
3,1,53888651,89598677
4,0,56930698,551526166
...,...,...,...
306535,0,817327230,822083612
306536,0,817560551,818069912
306537,0,817854719,817857267
306538,0,820036017,820037019


In [29]:
# Precomputed LaBSE name embeddings, indexed by `variantid`.
# The default is the original machine-specific absolute path; set the
# NAME_LABSE_EMBS_PATH environment variable to load the file from elsewhere
# without editing the notebook.
name_labse_embs_path = os.environ.get('NAME_LABSE_EMBS_PATH', 'F:/name_labse_embs.parquet')
name_labse_embs = pd.read_parquet(name_labse_embs_path).set_index('variantid')
name_labse_embs

Unnamed: 0_level_0,name_labse_768
variantid,Unnamed: 1_level_1
51195767,"[-0.033874325, 0.03722446, 0.0029757991, 0.068..."
53565809,"[0.015568526, -0.03899538, 0.064447366, 0.0383..."
56763357,"[-0.033072222, -0.04237577, 0.020771954, 0.065..."
56961772,"[0.014727573, -0.025661988, 0.023943473, -0.00..."
61054740,"[0.043145332, -0.052424084, 0.017260496, 0.045..."
...,...
820128810,"[-0.003678058, -0.031628493, 0.0065589263, 0.0..."
821135769,"[-0.06858361, 0.027011767, -0.016400583, -0.02..."
822095690,"[-0.04474233, -0.034224413, 0.026076552, 0.026..."
822101044,"[-0.05541598, 0.000863006, 0.01093415, 0.02208..."


In [30]:
# Give every distinct colour name a positive integer id in order of first
# appearance (id 0 is never assigned here) and record the longest colour list.
color_vocab = {}
max_count = 0
all_color_lists = np.concatenate([train_data['color_parsed'], test_data['color_parsed']])
for colors in all_color_lists:
    if colors is not None:
        max_count = max(max_count, len(colors))
        for value in colors:
            # Ids are handed out sequentially, so the next free id is len + 1.
            color_vocab.setdefault(value, len(color_vocab) + 1)
# One past the largest id that was assigned.
cur_id = len(color_vocab) + 1
cur_id, max_count

(248, 20)

In [31]:
def color_to_idx(colors):
    """Translate colour names into their integer ids from `color_vocab`.

    Args:
        colors: iterable of colour names, or None when no colours are recorded.

    Returns:
        A list of ids in the same order; an empty list for None.

    Raises:
        KeyError: if a name is not present in `color_vocab`.
    """
    indices = []
    if colors is not None:
        for name in colors:
            indices.append(color_vocab[name])
    return indices

def pad_colors(colors, max_len=20, pad_value=0):
    """Clip or right-pad a sequence of colour ids to exactly `max_len` entries.

    Args:
        colors: sequence of colour ids (any sliceable sequence).
        max_len: length of the returned list. Defaults to 20, the value that
            was previously hard-coded.
        pad_value: filler appended to short sequences. Defaults to 0.

    Returns:
        A new list of length `max_len`; the input is not modified.
    """
    clipped = list(colors[:max_len])
    # After clipping, len(clipped) <= max_len, so the pad count is never negative.
    return clipped + [pad_value] * (max_len - len(clipped))

# Replace the raw colour-name lists with fixed-length lists of colour ids
# (ids come from `color_vocab`, short lists are padded by `pad_colors`).
# NOTE: `color_parsed` is overwritten in place, so this cell is not idempotent:
# running it a second time would pass the integer ids back into `color_to_idx`.
# Reload the data frames before re-running it.
train_data['color_parsed'] = train_data['color_parsed'].apply(color_to_idx).apply(pad_colors)
test_data['color_parsed'] = test_data['color_parsed'].apply(color_to_idx).apply(pad_colors)
train_data['color_parsed']

variantid
51195767     [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
53565809     [2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
56763357     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
56961772     [3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
61054740     [3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
                                   ...                        
820128810    [16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
821135769    [3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
822095690    [3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
822101044    [3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
822394794    [27, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
Name: color_parsed, Length: 457063, dtype: object

In [32]:
from sklearn.model_selection import train_test_split

train_pairs, val_pairs = train_test_split(train_pairs, test_size=1/3, random_state=56)

In [73]:
class Args:
    """Run configuration read by the data-loader helpers."""

    # Number of pairs per mini-batch.
    batch_size: int = 128


args = Args()

In [74]:
class ItemsDataset(Dataset):
    """Dataset of labelled item pairs.

    Each sample is ``(x1, colors1, x2, colors2, target)``. ``x*`` is the
    concatenation of the item's first picture embedding, its ``name_bert_64``
    embedding and its ``name_labse_768`` embedding; ``colors*`` is the item's
    ``color_parsed`` value as a tensor.

    Args:
        pairs: DataFrame with ``target``, ``variantid1`` and ``variantid2``
            columns (in any order).
        data: mapping/DataFrame providing ``main_pic_embeddings_resnet_v1``,
            ``name_bert_64`` and ``color_parsed``, each indexable by variant id.
        labse_embs: optional mapping/DataFrame providing ``name_labse_768``
            indexable by variant id. When omitted, the notebook-level
            ``name_labse_embs`` frame is used, as before.
    """

    # Order in which __getitem__ unpacks a row of `self.pairs`.
    _PAIR_COLUMNS = ['target', 'variantid1', 'variantid2']

    def __init__(self, pairs, data, labse_embs=None):
        super().__init__()
        if labse_embs is None:
            # Backward-compatible default: fall back to the notebook global.
            labse_embs = name_labse_embs
        # Select the columns by name so the row layout does not depend on the
        # column order of the frame that happens to be passed in.
        self.pairs = pairs[self._PAIR_COLUMNS].values
        self.main_pic_embs = data['main_pic_embeddings_resnet_v1']
        self.name_embs = data['name_bert_64']
        self.name_labse_embs = labse_embs['name_labse_768']
        self.colors = data['color_parsed']
        self.pairs_len = len(self.pairs)

    def __len__(self):
        return self.pairs_len

    def _item_features(self, item_id):
        """Concatenate the dense embeddings of one item into a single tensor."""
        return torch.tensor(np.concatenate([
            self.main_pic_embs[item_id][0],  # only the first picture embedding is used
            self.name_embs[item_id],
            self.name_labse_embs[item_id],
        ]))

    def __getitem__(self, idx):
        target, id1, id2 = self.pairs[idx, :]

        return (
            self._item_features(id1),
            torch.tensor(self.colors[id1]),
            self._item_features(id2),
            torch.tensor(self.colors[id2]),
            torch.tensor(target)
        )

In [75]:
def get_data_loader(pairs, data, batch_size, drop_last, shuffle):
    """Build a DataLoader over an `ItemsDataset` made from `pairs` and `data`.

    Args:
        pairs: pair table passed to `ItemsDataset`.
        data: item table passed to `ItemsDataset`.
        batch_size: number of pairs per batch.
        drop_last: whether to drop a final incomplete batch.
        shuffle: whether to reshuffle the pairs every epoch.

    Returns:
        A `torch.utils.data.DataLoader` that loads in the main process
        (`num_workers=0`) with pinned memory enabled.
    """
    loader_options = dict(
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=0,
        pin_memory=True,
    )
    return torch.utils.data.DataLoader(ItemsDataset(pairs, data), **loader_options)

In [76]:
def get_loaders(args):
    """Create the training and validation loaders from the notebook-level frames.

    Args:
        args: object exposing a `batch_size` attribute.

    Returns:
        `(train_loader, val_loader)`. Both look items up in `train_data`; the
        training loader shuffles and drops the last incomplete batch, the
        validation loader keeps every pair in order.
    """
    batch_size = args.batch_size
    shuffled_train = get_data_loader(
        train_pairs, train_data, batch_size, drop_last=True, shuffle=True
    )
    ordered_val = get_data_loader(
        val_pairs, train_data, batch_size, drop_last=False, shuffle=False
    )
    return shuffled_train, ordered_val

In [77]:
train_loader, val_loader = get_loaders(args)
len(train_loader), len(val_loader)

(1596, 799)

In [78]:
for x1, colors1, x2, colors2, target in train_loader:
    print(x1, colors1, x2, colors2, target)
    break

tensor([[ 0.7913, -0.3410, -1.1740,  ...,  0.0648, -0.0277,  0.0173],
        [-0.4890,  0.0393, -1.2099,  ...,  0.0233, -0.0398, -0.0351],
        [-1.1650,  0.4374, -0.0684,  ..., -0.0366, -0.0449,  0.0196],
        ...,
        [ 0.2943, -0.0427, -0.0534,  ..., -0.0074, -0.0324, -0.0109],
        [-0.1379,  0.2678,  0.6704,  ..., -0.0098,  0.0305,  0.0413],
        [ 0.5079, -0.5721, -0.9554,  ...,  0.0431, -0.0601, -0.0029]]) tensor([[ 6,  0,  0,  ...,  0,  0,  0],
        [35,  9,  7,  ...,  0,  0,  0],
        [ 3,  0,  0,  ...,  0,  0,  0],
        ...,
        [ 7,  0,  0,  ...,  0,  0,  0],
        [ 3,  0,  0,  ...,  0,  0,  0],
        [ 3,  0,  0,  ...,  0,  0,  0]]) tensor([[ 0.8538, -0.2273, -1.1336,  ...,  0.0446, -0.0332,  0.0080],
        [-0.5894,  0.0583, -1.1590,  ...,  0.0236,  0.0097, -0.0238],
        [-1.7001,  0.2358, -0.2245,  ..., -0.0366, -0.0426,  0.0158],
        ...,
        [ 0.1755,  0.4481, -0.1310,  ..., -0.0074, -0.0324, -0.0109],
        [-0.4031, -

In [79]:
def weights_init_kaiming(m):
    """Initialise one module in place; intended for use with `Module.apply`.

    Modules are dispatched on their class name:
      * names containing "Conv": Kaiming-normal weights, `mode="fan_in"`;
      * names containing "Linear": Kaiming-normal weights, `mode="fan_out"`;
      * names containing "BatchNorm1d": weights drawn from N(1.0, 0.02).
    Any tensor-valued `bias` is then set to zero.

    Modules without a tensor `weight` / `bias` are skipped instead of raising.
    This covers `BatchNorm1d(affine=False)` (weight is None) and `nn.LSTM`
    (whose `bias` attribute is a bool flag, not a parameter).
    """
    classname = m.__class__.__name__
    weight = getattr(m, "weight", None)
    has_weight = isinstance(weight, torch.Tensor)

    if "Conv" in classname:
        if has_weight:
            torch.nn.init.kaiming_normal_(weight.data, a=0, mode="fan_in")
    elif "Linear" in classname:
        if has_weight:
            torch.nn.init.kaiming_normal_(weight.data, a=0, mode="fan_out")
    elif "BatchNorm1d" in classname:
        if has_weight:
            torch.nn.init.normal_(weight.data, 1.0, 0.02)

    bias = getattr(m, "bias", None)
    if isinstance(bias, torch.Tensor):
        torch.nn.init.constant_(bias.data, 0.0)


def weights_init_classifier(m):
    """Initialise a classifier layer in place; intended for `Module.apply`.

    Modules whose class name contains "Linear" get weights drawn from
    N(0, 0.001) and, when they have a bias, a zero bias. Other modules are
    left untouched. A `Linear(bias=False)` layer is handled without error.
    """
    classname = m.__class__.__name__
    if "Linear" not in classname:
        return
    torch.nn.init.normal_(m.weight.data, std=0.001)
    bias = getattr(m, "bias", None)
    if isinstance(bias, torch.Tensor):
        torch.nn.init.constant_(bias.data, 0.0)


class ClassBlock(torch.nn.Module):
    """Feature block followed by a sigmoid classifier.

    `add_block` is an optional stack of Linear -> BatchNorm1d -> LeakyReLU(0.1)
    -> Dropout, initialised with `weights_init_kaiming`. `classifier` is a
    Linear layer initialised with `weights_init_classifier`.

    Args:
        input_dim: size of the input features.
        class_num: number of classifier outputs.
        droprate: dropout probability; no Dropout layer is added when <= 0.
        relu: add a LeakyReLU(0.1) when True.
        bnorm: add a BatchNorm1d when True.
        linear: width of the leading Linear layer; when <= 0 that layer is
            skipped and the feature width stays `input_dim`.
    """

    def __init__(
        self, input_dim, class_num, droprate, relu=False, bnorm=True, linear=512
    ):
        super().__init__()
        layers = []
        if linear > 0:
            layers.append(torch.nn.Linear(input_dim, linear))
        else:
            # No projection: downstream layers see the raw input width.
            linear = input_dim
        if bnorm:
            layers.append(torch.nn.BatchNorm1d(linear))
        if relu:
            layers.append(torch.nn.LeakyReLU(0.1))
        if droprate > 0:
            layers.append(torch.nn.Dropout(p=droprate))
        add_block = torch.nn.Sequential(*layers)
        add_block.apply(weights_init_kaiming)

        classifier = torch.nn.Linear(linear, class_num)
        classifier.apply(weights_init_classifier)
        self.sigmoid = torch.nn.Sigmoid()

        self.add_block = add_block
        self.classifier = classifier

    def forward(self, x):
        """Return `(features, probabilities)` for a batch `x`.

        Only the trailing class dimension is squeezed, so with `class_num == 1`
        the probabilities have shape `(batch,)` for every batch size. A bare
        `squeeze()` would also drop the batch dimension of a one-sample batch.
        """
        features = self.add_block(x)
        output = self.classifier(features)
        return features, self.sigmoid(output).squeeze(-1)

In [80]:
# from https://github.com/TinyZeaMays/CircleLoss/blob/master/circle_loss.py

from typing import Tuple

import torch
from torch import nn, Tensor


def convert_label_to_similarity(normed_feature: Tensor, label: Tensor) -> Tuple[Tensor, Tensor]:
    """Split pairwise similarities into same-label and different-label sets.

    Args:
        normed_feature: feature matrix, one row per sample.
        label: one label per sample.

    Returns:
        `(sp, sn)`: 1-D tensors of the dot products between rows that share a
        label and rows that do not. Each unordered pair of distinct samples
        appears exactly once (strict upper triangle), in row-major order.
    """
    pairwise_sim = torch.matmul(normed_feature, normed_feature.transpose(0, 1))
    same_label = label[:, None] == label[None, :]

    # diagonal=1 keeps only entries above the main diagonal: no self-pairs and
    # no mirrored duplicates.
    pos_mask = torch.triu(same_label, diagonal=1)
    neg_mask = torch.triu(~same_label, diagonal=1)

    return pairwise_sim[pos_mask], pairwise_sim[neg_mask]


class CircleLoss(nn.Module):
    """Circle loss over positive (`sp`) and negative (`sn`) pair similarities.

    Args:
        m: margin; sets the per-pair weights and the decision offsets
            (`1 - m` for positives, `m` for negatives).
        gamma: scale applied to the weighted logits.
    """

    def __init__(self, m: float, gamma: float) -> None:
        super().__init__()
        self.m = m
        self.gamma = gamma
        self.soft_plus = nn.Softplus()

    def forward(self, sp: Tensor, sn: Tensor) -> Tensor:
        """Return the scalar loss for 1-D similarity tensors `sp` and `sn`."""
        margin, scale = self.m, self.gamma

        # The weights are computed from detached similarities, so gradients
        # flow only through the (s - delta) terms below.
        weight_pos = (-sp.detach() + 1 + margin).clamp(min=0.0)
        weight_neg = (sn.detach() + margin).clamp(min=0.0)

        offset_pos = 1 - margin
        offset_neg = margin

        pos_logits = -weight_pos * (sp - offset_pos) * scale
        neg_logits = weight_neg * (sn - offset_neg) * scale

        combined = torch.logsumexp(neg_logits, dim=0) + torch.logsumexp(pos_logits, dim=0)
        return self.soft_plus(combined)

In [81]:
from pytorch_metric_learning import losses

class ReIdentificationLossWithClassification(torch.nn.Module):
    """Sum of a contrastive, a binary cross-entropy and a circle loss.

    The contrastive and circle terms operate on L2-normalised `features`; the
    cross-entropy term compares `logits` with `labels`. The circle term is
    divided by the batch size before the three terms are added.
    """

    def __init__(self) -> None:
        super().__init__()

        self.CircleLoss = CircleLoss(m=0.25, gamma=64)
        # Despite the attribute name this is BCELoss, so `logits` must already
        # be probabilities in [0, 1].
        self.CrossEntropyLoss = nn.BCELoss()
        self.ContrastLoss = losses.ContrastiveLoss(pos_margin=0, neg_margin=1)

    def forward(self, features, logits, labels):
        """Return the combined scalar loss for one batch.

        Args:
            features: per-sample feature rows (normalised here).
            logits: per-sample probabilities for the BCE term.
            labels: per-sample float targets; also used to group samples for
                the contrastive and circle terms.
        """
        batch_size = labels.shape[0]

        # Row-wise L2 normalisation; the norm broadcasts across the feature axis.
        row_norms = torch.norm(features, p=2, dim=1, keepdim=True)
        unit_features = features / row_norms

        bce_term = self.CrossEntropyLoss(logits, labels)
        contrast_term = self.ContrastLoss(unit_features, labels)
        sp, sn = convert_label_to_similarity(unit_features, labels)
        circle_term = self.CircleLoss(sp, sn) / batch_size

        return contrast_term + bce_term + circle_term

In [82]:
from sklearn.metrics import roc_auc_score

class Net(pl.LightningModule):
    """Siamese matcher for item pairs.

    `forward` encodes one item: its dense embedding vector is concatenated with
    a bi-LSTM summary of its colour ids and passed through `stabilizer` and
    `neck`. The encodings of the two items are concatenated in both orders and
    scored by `classifier`.

    Targets are inverted before they reach the criterion (`1 - labels`), and
    predictions are flipped back (`1 - output`) wherever a match score is
    reported.

    The class reads the notebook-level `cur_id` (colour vocabulary size) when
    it is constructed, and the notebook-level `train_loader` / `val_loader`
    whenever Lightning asks for the data loaders.
    """

    # Not referenced by the current loss.
    margin = 0.75

    def __init__(self):
        super().__init__()

        color_emb_dim = (cur_id + 1) // 2
        self.embedding = nn.Embedding(
            num_embeddings=cur_id,
            embedding_dim=color_emb_dim,
            padding_idx=0
        )
        self.lstm_hidden = 64
        self.lstm = nn.LSTM(
            input_size=color_emb_dim,
            hidden_size=self.lstm_hidden,
            num_layers=1,
            batch_first=True,
            bidirectional=True
        )

        # Width of the concatenated item vector: 128 + 64 + 768 dense inputs
        # (presumably picture, name_bert_64 and name_labse_768 embeddings --
        # TODO confirm) plus both directions of the colour LSTM.
        features_num = 128 + 64 + 768 + self.lstm_hidden * 2

        self.stabilizer = nn.Sequential(
            nn.Linear(features_num, features_num, bias=False),
            nn.Dropout(p=0.1)
        )

        embedding_size = 512
        self.neck = nn.Sequential(
            nn.BatchNorm1d(features_num),
            nn.Linear(features_num, embedding_size, bias=False),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(embedding_size),
            nn.Linear(embedding_size, embedding_size, bias=False),
            nn.BatchNorm1d(embedding_size),
        )

        # Compares the two item encodings: input is their concatenation.
        self.classifier = ClassBlock(
            input_dim=embedding_size * 2,
            class_num=1,
            droprate=0,
            relu=True,
            bnorm=False,
            linear=embedding_size
        )

        self.criterion = ReIdentificationLossWithClassification()

    def forward(self, x, colors):
        """Encode one side of a pair.

        Args:
            x: dense item features, one row per item.
            colors: padded colour-id sequences, one row per item.

        Returns:
            The `neck` output for each item.
        """
        colors_emb = self.embedding(colors)
        output, _ = self.lstm(colors_emb)

        # Forward direction read at the last time step, reverse direction at
        # the first. The sequences are not packed, so padded positions are
        # processed by the LSTM as well.
        out_forward = output[:, -1, :self.lstm_hidden]
        out_reverse = output[:, 0, self.lstm_hidden:]
        colors_output = torch.cat([out_forward, out_reverse], 1)

        x = torch.cat([x, colors_output], dim=1)
        return self.neck(self.stabilizer(x))

    def configure_optimizers(self):
        """Return the AdamW optimizer configuration expected by Lightning."""
        optimizer = torch.optim.AdamW(
            self.parameters(), lr=1e-4, betas=(0.9, 0.999), weight_decay=0.05
        )
        return (
            {
                "optimizer": optimizer,
            },
        )

    def _score_pair(self, batch):
        """Encode both items of `batch` and classify the pair in both orders.

        Only the first four batch entries (x1, colors1, x2, colors2) are read,
        so a trailing label tensor is optional.

        Returns:
            `((features12, output12), (features21, output21))`, the
            `classifier` results for the (item1, item2) and (item2, item1)
            concatenations.
        """
        x1, colors1, x2, colors2 = batch[:4]
        out1 = self.forward(x1, colors1)
        out2 = self.forward(x2, colors2)

        scored_12 = self.classifier(torch.cat((out1, out2), 1))
        scored_21 = self.classifier(torch.cat((out2, out1), 1))
        return scored_12, scored_21

    def training_step(self, batch, batch_idx):
        """Average the criterion over both concatenation orders of the pair."""
        labels = batch[4]
        (features12, output12), (features21, output21) = self._score_pair(batch)

        targets = (1 - labels).to(torch.float32)
        loss = 0.5 * self.criterion(features12, output12, targets)
        loss = loss + 0.5 * self.criterion(features21, output21, targets)

        self.log("train_loss", loss, on_step=False, logger=False, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log the validation loss and a per-batch ROC AUC.

        The loss uses the (item1, item2) order only; the AUC uses the average
        of the two orders. `val_auc` is computed per batch and aggregated by
        Lightning over the epoch, so it is not the AUC of the whole validation
        set.
        """
        labels = batch[4]
        (features12, output12), (_, output21) = self._score_pair(batch)

        loss = self.criterion(features12, output12, (1 - labels).to(torch.float32))
        output = (output12 + output21) / 2

        self.log("val_loss", loss, logger=False, on_epoch=True, prog_bar=True)

        try:
            auc = roc_auc_score(labels.detach().cpu(), 1 - output.detach().cpu())
        except ValueError:
            # roc_auc_score raises ValueError when the batch contains a single
            # class; keep the previous fallback value for such batches. Any
            # other error is a real bug and is no longer swallowed.
            auc = 0.0

        self.log("val_auc", auc, logger=False, on_epoch=True, prog_bar=True)

    def train_dataloader(self):
        # Resolved at call time from the notebook-level variable.
        return train_loader

    def val_dataloader(self):
        # Resolved at call time from the notebook-level variable.
        return val_loader

    def predict_step(self, batch, batch_idx=0):
        """Return per-pair classifier features with the match score appended.

        The features of both concatenation orders are L2-normalised and
        averaged; the last column is `1 - mean(output)`. The result is detached
        and moved to the CPU. Labels in `batch` are ignored and may be absent.
        """
        (features1, output1), (features2, output2) = self._score_pair(batch)

        fnorm = torch.norm(features1, p=2, dim=1, keepdim=True)
        features1 = features1.div(fnorm.expand_as(features1))
        fnorm = torch.norm(features2, p=2, dim=1, keepdim=True)
        features2 = features2.div(fnorm.expand_as(features2))

        output = (output1 + output2) / 2
        features = (features1 + features2) / 2
        return torch.cat([features, (1 - output).unsqueeze(-1)], dim=1).detach().cpu()

In [83]:
model = Net()
model

Net(
  (embedding): Embedding(248, 124, padding_idx=0)
  (lstm): LSTM(124, 64, batch_first=True, bidirectional=True)
  (stabilizer): Sequential(
    (0): Linear(in_features=1088, out_features=1088, bias=False)
    (1): Dropout(p=0.1, inplace=False)
  )
  (neck): Sequential(
    (0): BatchNorm1d(1088, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (1): Linear(in_features=1088, out_features=512, bias=False)
    (2): ReLU(inplace=True)
    (3): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (4): Linear(in_features=512, out_features=512, bias=False)
    (5): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (classifier): ClassBlock(
    (sigmoid): Sigmoid()
    (add_block): Sequential(
      (0): Linear(in_features=1024, out_features=512, bias=True)
      (1): LeakyReLU(negative_slope=0.1)
    )
    (classifier): Linear(in_features=512, out_features=1, bias=True)
  )
  (criterion): ReIdentificat

In [84]:
import torchinfo

torchinfo.summary(model)

Layer (type:depth-idx)                             Param #
Net                                                --
├─Embedding: 1-1                                   30,752
├─LSTM: 1-2                                        97,280
├─Sequential: 1-3                                  --
│    └─Linear: 2-1                                 1,183,744
│    └─Dropout: 2-2                                --
├─Sequential: 1-4                                  --
│    └─BatchNorm1d: 2-3                            2,176
│    └─Linear: 2-4                                 557,056
│    └─ReLU: 2-5                                   --
│    └─BatchNorm1d: 2-6                            1,024
│    └─Linear: 2-7                                 262,144
│    └─BatchNorm1d: 2-8                            1,024
├─ClassBlock: 1-5                                  --
│    └─Sigmoid: 2-9                                --
│    └─Sequential: 2-10                            --
│    │    └─Linear: 3-1                    

In [85]:
# Single-GPU run for 10 epochs with validation after every epoch. Logging to a
# logger and checkpointing are both switched off, so nothing is persisted.
# NOTE(review): profiler='advanced' is enabled for the whole fit -- confirm it is
# still wanted outside of profiling sessions.
trainer = pl.Trainer(
    logger=False, # CSVLogger('./'),
    enable_checkpointing=False,
    
    accelerator='gpu', 
    devices=1,
    profiler='advanced',
    precision=32,
    check_val_every_n_epoch=1,
    max_epochs=10
)

# No loaders are passed here; Net.train_dataloader() / Net.val_dataloader() supply them.
trainer.fit(model)

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(

  rank_zero_warn(



Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [86]:
test_preds = trainer.predict(model, val_loader)

  rank_zero_warn(



Predicting: 0it [00:00, ?it/s]

In [87]:
roc_auc_score(val_pairs.target, np.concatenate([pred.numpy()[:, -1] for pred in test_preds]))

0.8647896305665499

In [88]:
# Ordered, complete pass over the training pairs (no shuffling, no dropped
# batch) so that the extracted features line up with `train_pairs.target`.
# NOTE: this rebinds the notebook-level `train_loader`. Net.train_dataloader()
# returns that same name, so calling trainer.fit(model) again after this cell
# would train on this unshuffled loader.
train_loader = get_data_loader(
    pairs=train_pairs,
    data=train_data,
    batch_size=args.batch_size,
    drop_last=False,
    shuffle=False
)
train_dataset = train_loader.dataset

In [89]:
features = np.concatenate([pred.numpy() for pred in trainer.predict(model, train_loader)])

  rank_zero_warn(



Predicting: 0it [00:00, ?it/s]

In [90]:
val_features = np.concatenate([pred.numpy() for pred in trainer.predict(model, val_loader)])

Predicting: 0it [00:00, ?it/s]

In [91]:
from catboost import CatBoostClassifier, Pool, cv

In [92]:
train_pool = Pool(
    data=features,
    label=train_pairs.target,
)

val_pool = Pool(
    data=val_features,
    label=val_pairs.target,
)

In [93]:
params = {
    'loss_function': 'Logloss',
    'custom_metric': ['AUC'],
    'task_type': 'CPU',
}

In [94]:
model_cb = CatBoostClassifier(**params, random_seed=56)
model_cb.fit(train_pool, eval_set=val_pool, verbose=False, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x1b4231dc430>

In [95]:
np.max(model_cb.get_evals_result()['validation']['AUC'])

0.8716759840727998

In [None]:
from sklearn.metrics import precision_recall_curve, auc

# Area under the precision-recall curve restricted to the points whose
# precision is at least 0.75, computed from the CatBoost validation scores.
# `thrs` (the thresholds) is not used afterwards.
precision, recall, thrs = precision_recall_curve(val_pairs.target, model_cb.predict_proba(val_pool)[:, 1])
gt_prec_level_idx = np.where(precision >= 0.75)[0]
auc(recall[gt_prec_level_idx], precision[gt_prec_level_idx])