In [1]:
import os
import gc
import multiprocessing
import warnings
import random
from tqdm.auto import tqdm

from PIL import Image
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from torch.optim.lr_scheduler import ChainedScheduler, LinearLR, ExponentialLR
from torch.nn import CrossEntropyLoss
from torch.utils.data import default_collate

from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import DataCollatorWithPadding
from transformers import CLIPProcessor, CLIPModel, CLIPTokenizerFast

from utils import get_title, preprocess_text_field, MeanPooling, Attention

def seed_everything(seed=42, deterministic=False):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer seed applied to ``PYTHONHASHSEED``, Python's ``random``,
            NumPy and PyTorch (CPU and CUDA).
        deterministic: Value assigned to ``torch.backends.cudnn.deterministic``.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    # The same seed goes to each library-level generator.
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    # cuDNN autotuning is always switched off; determinism is opt-in.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = deterministic

#### Основные настройки: seed, модель, рабочий каталог, warnings.

In [2]:
# Global seed: used for RNG seeding below and as `random_state` of the train/validation split.
SEED = 42
# Root folder of the input data; the precalculated embeddings are read from it and `train`
# writes its checkpoints below it.
# NOTE(review): the leading double slash looks like a typo for '/kaggle/...' — confirm.
WORKDIR = '//kaggle/input/kazan-exress-1/'
# Folder with product images, opened as '<product_id>.jpg' by the dataset class.
IMAGES_FOLDER = os.path.join(WORKDIR, 'row_data/images/train/')
# Silence all Python warnings and the transformers advisory warnings.
warnings.filterwarnings("ignore")
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'
# Seed all RNGs and request deterministic cuDNN kernels.
seed_everything(SEED, deterministic=True)

# IPython magic: sets the environment variable for this kernel process.
%env TOKENIZERS_PARALLELISM=false

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device: ", device)
print('CPU cores: ', multiprocessing.cpu_count())

# =========================================================================================
# Configurations
# =========================================================================================
class CFG:
    """Static configuration shared by the dataset, the model and the data loaders.

    Every value is a class attribute evaluated once, when this cell runs; the
    tokenizers, the processor and the embedding array are therefore loaded at
    class-definition time.
    """
    # Number of DataLoader worker processes: one per reported CPU core.
    num_workers = multiprocessing.cpu_count()
    # CLIP checkpoint name; tokenizer and processor are loaded from the same name.
    clip_model = "openai/clip-vit-large-patch14"
    clip_tokenizer = CLIPTokenizerFast.from_pretrained(clip_model)
    clip_processor = CLIPProcessor.from_pretrained(clip_model)
    # Precalculated CLIP embeddings. When this is not None the model consumes them
    # directly instead of loading and running CLIP.
    clip_embeddings = np.load(os.path.join(WORKDIR, 'embeddings.np.npy'))
    # Number of leading CLIP embedding components kept by the model.
    clip_cut_emb = 32
    # Text encoder checkpoint name. The tokenizer is derived from the same name so
    # the two can never get out of sync when the model is changed.
    bert_model = 'cointegrated/rubert-tiny2'
    bert_tokenizer = AutoTokenizer.from_pretrained(bert_model)
    # Number of leading text-embedding components kept; None keeps the full vector.
    bert_cut_emb = None
    # Optional state dict loaded into the model right after it is created.
    state_dict = None
    # Truncation length (in tokens) used when tokenizing documents.
    max_length = 256


env: TOKENIZERS_PARALLELISM=false
Device:  cuda
CPU cores:  2


#### Преобразование входных данных

In [15]:
# Read the raw training table from parquet.
data_full = pd.read_parquet('/kaggle/input/kazanexpress-data-with-categories/train.parquet')
# Drop columns that are not used as model inputs.
data_full = data_full.drop(columns=['shop_id', 'rating'])
# Extract the title first: `text_fields` is overwritten by its preprocessed form right after.
data_full['title'] = data_full.text_fields.apply(get_title)
data_full['text_fields'] = data_full.text_fields.apply(preprocess_text_field)
# Turn the "sale" flag into a text marker (empty string when the flag is falsy).
data_full['sale'] = data_full['sale'].apply(lambda x: "Распродажа!" if x else "")
data_full = data_full.fillna(value='')
# Concatenate everything into one string: "<shop_title>: <title>. <text_fields>. <sale>. "
data_full = data_full.assign(Document=[
    f'{shop!s}: {title!s}. {text!s}. {sale!s}. '
    for title, shop, text, sale in zip(data_full['title'], data_full['shop_title'],
                                       data_full['text_fields'], data_full['sale'])
])

data_full = data_full.drop(columns=['text_fields', 'shop_title', 'sale', 'title']).reset_index(drop=True)
# Drop categories with a single sample: a stratified split needs at least two per class.
category_counts = data_full.category_id.value_counts()
drop_ids = set(category_counts[category_counts < 2].index)
keep_mask = ~data_full['category_id'].isin(drop_ids)

# Keep the precalculated embeddings row-aligned with the filtered frame.
# NOTE(review): assumes embedding row i belongs to row i of the parquet file — confirm.
clip_embeddings_full = CFG.clip_embeddings
if clip_embeddings_full is not None:
    if not keep_mask.all() and len(clip_embeddings_full) == len(keep_mask):
        clip_embeddings_full = clip_embeddings_full[keep_mask.to_numpy()]
    assert len(clip_embeddings_full) == int(keep_mask.sum()), (
        'clip_embeddings must have exactly one row per remaining sample'
    )
data_full = data_full[keep_mask]

# Train/validation split, stratified by category.
if clip_embeddings_full is not None:
    data, data_valid, clip_embeddings, clip_embeddings_valid = train_test_split(
        data_full, clip_embeddings_full,
        test_size=0.2, random_state=SEED,
        shuffle=True, stratify=data_full.category_id)
else:
    data, data_valid = train_test_split(
        data_full, test_size=0.2, random_state=SEED,
        shuffle=True, stratify=data_full.category_id)
    # No precalculated embeddings: the datasets will load images from disk instead.
    clip_embeddings = clip_embeddings_valid = None

data = data.reset_index(drop=True)
data_valid = data_valid.reset_index(drop=True)
# Map raw category ids to contiguous class numbers.
# `cls2id[class_number]` is the raw category id; `id2cls[category_id]` is the class number.
cls2id = data_full.category_id.unique()
id2cls = {k : v for v, k in enumerate(cls2id)}

# Human-readable category names with their first 15 characters removed
# (presumably a fixed prefix shared by all names — TODO confirm).
id2category = {k:v[15:] for k, v in zip(data_full.category_id.tolist(), data_full.category_name.tolist())}

#### Классы датасета и модели. 
Модель и датасет позволяют загружать как модель CLIP (собственную или с huggingface), так и готовые (ранее сгенерированные и сохранённые в numpy array) эмбеддинги CLIP.

In [5]:
# =========================================================================================
# Dataset
# =========================================================================================
class stacked_dataset(Dataset):
    """Dataset yielding ``(text_inputs, image_inputs, class_number)`` triples.

    The image part is either a row of a precalculated embedding array (when
    ``clip_embeddings`` is given) or the CLIP processor's ``pixel_values`` for
    the image file ``<images_folder>/<product_id>.jpg``.

    Args:
        cfg: Configuration object providing ``bert_tokenizer``, ``max_length``
            and, for on-the-fly image loading, ``clip_processor``.
        documents: Text documents, one per sample.
        targets: Raw category ids, one per sample.
        id2cls: Mapping from raw category id to class number.
        images_folder: Folder containing the ``.jpg`` images.
        product_ids: Product ids used to build image file names.
        clip_embeddings: Optional indexable array of precalculated embeddings.
    """

    def __init__(self, cfg, documents:list, targets: list,
                 id2cls: dict, images_folder: str,
                 product_ids:list, clip_embeddings=None):
        self.cfg = cfg
        self.data = documents
        self.targets = targets
        self.id2cls = id2cls
        self.images_folder = images_folder
        self.product_ids = product_ids

        # The embedding array is only stored when one was actually supplied.
        self.use_precalculated_clip_embs = clip_embeddings is not None
        if self.use_precalculated_clip_embs:
            self.clip_embeddings = clip_embeddings

    def __len__(self):
        return len(self.data)

    def _image_inputs(self, item):
        """Return the image representation of sample ``item`` with a leading axis of size 1."""
        if self.use_precalculated_clip_embs:
            return self.clip_embeddings[item][None, :]
        file_name = str(self.product_ids[item]) + '.jpg'
        picture = Image.open(os.path.join(self.images_folder, file_name))
        processed = self.cfg.clip_processor(text=None, images=picture, return_tensors='pt')
        return processed['pixel_values']

    def _text_inputs(self, item):
        """Tokenize document ``item`` (truncated to ``cfg.max_length``, no tensor conversion)."""
        return self.cfg.bert_tokenizer(
            self.data[item],
            return_tensors=None,
            add_special_tokens=True,
            max_length=self.cfg.max_length,
            truncation=True,
        )

    def __getitem__(self, item):
        image_inputs = self._image_inputs(item)
        text_inputs = self._text_inputs(item)
        label = self.id2cls[self.targets[item]]
        return text_inputs, image_inputs, label

# =========================================================================================
# Unsupervised model
# =========================================================================================

class MeanPooling(nn.Module):
    """Mask-aware mean over dimension 1 of the encoder's last hidden state.

    Note: this definition shadows the ``MeanPooling`` imported from ``utils``
    at the top of the notebook.
    """

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Average hidden states over positions where ``attention_mask`` is non-zero.

        Args:
            last_hidden_state: Tensor whose dimension 1 is pooled away.
            attention_mask: Mask broadcast over the last dimension of
                ``last_hidden_state``.

        Returns:
            The masked sum along dimension 1 divided by the mask sum, with the
            divisor clamped to at least 1e-9.
        """
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_total = (last_hidden_state * mask).sum(dim=1)
        # Clamping keeps a fully masked row from dividing by zero.
        mask_total = mask.sum(dim=1).clamp(min=1e-9)
        return masked_total / mask_total

class Attention(nn.Module):
    """Sigmoid gate: rescales a value embedding using a projection of a query embedding.

    Note: this definition shadows the ``Attention`` imported from ``utils`` at
    the top of the notebook.

    Args:
        query_dim: Size of the last dimension of the query embedding.
        value_dim: Size of the last dimension of the value embedding.
    """

    def __init__(self, query_dim, value_dim):
        super().__init__()
        # Bias-free projection from the query space to the value space.
        self.fc = nn.Linear(query_dim, value_dim, bias=False)

    def forward(self, query_emb, value_emb):
        """Return ``value_emb`` multiplied element-wise by ``sigmoid(fc(query_emb))``."""
        gate = self.fc(query_emb).sigmoid()
        return value_emb * gate
    
class STACKED_CLF(nn.Module):
    """Classifier over concatenated, mutually gated text and image embeddings.

    Text is encoded by the model named in ``cfg.bert_model`` and mean-pooled.
    The image embedding is either passed in precalculated (when
    ``cfg.clip_embeddings`` is not None) or produced by a CLIP model loaded here.
    Both embeddings are truncated to their first ``*_cut_emb`` components,
    gated by each other, concatenated, batch-normalised and fed to a linear layer.

    Args:
        cfg: Configuration object with ``clip_embeddings``, ``clip_model``,
            ``clip_cut_emb``, ``bert_model`` and ``bert_cut_emb``.
        n_classes: Number of output classes.
    """

    def __init__(self, cfg, n_classes):
        super().__init__()
        # Configurations, CLIP and BERT models loading
        self.cfg = cfg
        # Set on every path: forward() reads this flag unconditionally.
        self.use_precalculated_clip_embs = cfg.clip_embeddings is not None
        if not self.use_precalculated_clip_embs:
            # The configuration names the checkpoint `clip_model`; `model` is still
            # honoured for configuration objects that only define that attribute.
            clip_name = cfg.clip_model if hasattr(cfg, 'clip_model') else cfg.model
            self.clip_config = AutoConfig.from_pretrained(clip_name)
            self.clip_model = CLIPModel.from_pretrained(clip_name)

        self.bert_config = AutoConfig.from_pretrained(cfg.bert_model)
        self.bert_model = AutoModel.from_pretrained(cfg.bert_model, config = self.bert_config)
        self.bert_pool = MeanPooling()

        # Width of each modality after the optional truncation; None keeps the full vector.
        if cfg.bert_cut_emb is None:
            bert_dim = self.bert_model.config.hidden_size
        else:
            bert_dim = cfg.bert_cut_emb
        if cfg.clip_cut_emb is not None:
            clip_dim = cfg.clip_cut_emb
        elif self.use_precalculated_clip_embs:
            clip_dim = cfg.clip_embeddings.shape[-1]
        else:
            clip_dim = self.clip_model.config.projection_dim
        self.hidden_dim = bert_dim + clip_dim

        # Attentions: the text gates the image embedding, then the gated image gates the text.
        self.attention_clip = Attention(bert_dim, clip_dim)
        self.attention_bert = Attention(clip_dim, bert_dim)
        # Classifier
        self.bn = nn.BatchNorm1d(self.hidden_dim)
        self.clf = nn.Linear(self.hidden_dim, n_classes)

    def forward(self, text_inputs, image_inputs):
        """Compute class logits for a batch.

        Args:
            text_inputs: Mapping of tokenizer outputs passed to the text encoder as
                keyword arguments; must contain ``'attention_mask'``.
            image_inputs: Precalculated image embeddings, or pixel values for the
                CLIP model when it was loaded in ``__init__``.

        Returns:
            Output of the final linear layer (one score per class and sample).
        """
        # Get BERT embeddings from text
        text_emb = self.bert_model(**text_inputs)
        text_emb = self.bert_pool(text_emb.last_hidden_state, text_inputs['attention_mask'])
        # Get CLIP embeddings from pictures
        if self.use_precalculated_clip_embs:
            img_emb = image_inputs
        else:
            img_emb = self.clip_model.get_image_features(pixel_values=image_inputs)
        # Cut embeddings (a None limit keeps every component)
        text_emb = text_emb[:, :self.cfg.bert_cut_emb]
        img_emb = img_emb[:, :self.cfg.clip_cut_emb]
        # Align dtypes so that e.g. float64 embeddings loaded from NumPy do not
        # clash with the float32 weights of the gating layers.
        img_emb = img_emb.to(text_emb.dtype)
        # Apply attentions
        img_emb = self.attention_clip(text_emb, img_emb)
        text_emb = self.attention_bert(img_emb, text_emb)
        # Concatenate BERT and CLIP embeddings
        emb = torch.cat([text_emb, img_emb], dim=1).float()
        # Classifier
        return self.clf(self.bn(emb))

#### Функция collate_fn и даталоадер:

In [6]:
# Pads the tokenized texts of a batch to the longest sequence in that batch.
transformers_collator = DataCollatorWithPadding(tokenizer = CFG.bert_tokenizer, padding = 'longest')

def custom_collate(batch):
    """Collate ``(text_inputs, image_inputs, target)`` samples into one batch.

    Args:
        batch: List of samples as produced by ``stacked_dataset.__getitem__``.

    Returns:
        Tuple ``(text_inputs, images, targets)``: the padded text batch from
        ``transformers_collator`` and the default-collated images and targets.
    """
    tokenized = [sample[0] for sample in batch]
    # Each image entry carries a leading axis of size 1; `[0]` strips it before stacking.
    images = [sample[1][0] for sample in batch]
    labels = [sample[2] for sample in batch]
    return transformers_collator(tokenized), default_collate(images), default_collate(labels)


def _make_loader(frame, embeddings, shuffle):
    """Build a DataLoader over `frame` (Document, category_id, product_id columns).

    Only the data frame, the embeddings and the shuffle flag differ between the
    two loaders below; every other setting is shared.
    """
    dataset = stacked_dataset(
        CFG,
        documents=frame.Document.tolist(),
        targets=frame.category_id.tolist(),
        id2cls=id2cls,
        images_folder=IMAGES_FOLDER,
        product_ids=frame.product_id.tolist(),
        clip_embeddings=embeddings,
    )
    return DataLoader(
        dataset,
        batch_size=256,
        shuffle=shuffle,
        collate_fn=custom_collate,
        num_workers=CFG.num_workers,
        pin_memory=True,
        drop_last=False,
    )


# Training batches are reshuffled every epoch; validation order stays fixed.
train_loader = _make_loader(data, clip_embeddings, shuffle=True)
valid_loader = _make_loader(data_valid, clip_embeddings_valid, shuffle=False)


#### Train loop.

In [1]:
def _evaluate(model, loader, loss_fn, max_batches):
    """Evaluate `model` on at most `max_batches` batches of `loader` without gradients.

    The model is switched to eval mode for the pass and back to train mode afterwards.

    Returns:
        Tuple ``(mean_loss, weighted_f1)`` over the evaluated batches.

    Raises:
        ValueError: If `loader` yields no batches.
    """
    ground_truth = []
    predicted = []
    loss_sum = 0.
    n_batches = 0
    model.eval()
    with torch.no_grad():
        for eval_step, batch in enumerate(loader):
            text_input = batch[0].to(device)
            image_input = batch[1].to(device)
            target = batch[2].to(device)
            output = model(text_input, image_input)
            loss_sum += loss_fn(output, target).item()
            n_batches += 1
            ground_truth.append(target.cpu().numpy())
            predicted.append(output.argmax(dim=1).cpu().numpy())
            # Stop after the last wanted batch so no extra batch is fetched.
            if eval_step + 1 >= max_batches:
                break
    model.train()
    if n_batches == 0:
        raise ValueError('Evaluation loader produced no batches')
    weighted_f1 = f1_score(np.concatenate(ground_truth), np.concatenate(predicted), average='weighted')
    return loss_sum / n_batches, weighted_f1


def train(model, train_loader, test_loader,
          epochs=20, lr=0.0001, checkpoint_period=None, weight_decay=1e-7,
          warmup_epochs=2, gamma=0.93, verbose=True, checkpoint_dir=None):
    """Train `model` with AdamW, linear warm-up and exponential LR decay.

    Every `checkpoint_period` training steps the weights are saved to
    ``checkpoints/checkpoint.pt`` and the model is evaluated on `test_loader`;
    the weights with the best weighted F1 so far are saved to ``best.pt``.

    Args:
        model: Module called as ``model(text_input, image_input)``.
        train_loader: Loader yielding ``(text_inputs, image_inputs, targets)`` batches.
        test_loader: Loader of the same form used for evaluation.
        epochs: Number of passes over `train_loader`.
        lr: Learning rate reached at the end of the warm-up.
        checkpoint_period: Training steps between checkpoints/evaluations;
            None means once per epoch (``len(train_loader)``).
        weight_decay: AdamW weight decay.
        warmup_epochs: Number of epochs of linear warm-up.
        gamma: Per-epoch multiplicative LR decay.
        verbose: Print progress and show the inner progress bar.
        checkpoint_dir: Folder for the saved weights; None uses ``WORKDIR``.

    Returns:
        Tuple ``(max_f1, best_epoch)``: best weighted F1 and the epoch it was reached.
    """
    # The exponential decay is active during warm-up too, so the base LR is
    # pre-scaled by gamma ** -warmup_epochs to land on `lr` when warm-up ends.
    opt = AdamW(model.parameters(), lr=lr * gamma ** -warmup_epochs, weight_decay=weight_decay)

    model.to(device)
    loss_fn = torch.nn.CrossEntropyLoss(label_smoothing=0.01)
    opt.zero_grad()
    torch.cuda.empty_cache()
    scheduler = ChainedScheduler([LinearLR(opt, start_factor=0.02, total_iters=warmup_epochs),
                                  ExponentialLR(opt, gamma=gamma)])
    if checkpoint_period is None:
        checkpoint_period = len(train_loader)
    # Evaluate on half a checkpoint period worth of batches, but never on zero
    # batches (a period of 1 would otherwise divide by zero).
    eval_batches = max(1, checkpoint_period // 2)

    save_dir = WORKDIR if checkpoint_dir is None else checkpoint_dir
    checkpoint_path = os.path.join(save_dir, 'checkpoints/checkpoint.pt')
    best_path = os.path.join(save_dir, 'best.pt')
    # torch.save does not create missing folders.
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

    max_f1 = 0.
    best_epoch = 0

    for epoch in tqdm(range(1, epochs+1, 1)):
        # TRAIN
        model.train()
        loss_avg = 0.
        if verbose:
            print(f'Epoch={epoch}')
            print(f'Lr: {scheduler.get_last_lr()[0]:.9f}')
        for step, batch in tqdm(enumerate(train_loader), total=len(train_loader), disable = not verbose):
            text_input = batch[0].to(device)
            image_input = batch[1].to(device)
            target = batch[2].to(device)
            output = model(text_input, image_input)
            loss = loss_fn(output, target)
            loss.backward()
            opt.step()
            opt.zero_grad()
            loss_avg += loss.item() / checkpoint_period
            if step % checkpoint_period == checkpoint_period - 1:
                if verbose:
                    print(f'Step={step+1}, Train loss={loss_avg:.6f}')
                loss_avg = 0.
                torch.save(model.state_dict(), checkpoint_path)

                # EVAL
                eval_loss, weighted_f1 = _evaluate(model, test_loader, loss_fn, eval_batches)

                if weighted_f1 > max_f1:
                    max_f1 = weighted_f1
                    best_epoch = epoch
                    torch.save(model.state_dict(), best_path)

                if verbose:
                    print(f"F1={weighted_f1:.5f}")
                    print(f'Eval loss={eval_loss:.5f}\n')
        # The schedule is expressed in epochs (warmup_epochs, per-epoch gamma), so it
        # advances once per epoch regardless of how often checkpoints are taken.
        scheduler.step()
    return max_f1, best_epoch

#### Создание модели.

In [8]:
# One output per distinct category id; the model is moved to the selected device.
n_classes = len(cls2id)
model = STACKED_CLF(CFG, n_classes)
model = model.to(device)
# Optionally start from the weights supplied in the configuration.
pretrained_state = CFG.state_dict
if pretrained_state is not None:
    model.load_state_dict(pretrained_state)
# Release cached GPU memory and unreachable Python objects before training.
torch.cuda.empty_cache()
gc.collect()

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


#### Обучение модели.

In [9]:
# 15 epochs: 2 warm-up epochs up to a learning rate of 4e-4, then x0.93 decay per epoch.
# The cell value is the (best weighted F1, best epoch) pair returned by `train`.
train(
    model,
    train_loader,
    valid_loader,
    epochs=15,
    lr=0.0004,
    warmup_epochs=2,
    gamma=0.93,
)

  0%|          | 0/15 [00:00<?, ?it/s]

Epoch=1
Lr: 0.000009250


  0%|          | 0/285 [00:00<?, ?it/s]

Step=285, Train loss=6.208466
F1=0.17484
Eval loss=5.42725

Epoch=2
Lr: 0.000219355


  0%|          | 0/285 [00:00<?, ?it/s]

Step=285, Train loss=2.135760
F1=0.79485
Eval loss=0.95374

Epoch=3
Lr: 0.000400000


  0%|          | 0/285 [00:00<?, ?it/s]

Step=285, Train loss=0.659543
F1=0.86740
Eval loss=0.61421

Epoch=4
Lr: 0.000372000


  0%|          | 0/285 [00:00<?, ?it/s]

Step=285, Train loss=0.335750
F1=0.88267
Eval loss=0.57403

Epoch=5
Lr: 0.000345960


  0%|          | 0/285 [00:00<?, ?it/s]

Step=285, Train loss=0.225562
F1=0.88743
Eval loss=0.57602

Epoch=6
Lr: 0.000321743


  0%|          | 0/285 [00:00<?, ?it/s]

Step=285, Train loss=0.178988
F1=0.88656
Eval loss=0.58686

Epoch=7
Lr: 0.000299221


  0%|          | 0/285 [00:00<?, ?it/s]

Step=285, Train loss=0.159071
F1=0.88829
Eval loss=0.60563

Epoch=8
Lr: 0.000278275


  0%|          | 0/285 [00:00<?, ?it/s]

Step=285, Train loss=0.150054
F1=0.88808
Eval loss=0.62492

Epoch=9
Lr: 0.000258796


  0%|          | 0/285 [00:00<?, ?it/s]

Step=285, Train loss=0.145846
F1=0.89022
Eval loss=0.62536

Epoch=10
Lr: 0.000240680


  0%|          | 0/285 [00:00<?, ?it/s]

Step=285, Train loss=0.142568
F1=0.88927
Eval loss=0.63574

Epoch=11
Lr: 0.000223833


  0%|          | 0/285 [00:00<?, ?it/s]

Step=285, Train loss=0.140059
F1=0.88901
Eval loss=0.64354

Epoch=12
Lr: 0.000208164


  0%|          | 0/285 [00:00<?, ?it/s]

Step=285, Train loss=0.138888
F1=0.88828
Eval loss=0.65133

Epoch=13
Lr: 0.000193593


  0%|          | 0/285 [00:00<?, ?it/s]

Step=285, Train loss=0.137706
F1=0.88890
Eval loss=0.65808

Epoch=14
Lr: 0.000180041


  0%|          | 0/285 [00:00<?, ?it/s]

Step=285, Train loss=0.136736
F1=0.88795
Eval loss=0.66512

Epoch=15
Lr: 0.000167439


  0%|          | 0/285 [00:00<?, ?it/s]

Step=285, Train loss=0.135940
F1=0.88680
Eval loss=0.67078



(0.8902174169948255, 9)