In [1]:
import os
import gc
import random
import multiprocessing
import warnings
from tqdm.auto import tqdm

from PIL import Image
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from torch.optim.lr_scheduler import ChainedScheduler, LinearLR, ExponentialLR
from torch.nn import CrossEntropyLoss
from torch.utils.data import default_collate

from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import DataCollatorWithPadding
from transformers import CLIPProcessor, CLIPModel, CLIPTokenizerFast

from utils import get_title, preprocess_text_field, MeanPooling, Attention

def seed_everything(seed=42, deterministic=False):
    """Seed every random number generator the notebook relies on.

    Args:
        seed: Value passed to `random`, NumPy and PyTorch (CPU and CUDA) seeding
            functions; it is also exported as ``PYTHONHASHSEED``.
        deterministic: Value assigned to ``torch.backends.cudnn.deterministic``.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)
    # cuDNN benchmarking is switched off regardless of `deterministic`.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = deterministic

#### Основные настройки: seed, модель, рабочий каталог, warnings.

In [2]:
SEED = 42
WORKDIR = '//home/ubuntu/gitrepo/KazanExpress/2/'
IMAGES_FOLDER = os.path.join(WORKDIR, 'row_data/images/train/')
IMAGES_FOLDER_TEST = os.path.join(WORKDIR, 'row_data/images/test/')
warnings.filterwarnings("ignore")
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'
seed_everything(seed=42, deterministic=True)

%env TOKENIZERS_PARALLELISM=false

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device: ", device)
print('CPU cores: ', multiprocessing.cpu_count())

# =========================================================================================
# Configurations
# =========================================================================================
class CFG:
    """Notebook-wide configuration: model names, tokenizers and precalculated embeddings.

    All attributes are evaluated once, when the class body runs, so creating the
    class downloads/loads the tokenizers and reads both ``.npy`` embedding files.
    """
    # Number of DataLoader worker processes (one per CPU core).
    num_workers = multiprocessing.cpu_count()
    # CLIP checkpoint and its text/image preprocessors.
    clip_model = "openai/clip-vit-large-patch14"
    clip_tokenizer = CLIPTokenizerFast.from_pretrained(clip_model)
    clip_processor = CLIPProcessor.from_pretrained(clip_model)
    # Precalculated CLIP image embeddings for the train and test tables.
    # The dataset and the model switch to running CLIP on raw images when
    # `clip_embeddings` is None.
    clip_embeddings = np.load('embeddings_clip.np.npy')
    clip_embeddings_test = np.load('embeddings_clip_test.np.npy')
    # Number of leading CLIP embedding dimensions kept by the classifier.
    clip_cut_emb = 40
    # BERT checkpoint; the tokenizer is derived from the same name so the two
    # can never get out of sync.
    bert_model = 'cointegrated/rubert-tiny2'
    bert_tokenizer = AutoTokenizer.from_pretrained(bert_model)
    # Number of leading BERT embedding dimensions kept (None keeps all of them).
    bert_cut_emb = None
    # Optional model weights to load right after the model is created.
    state_dict = None
    # Maximum token length used when truncating documents.
    max_length = 256


env: TOKENIZERS_PARALLELISM=false
Device:  cuda
CPU cores:  8


#### Преобразование входных данных.

In [3]:
# Read the training table from parquet
data_full = pd.read_parquet(os.path.join(WORKDIR, 'row_data/new/train.parquet'))
# Drop columns that are not used as model inputs
data_full.drop(columns=['shop_id', 'rating'], inplace=True)
# Extract the title from the raw text fields first, then preprocess the text fields
data_full['title'] = data_full.text_fields.apply(get_title)
data_full.text_fields = data_full.text_fields.apply(preprocess_text_field)
# Turn the "sale" flag into a text marker (empty string when falsy)
data_full['sale'] = data_full['sale'].apply(lambda x: "Распродажа!" if x else "")
data_full.fillna(value='', inplace=True)
# Concatenate shop title, title, text fields and sale marker into one document string
data_full = data_full.assign(Document=[
    str(y) + ': ' + str(x) + '. ' + str(z) + '. ' + str(s) + '. '
    for x, y, z, s in zip(data_full['title'], data_full['shop_title'],
                          data_full['text_fields'], data_full['sale'])
])

data_full = data_full.drop(columns=['text_fields', 'shop_title', 'sale', 'title']).reset_index(drop=True)
# Duplicate the rows of categories that occur only once, so that every category
# has at least two samples for the stratified split below.
category_counts = data_full.category_id.value_counts()
dup_ids = set(category_counts[category_counts < 2].index)
# DataFrame.append was removed in pandas 2.0; pd.concat keeps the same row order and index.
data_full = pd.concat([data_full, data_full[data_full['category_id'].isin(dup_ids)]])
# NOTE(review): CFG.clip_embeddings must have exactly one row per row of data_full
# *after* the duplication above, in the same order - confirm how the .npy file was built.
# Train/validation split
if CFG.clip_embeddings is not None:
    data, data_valid, clip_embeddings, clip_embeddings_valid = train_test_split(
        data_full, CFG.clip_embeddings,
        test_size=0.025, random_state=SEED,
        shuffle=True, stratify=data_full.category_id)
else:
    # Bind the same names as the branch above so the following cells work
    # whether or not precalculated embeddings are available.
    data, data_valid = train_test_split(
        data_full, test_size=0.025, random_state=SEED,
        shuffle=True, stratify=data_full.category_id)
    clip_embeddings, clip_embeddings_valid = None, None

data.reset_index(drop=True, inplace=True)
data_valid.reset_index(drop=True, inplace=True)
# Map category ids to contiguous class numbers.
# Mind the names: `cls2id` is an array indexed by class number that yields the
# original category_id, while `id2cls` maps category_id -> class number.
cls2id = data_full.category_id.unique()
id2cls = {k : v for v, k in enumerate(cls2id)}

# category_id -> category name without its first 15 characters
# (presumably a fixed common prefix - TODO confirm against the data).
id2category = {k:v[15:] for k, v in zip(data_full.category_id.tolist(), data_full.category_name.tolist())}

#### Преобразование входных данных из файла "test"

In [4]:
# Load the test table and discard the columns the model does not use
data_test = pd.read_parquet(os.path.join(WORKDIR, 'row_data/new/test.parquet'))
data_test = data_test.drop(columns=['shop_id', 'rating'])
# The title is taken from the raw text fields, so derive it before they are preprocessed
data_test['title'] = data_test['text_fields'].apply(get_title)
data_test['text_fields'] = data_test['text_fields'].apply(preprocess_text_field)
# Replace the "sale" flag with a text marker, then blank out remaining missing values
data_test['sale'] = data_test['sale'].apply(lambda flag: "Распродажа!" if flag else "")
data_test = data_test.fillna(value='')
# One document string per product: "<shop title>: <title>. <text fields>. <sale>. "
data_test['Document'] = [
    f"{shop_title!s}: {title!s}. {text_fields!s}. {sale!s}. "
    for title, shop_title, text_fields, sale in zip(
        data_test['title'], data_test['shop_title'],
        data_test['text_fields'], data_test['sale'])
]
# Keep only the combined document (plus the untouched columns) with a clean 0..n-1 index
data_test = data_test.drop(columns=['text_fields', 'shop_title', 'sale', 'title']).reset_index(drop=True)

#### Классы датасета и модели. 
Модель и датасет позволяют использовать как саму модель CLIP (собственную или с Hugging Face), так и готовые эмбеддинги CLIP (ранее сгенерированные и сохранённые в массив NumPy).

In [5]:
# =========================================================================================
# Dataset
# =========================================================================================
class stacked_dataset(Dataset):
    """Dataset yielding ``(text_inputs, image_inputs, class_index)`` per product.

    The image part is either a row of a precalculated embedding array or, when no
    embeddings are given, the CLIP-preprocessed pixels of ``<product_id>.jpg``.
    """

    def __init__(self, cfg, documents:list, targets: list,
                 id2cls: dict, images_folder: str,
                 product_ids:list, clip_embeddings=None):
        """
        Args:
            cfg: Configuration object providing `bert_tokenizer`, `max_length`
                and (for raw images) `clip_processor`.
            documents: Text document of every sample.
            targets: Category id of every sample; looked up in `id2cls`.
            id2cls: Mapping from category id to class index.
            images_folder: Directory containing ``<product_id>.jpg`` files.
            product_ids: Product id of every sample (used for the image file name).
            clip_embeddings: Optional indexable of precalculated image embeddings,
                aligned with `documents`. When None, images are read from disk.
        """
        self.cfg = cfg
        self.data = documents
        self.targets = targets
        self.id2cls = id2cls
        self.images_folder = images_folder
        self.product_ids = product_ids
        # Always define the attribute so it can be inspected in both modes.
        self.clip_embeddings = clip_embeddings
        self.use_precalculated_clip_embs = clip_embeddings is not None

    def __len__(self):
        """Number of samples (one per document)."""
        return len(self.data)

    def __getitem__(self, item):
        """Return the tokenised text, the image input and the class index of sample `item`."""
        if self.use_precalculated_clip_embs:
            # Add a leading axis of size 1; the collate function takes element 0 again.
            image_inputs = self.clip_embeddings[item][None, :]
        else:
            image_path = os.path.join(self.images_folder, str(self.product_ids[item]) + '.jpg')
            # Image.open is lazy and keeps the file open; the context manager
            # closes the handle as soon as the processor has read the pixels.
            with Image.open(image_path) as image:
                image_inputs = self.cfg.clip_processor(
                    text=None,
                    images=image,
                    return_tensors='pt'
                )['pixel_values']
        # Tokenise the document for BERT; padding is left to the collate function.
        text_inputs = self.cfg.bert_tokenizer(
            self.data[item],
            return_tensors=None,
            add_special_tokens=True,
            max_length=self.cfg.max_length,
            truncation=True
        )

        return text_inputs, image_inputs, self.id2cls[self.targets[item]]

# =========================================================================================
# Classifier model
# =========================================================================================
class STACKED_CLF(nn.Module):
    """Classifier on top of concatenated BERT text and CLIP image embeddings.

    Image embeddings come either from a CLIP model owned by this module or are
    passed in precalculated, depending on whether `cfg.clip_embeddings` is set.
    """

    def __init__(self, cfg, n_classes, img_emb_mult=0.001):
        """
        Args:
            cfg: Configuration object; reads `clip_embeddings`, `bert_model`,
                `bert_cut_emb`, `clip_cut_emb` and, when CLIP has to be run,
                `clip_model` (falling back to the legacy `model` attribute).
            n_classes: Number of output classes.
            img_emb_mult: Factor the attended image embedding is multiplied by.
        """
        super().__init__()
        # Configurations, CLIP and BERT models loading
        self.cfg = cfg
        self.img_emb_mult = img_emb_mult
        # Set in both modes: forward() reads this flag unconditionally.
        self.use_precalculated_clip_embs = cfg.clip_embeddings is not None
        if not self.use_precalculated_clip_embs:
            # The configuration names the CLIP checkpoint `clip_model`; `model`
            # is kept as a fallback for configurations using the older name.
            clip_name = getattr(cfg, 'clip_model', None) or cfg.model
            self.clip_config = AutoConfig.from_pretrained(clip_name)
            self.clip_model = CLIPModel.from_pretrained(clip_name)
        self.bert_config = AutoConfig.from_pretrained(cfg.bert_model)
        self.bert_model = AutoModel.from_pretrained(cfg.bert_model, config = self.bert_config)
        self.bert_pool = MeanPooling()
        # Width of the text part: full BERT hidden size unless it is cut
        if cfg.bert_cut_emb is None:
            text_dim = self.bert_model.config.hidden_size
        else:
            text_dim = cfg.bert_cut_emb
        self.hidden_dim = text_dim + cfg.clip_cut_emb
        # Attentions (defined in utils; argument meaning is presumably
        # (conditioning width, attended width) - TODO confirm)
        self.attention_clip = Attention(text_dim, cfg.clip_cut_emb)
        self.attention_bert = Attention(cfg.clip_cut_emb, text_dim)
        # Classifier
        self.bn = nn.BatchNorm1d(self.hidden_dim)
        self.clf = nn.Linear(self.hidden_dim, n_classes)

    def forward(self, text_inputs, image_inputs):
        """Return class logits for a batch.

        Args:
            text_inputs: Tokenizer output; unpacked into the BERT model and must
                contain `attention_mask`.
            image_inputs: Precalculated image embeddings, or pixel values for
                CLIP when the module owns a CLIP model.
        """
        # Get BERT embeddings from text
        text_emb = self.bert_model(**text_inputs)
        text_emb = self.bert_pool(text_emb.last_hidden_state, text_inputs['attention_mask'])
        # Get CLIP embeddings from pictures
        if self.use_precalculated_clip_embs:
            img_emb = image_inputs
        else:
            img_emb = self.clip_model.get_image_features(pixel_values=image_inputs)
        # Keep only the leading dimensions (a None cut keeps everything)
        text_emb = text_emb[:, :self.cfg.bert_cut_emb]
        img_emb = img_emb[:, :self.cfg.clip_cut_emb]
        # Apply attentions; the image branch is scaled down as regularisation
        img_emb = self.attention_clip(text_emb, img_emb) * self.img_emb_mult
        text_emb = self.attention_bert(img_emb, text_emb)
        # Concatenate BERT and CLIP embeddings
        emb = torch.cat([text_emb, img_emb], dim=1).float()
        # Classifier
        cls = self.clf(self.bn(emb))
        return cls

#### Функция collate_fn и даталоадер:

In [6]:
# Pads the tokenised texts of a batch to the longest sequence in that batch.
transformers_collator = DataCollatorWithPadding(tokenizer=CFG.bert_tokenizer, padding='longest')

def custom_collate(batch):
    """Combine ``(text_inputs, image_inputs, target)`` samples into one batch.

    Texts are padded by `transformers_collator`; image inputs and targets are
    stacked with the default PyTorch collate function.
    """
    texts = [sample[0] for sample in batch]
    # Element 0 of each image input is used (the dataset adds a leading axis of
    # size 1 to precalculated embeddings).
    images = [sample[1][0] for sample in batch]
    targets = [sample[2] for sample in batch]
    padded_texts = transformers_collator(texts)
    return padded_texts, default_collate(images), default_collate(targets)


def _make_loader(frame, targets, images_folder, embeddings, batch_size, shuffle):
    """Build a DataLoader over `stacked_dataset` with the settings shared by all splits.

    Args:
        frame: DataFrame with `Document` and `product_id` columns.
        targets: Category id per row of `frame`.
        images_folder: Folder holding the images of this split.
        embeddings: Precalculated CLIP embeddings aligned with `frame`, or None.
        batch_size: Batch size of the loader.
        shuffle: Whether to reshuffle the data every epoch.
    """
    dataset = stacked_dataset(
        CFG, documents=frame.Document.tolist(), targets=targets,
        id2cls=id2cls, images_folder=images_folder,
        product_ids=frame.product_id.tolist(), clip_embeddings=embeddings)
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=custom_collate,
        num_workers=CFG.num_workers,
        pin_memory=True,
        drop_last=False
    )


train_loader = _make_loader(data, data.category_id.tolist(), IMAGES_FOLDER,
                            clip_embeddings, batch_size=256, shuffle=True)

valid_loader = _make_loader(data_valid, data_valid.category_id.tolist(), IMAGES_FOLDER,
                            clip_embeddings_valid, batch_size=512, shuffle=False)

full_loader = _make_loader(data_full, data_full.category_id.tolist(), IMAGES_FOLDER,
                           CFG.clip_embeddings, batch_size=512, shuffle=False)

# The test table has no labels, but the dataset looks every target up in `id2cls`.
# Use a category id that is guaranteed to be a key instead of a hard-coded number.
placeholder_target = cls2id[0]
test_loader = _make_loader(data_test, [placeholder_target] * len(data_test), IMAGES_FOLDER_TEST,
                           CFG.clip_embeddings_test, batch_size=256, shuffle=False)

#### Train loop.

In [7]:
def _evaluate(model, loader, loss_fn, max_batches):
    """Score `model` on at most `max_batches` batches of `loader`.

    Returns:
        Tuple ``(weighted_f1, mean_loss)``. The model is left in eval mode.
    """
    model.eval()
    ground_truth, predicted = [], []
    loss_sum = 0.
    with torch.no_grad():
        for eval_step, eval_batch in enumerate(loader):
            if eval_step >= max_batches:
                break
            text_input = eval_batch[0].to(device)
            image_input = eval_batch[1].to(device)
            target = eval_batch[2].to(device)
            output = model(text_input, image_input)
            loss_sum += loss_fn(output, target).item()
            ground_truth.append(target.cpu())
            predicted.append(output.argmax(dim=1).cpu())
    if not ground_truth:
        # Nothing to score (empty loader)
        return 0., float('nan')
    weighted_f1 = f1_score(np.concatenate(ground_truth), np.concatenate(predicted), average='weighted')
    return weighted_f1, loss_sum / len(ground_truth)


def train(model, train_loader, test_loader, 
          epochs=20, lr=0.0001, checkpoint_period=None, weight_decay=1e-4,
          warmup_epochs=2, gamma=0.93, verbose=True, checkpoint_path=None):
    """Train `model`, evaluating periodically and saving the best weights.

    Args:
        model: Module called as ``model(text_input, image_input)`` returning logits.
        train_loader: Loader of ``(text_inputs, image_inputs, targets)`` training batches.
        test_loader: Loader used for the periodic evaluation.
        epochs: Number of passes over `train_loader`.
        lr: Learning rate reached once the warm-up is over.
        checkpoint_period: Number of training steps between evaluations; the
            scheduler is stepped at the same moments. Defaults to one epoch.
        weight_decay: AdamW weight decay.
        warmup_epochs: Number of scheduler steps of linear warm-up.
        gamma: Exponential decay factor applied at every scheduler step.
        verbose: Print losses, F1 and learning rate and show the inner progress bar.
        checkpoint_path: Where the best weights are saved. Defaults to
            ``checkpoints/best_stack.pt`` under WORKDIR.

    Returns:
        Tuple ``(max_f1, best_epoch)`` of the best weighted F1 and its epoch.
    """
    if checkpoint_path is None:
        checkpoint_path = os.path.join(WORKDIR, 'checkpoints/best_stack.pt')
    checkpoint_dir = os.path.dirname(checkpoint_path)
    if checkpoint_dir:
        # torch.save does not create missing directories
        os.makedirs(checkpoint_dir, exist_ok=True)

    # The base rate is raised by gamma**-warmup_epochs so that the exponential
    # decay applied during the warm-up steps lands exactly on `lr` afterwards.
    opt = AdamW(model.parameters(), lr=lr * gamma ** -warmup_epochs, weight_decay=weight_decay)
    
    model.to(device)
    loss_fn = torch.nn.CrossEntropyLoss(label_smoothing=0.01)   
    opt.zero_grad() 
    torch.cuda.empty_cache()
    scheduler = ChainedScheduler([LinearLR(opt, start_factor=0.02, total_iters=warmup_epochs),
                                  ExponentialLR(opt, gamma=gamma)])
    if verbose:
        print(f'Lr: {scheduler.get_last_lr()[0]:.9f}')
    if checkpoint_period is None:
        checkpoint_period = len(train_loader)
    # Evaluate on half as many batches as a training window, but at least one
    eval_batches = max(1, checkpoint_period // 2)
    
    max_f1 = 0.
    best_epoch = 0
    
    for epoch in tqdm(range(1, epochs+1, 1)):
        # TRAIN
        model.train()
        loss_avg = 0.
        if verbose:
            print(f'\n Epoch={epoch}')
        for step, batch in tqdm(enumerate(train_loader), total=len(train_loader), disable = not verbose):
            text_input = batch[0].to(device)
            image_input = batch[1].to(device)
            target = batch[2].to(device)
            output = model(text_input, image_input)
            loss = loss_fn(output, target)
            loss.backward()
            opt.step()
            opt.zero_grad()
            loss_avg += loss.item() / checkpoint_period
            if step % checkpoint_period == checkpoint_period - 1:
                if verbose:
                    print(f'Step={step+1}, Train loss={loss_avg:.6f}')
                loss_avg = 0.
                # EVALUATE
                weighted_f1, eval_loss = _evaluate(model, test_loader, loss_fn, eval_batches)
                
                if weighted_f1 > max_f1:
                    max_f1 = weighted_f1
                    best_epoch = epoch
                    torch.save(model.state_dict(), checkpoint_path)

                if verbose:
                    print(f"F1={weighted_f1:.5f}")
                    print(f'Eval loss={eval_loss:.5f}\n')
                model.train()
                scheduler.step()
                if verbose:
                    print(f'Lr: {scheduler.get_last_lr()[0]:.9f}')
    return max_f1, best_epoch

#### Создание модели.

In [8]:
# One output per distinct category; the image branch is scaled by 0.001.
n_classes = len(cls2id)
model = STACKED_CLF(CFG, n_classes, img_emb_mult=0.001)
model = model.to(device)
# Optionally start from weights supplied through the configuration.
initial_weights = CFG.state_dict
if initial_weights is not None:
    model.load_state_dict(initial_weights)
# Release cached GPU memory and unreachable Python objects before training.
torch.cuda.empty_cache()
gc.collect()

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


36

#### Обучение модели.

In [11]:
# Stage 1: short run (3 epochs) at the higher learning rate.
train(model, train_loader, valid_loader,
      epochs=3, lr=0.00025, warmup_epochs=2, gamma=0.93)
# Stage 2: longer run (20 epochs) at a slightly lower learning rate.
# Each call tracks its own best F1, starting again from zero.
train(model, train_loader, valid_loader,
      epochs=20, lr=0.0002, warmup_epochs=2, gamma=0.93)

Lr: 0.000004625


  0%|          | 0/20 [00:00<?, ?it/s]


 Epoch=1


  0%|          | 0/348 [00:00<?, ?it/s]

Step=348, Train loss=3.607946
F1=0.58541
Eval loss=2.91270

Lr: 0.000109677

 Epoch=2


  0%|          | 0/348 [00:00<?, ?it/s]

Step=348, Train loss=0.621298
F1=0.89366
Eval loss=0.51924

Lr: 0.000200000

 Epoch=3


  0%|          | 0/348 [00:00<?, ?it/s]

Step=348, Train loss=0.226212
F1=0.90193
Eval loss=0.48003

Lr: 0.000186000

 Epoch=4


  0%|          | 0/348 [00:00<?, ?it/s]

Step=348, Train loss=0.171213
F1=0.90349
Eval loss=0.47422

Lr: 0.000172980

 Epoch=5


  0%|          | 0/348 [00:00<?, ?it/s]

Step=348, Train loss=0.154754
F1=0.90876
Eval loss=0.48116

Lr: 0.000160871

 Epoch=6


  0%|          | 0/348 [00:00<?, ?it/s]

Step=348, Train loss=0.147399
F1=0.91401
Eval loss=0.48647

Lr: 0.000149610

 Epoch=7


  0%|          | 0/348 [00:00<?, ?it/s]

Step=348, Train loss=0.143692
F1=0.91064
Eval loss=0.49526

Lr: 0.000139138

 Epoch=8


  0%|          | 0/348 [00:00<?, ?it/s]

Step=348, Train loss=0.141393
F1=0.90807
Eval loss=0.50745

Lr: 0.000129398

 Epoch=9


  0%|          | 0/348 [00:00<?, ?it/s]

Step=348, Train loss=0.140379
F1=0.90852
Eval loss=0.50873

Lr: 0.000120340

 Epoch=10


  0%|          | 0/348 [00:00<?, ?it/s]

Step=348, Train loss=0.138634
F1=0.90846
Eval loss=0.51588

Lr: 0.000111916

 Epoch=11


  0%|          | 0/348 [00:00<?, ?it/s]

Step=348, Train loss=0.137302
F1=0.90909
Eval loss=0.51885

Lr: 0.000104082

 Epoch=12


  0%|          | 0/348 [00:00<?, ?it/s]

KeyboardInterrupt: 

### SUBMIT

#### Загружаем лучший чекпоинт модели.

In [12]:
# map_location moves the saved tensors onto the current device, so a checkpoint
# written on a GPU machine can also be loaded where only the CPU is available.
model.load_state_dict(torch.load(os.path.join(WORKDIR, 'checkpoints/best_stack.pt'),
                                 map_location=device))

<All keys matched successfully>

Функция для проверки и генерации ответов (на всякий случай проверяем, что метрики соответствуют ожидаемым метрикам лучшего чекпоинта).

In [13]:
def get_predictions_eval(model, test_loader, compute_f1=True) -> np.ndarray:
    """Predict a class index for every sample of `test_loader`.

    Args:
        model: Module called as ``model(text_input, image_input)`` returning logits.
        test_loader: Loader of ``(text_inputs, image_inputs, targets)`` batches.
        compute_f1: Print the weighted F1 against the loader's targets. Pass
            False when the targets are placeholders (unlabeled data).

    Returns:
        1-D array with the predicted class index of every sample, in loader
        order (empty when the loader yields no batches).
    """
    model.eval()
    predicted = []
    ground_truth = []
    with torch.no_grad():
        for batch in tqdm(test_loader, total=len(test_loader)):
            text_input = batch[0].to(device)
            image_input = batch[1].to(device)
            output = model(text_input, image_input)
            predicted.append(output.argmax(dim=1).cpu())
            # Targets are only needed for the metric, so they never go to the device
            ground_truth.append(batch[2])
    if not predicted:
        # np.concatenate raises on an empty list
        return np.array([], dtype=np.int64)
    predictions = np.concatenate(predicted)
    if compute_f1:
        weighted_f1 = f1_score(np.concatenate(ground_truth), predictions, average='weighted')
        print(f"F1={weighted_f1:.5f}")
    return predictions

Делаем проверки:

In [None]:
# Predictions (and the printed F1) for the whole labeled training table.
predictions = get_predictions_eval(model=model, test_loader=full_loader)

  0%|          | 0/178 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fbf133a5ee0>
Traceback (most recent call last):
  File "/home/ubuntu/anaconda/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1466, in __del__
    self._shutdown_workers()
  File "/home/ubuntu/anaconda/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1449, in _shutdown_workers
    if w.is_alive():
  File "/home/ubuntu/anaconda/lib/python3.9/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fbf133a5ee0>

Exception ignored in: AssertionErrorTraceback (most recent call last):
<function _MultiProcessingDataLoaderIter.__del__ at 0x7fbf133a5ee0>
  File "/home/ubuntu/anaconda/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1466, in __del__
    self._shutdown_workers()
  File "/home/ubuntu/anaconda/lib/python3.9/sit

In [None]:
# Set the image-embedding factor to 0.001 explicitly, then print the F1 on the
# validation split; the returned predictions are not needed here.
model.img_emb_mult = 0.001
_ = get_predictions_eval(model=model, test_loader=valid_loader)

Генерируем предсказания для тестовой выборки:

In [None]:
# The test loader carries placeholder targets, so the F1 printed by this call
# is not meaningful; only the returned predictions are used.
predictions_test = get_predictions_eval(model=model, test_loader=test_loader)

Смотрим, что получилось:

In [None]:
# Translate predicted class numbers back to category ids with one array lookup.
data_full['predictions'] = cls2id[predictions]

In [None]:
# Rows whose predicted category equals the true one.
data_full.loc[data_full['category_id'] == data_full['predictions']]

In [None]:
# Rows whose predicted category differs from the true one.
data_full.loc[data_full['category_id'] != data_full['predictions']]

Смотрим, что получили на тестовой выборке (проверяем адекватность предикта глазами):

In [None]:
# Class numbers -> category ids, then category ids -> readable category names.
predicted_ids = cls2id[predictions_test]
data_test['predicted_category_id'] = predicted_ids
data_test['predicted_category'] = [id2category[category_id] for category_id in predicted_ids]
# Keep a CSV copy for inspecting the predictions by eye.
data_test.to_csv('result_watch.csv')
data_test

Готовим данные для сохранения в соответствии с требованиями задания:

In [None]:
# Remove the helper columns that are not part of the submission.
# errors='ignore' keeps the cell re-runnable: an in-place drop raises KeyError
# the second time because the columns are already gone.
data_test = data_test.drop(columns=['Document', 'predicted_category'], errors='ignore')
data_test

In [None]:
# Write the submission table to result.parquet in the current working directory.
data_test.to_parquet('result.parquet')

Проверим, как таблица сохранилась:

In [None]:
# Read the file back and display it to confirm it was written as expected.
pd.read_parquet('result.parquet')