In [1]:
import os
import re
import gc
import json
import random
import multiprocessing
import warnings
from tqdm.auto import tqdm

from PIL import Image
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.optim import Adam, AdamW
from torch.optim.lr_scheduler import ChainedScheduler, LinearLR, ExponentialLR
from torch.nn import CrossEntropyLoss
from torch.utils.data import default_collate

from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import DataCollatorWithPadding
from transformers import CLIPProcessor, CLIPModel, CLIPTokenizerFast

def seed_everything(seed=42, deterministic=False):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer seed passed to `random`, NumPy, `torch.manual_seed` and
            `torch.cuda.manual_seed`; also exported as ``PYTHONHASHSEED``.
        deterministic: Value assigned to ``torch.backends.cudnn.deterministic``.
    """
    # Environment values must be strings, hence the explicit conversion.
    os.environ["PYTHONHASHSEED"] = str(seed)
    # The individual seeders are independent of each other, so apply them in one pass.
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(seed)
    # cuDNN autotuning is always disabled; determinism is left to the caller.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = deterministic

#### Основные настройки: seed, модель, рабочий каталог, warnings.

In [19]:
# Global seed, reused by seed_everything() below and by the train/validation split.
SEED = 42
# Project root; data, image and checkpoint paths in this notebook are joined onto it.
# NOTE(review): the leading '//' looks unintentional - confirm it resolves as intended.
WORKDIR = '//home/ubuntu/gitrepo/KazanExpress/2/'
# Folders with the product images ('<product_id>.jpg') for the train and test sets.
IMAGES_FOLDER = os.path.join(WORKDIR, 'row_data/images/train/')
IMAGES_FOLDER_TEST = os.path.join(WORKDIR, 'row_data/images/test/')
# Silence all Python warnings and the transformers advisory warnings.
warnings.filterwarnings("ignore")
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'
seed_everything(SEED)

# Set for the kernel process before any DataLoader workers are started.
%env TOKENIZERS_PARALLELISM=false

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device: ", device)
print('CPU cores: ', multiprocessing.cpu_count())

# =========================================================================================
# Configurations
# =========================================================================================
class CFG:
    """Shared configuration namespace.

    The class object itself is passed around (``cfg=CFG``); attributes are read
    straight off the class. The tokenizer and processor are loaded once, when
    this class body is executed.
    """
    # Number of DataLoader worker processes: one per reported CPU core.
    num_workers = multiprocessing.cpu_count()
    # Hugging Face identifier of the pretrained CLIP checkpoint.
    model = "openai/clip-vit-large-patch14" 
    # Tokenizer matching `model`.
    tokenizer = CLIPTokenizerFast.from_pretrained(model)
    # Processor matching `model`; doc_dataset uses it to build pixel_values.
    processor = CLIPProcessor.from_pretrained(model)
    # Optional state dict loaded into the classifier right after construction; None skips loading.
    state_dict = None 
    # NOTE(review): presumably the maximum text token length - confirm against its consumer.
    max_length = 77

env: TOKENIZERS_PARALLELISM=false
Device:  cuda
CPU cores:  8


#### Преобразование входных данных.

In [3]:
# Read the training table from parquet.
data_full = pd.read_parquet(os.path.join(WORKDIR, 'row_data/train.parquet'))
# Drop the columns this notebook does not use (single non-mutating call).
data_full = data_full.drop(
    columns=['shop_id', 'rating', 'text_fields', 'shop_title', 'sale']
).reset_index(drop=True)
# The stratified split below needs at least two rows per class, so rows of
# categories that occur only once are duplicated.
category_counts = data_full.category_id.value_counts()
dup_ids = set(category_counts[category_counts < 2].index)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
# ignore_index=True keeps the index unique after the duplicated rows are added.
data_full = pd.concat(
    [data_full, data_full[data_full['category_id'].isin(dup_ids)]],
    ignore_index=True,
)
# Train/validation split, stratified by category.
data, data_valid = train_test_split(data_full, test_size=0.025, random_state=SEED,
                                    shuffle=True, stratify=data_full.category_id)
data = data.reset_index(drop=True)
data_valid = data_valid.reset_index(drop=True)
# Map the raw category ids onto contiguous class indices.
# Note the names: cls2id[i] is the raw category_id of class index i, while
# id2cls maps a raw category_id to its class index.
cls2id = data_full.category_id.unique()
id2cls = {k : v for v, k in enumerate(cls2id)}

#### Классы датасета и модели. 

In [4]:
# =========================================================================================
# Dataset
# =========================================================================================
class doc_dataset(Dataset):
    """Dataset yielding ``(pixel_values, class_index)`` pairs for product images.

    Args:
        targets: Raw category id of every sample.
        id2cls: Mapping from a raw category id to its class index.
        images_folder: Folder containing ``<product_id>.jpg`` files.
        product_ids: Product id of every sample, aligned with ``targets``.
        cfg: Configuration object exposing a ``processor`` callable that turns
            an image into a dict with a ``'pixel_values'`` entry.

    Raises:
        ValueError: If ``targets`` and ``product_ids`` differ in length.
    """
    def __init__(self, targets: list, id2cls: dict, images_folder: str, product_ids:list, cfg):
        if len(targets) != len(product_ids):
            raise ValueError(
                f'targets and product_ids must have the same length, '
                f'got {len(targets)} and {len(product_ids)}'
            )
        self.cfg = cfg
        self.targets = targets
        self.id2cls = id2cls
        self.images_folder = images_folder
        self.product_ids = product_ids

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, item):
        image_path = os.path.join(self.images_folder, str(self.product_ids[item]) + '.jpg')
        # The context manager closes the image file deterministically, even if
        # preprocessing raises inside a DataLoader worker.
        with Image.open(image_path) as image:
            image_inputs = self.cfg.processor(
                text=None,
                images=image,
                return_tensors='pt'
            )['pixel_values'][0]
        return image_inputs, self.id2cls[self.targets[item]]

# =========================================================================================
# Image classification model
# =========================================================================================
class CLIP_CLF(nn.Module):
    """CLIP image encoder followed by a single linear classification layer.

    Args:
        cfg: Configuration object; ``cfg.model`` is the pretrained CLIP identifier.
        n_classes: Number of output classes of the linear head.
        cut_emb: If given, only the first ``cut_emb`` components of the CLIP
            image features are kept and fed to the classifier.
        emb_act, bottleneck_size, bottleneck_act: Accepted for interface
            compatibility; not used by this implementation.

    Raises:
        ValueError: If ``cut_emb`` is not within ``1..embedding width``.
    """
    def __init__(self, cfg, n_classes, cut_emb=None, emb_act=None, bottleneck_size=None, bottleneck_act=None):
        super().__init__()
        # Configurations, CLIP model loading
        self.cfg = cfg
        self.config = AutoConfig.from_pretrained(cfg.model)
        self.model = CLIPModel.from_pretrained(cfg.model)
        full_dim = self._embedding_dim(self.model.config)
        # Truncate CLIP embeddings for dimension reduction
        if cut_emb is not None:
            if not 0 < cut_emb <= full_dim:
                raise ValueError(f'cut_emb must be in 1..{full_dim}, got {cut_emb}')
            self.in_size = cut_emb
            self.cut_emb = True
        else:
            self.in_size = full_dim
            self.cut_emb = False
        # Classifier layer
        self.clf = nn.Linear(self.in_size, n_classes)

    @staticmethod
    def _embedding_dim(config):
        """Return the width of the features produced by ``get_image_features``.

        The composite CLIP config exposes this as ``projection_dim`` and has no
        top-level ``hidden_size``; ``hidden_size`` is only used as a fallback
        for configs that lack ``projection_dim``.
        """
        dim = getattr(config, 'projection_dim', None)
        if dim is None:
            dim = config.hidden_size
        return dim

    def forward(self, image_inputs):
        """Return class logits for a batch of preprocessed images."""
        # Reuse get_emb so both entry points always see the same embedding.
        return self.clf(self.get_emb(image_inputs))

    def get_emb(self, image_inputs):
        """Return the (optionally truncated) CLIP image features for a batch."""
        emb = self.model.get_image_features(pixel_values=image_inputs)
        # Truncate CLIP embeddings for dimension reduction
        if self.cut_emb:
            emb = emb[:, :self.in_size]
        return emb

#### Даталоадеры для обучения и валидации:

In [5]:
# Options that are identical for the training and the validation loader.
_shared_loader_options = dict(
    batch_size=24,
    num_workers=CFG.num_workers,
    pin_memory=True,
    drop_last=False,
)

# Training split: sample order is reshuffled.
train_loader = DataLoader(
    doc_dataset(
        data.category_id.tolist(),
        id2cls,
        IMAGES_FOLDER,
        product_ids=data.product_id.tolist(),
        cfg=CFG,
    ),
    shuffle=True,
    **_shared_loader_options,
)

# Validation split: fixed order.
valid_loader = DataLoader(
    doc_dataset(
        data_valid.category_id.tolist(),
        id2cls,
        IMAGES_FOLDER,
        product_ids=data_valid.product_id.tolist(),
        cfg=CFG,
    ),
    shuffle=False,
    **_shared_loader_options,
)

Train loop

In [6]:
def _evaluate_subset(model, loader, loss_fn, max_batches):
    """Score `model` on the first `max_batches` batches of `loader`.

    Leaves the model in eval mode; the caller restores train mode.

    Returns:
        Tuple ``(weighted_f1, mean_loss)`` over the evaluated batches.

    Raises:
        ValueError: If `loader` yields no batches.
    """
    model.eval()
    ground_truth, predicted = [], []
    loss_sum = 0.
    with torch.no_grad():
        for eval_step, eval_batch in enumerate(loader):
            image_input = eval_batch[0].to(device)
            target = eval_batch[1].to(device)
            output = model(image_input)
            loss_sum += loss_fn(output, target).item()
            ground_truth.append(target.cpu())
            predicted.append(output.argmax(dim=1).cpu())
            # Stop after the batch is processed so no extra batch is consumed.
            if eval_step + 1 >= max_batches:
                break
    if not ground_truth:
        raise ValueError('test_loader yielded no batches to evaluate on')
    weighted_f1 = f1_score(np.concatenate(ground_truth), np.concatenate(predicted), average='weighted')
    return weighted_f1, loss_sum / len(ground_truth)


def train(model, train_loader, test_loader, optimizer=None,
          epochs=2, lr=0.0001, checkpoint_period=None, 
          warmup_epochs=2, gamma=0.925, verbose=True):
    """Train `model`, evaluating and checkpointing every `checkpoint_period` steps.

    At every checkpoint the weights are saved to ``checkpoints/checkpoint.pt``
    under WORKDIR, the model is scored on the first ``checkpoint_period // 3``
    batches of `test_loader` (at least one), and the weights are additionally
    saved to ``checkpoints/best.pt`` whenever the weighted F1 improves.

    Args:
        model: Module mapping an image batch to class logits.
        train_loader: Loader yielding ``(images, targets)`` training batches.
        test_loader: Loader yielding ``(images, targets)`` evaluation batches.
        optimizer: Optimizer to use; when None an Adam optimizer is created.
        epochs: Number of passes over `train_loader`.
        lr: Learning rate reached at the end of warm-up (default optimizer only).
        checkpoint_period: Steps between checkpoints; defaults to one epoch.
        warmup_epochs: Number of scheduler steps of linear warm-up. The
            scheduler is stepped once per checkpoint, not once per epoch.
        gamma: Exponential decay factor applied at every scheduler step.
        verbose: Print learning rate, losses and F1, and show the step progress bar.

    Returns:
        Tuple ``(max_f1, best_epoch)``: best weighted F1 and the epoch it occurred in.

    Raises:
        ValueError: If `checkpoint_period` is smaller than 1, or `test_loader` is empty.
    """
    if optimizer is None:
        # The exponential decay is also applied during warm-up, so the base rate
        # is pre-scaled to land exactly on `lr` once warm-up has finished.
        opt = Adam(model.parameters(), lr=lr * gamma ** -warmup_epochs)
    else:
        opt = optimizer

    model.to(device)
    loss_fn = torch.nn.CrossEntropyLoss()
    opt.zero_grad()
    torch.cuda.empty_cache()
    scheduler = ChainedScheduler([LinearLR(opt, start_factor=0.1, total_iters=warmup_epochs),
                                  ExponentialLR(opt, gamma=gamma)])
    if verbose:
        print(f'Lr: {scheduler.get_last_lr()[0]:.9f}')
    if checkpoint_period is None:
        checkpoint_period = len(train_loader)
    if checkpoint_period < 1:
        raise ValueError(f'checkpoint_period must be >= 1, got {checkpoint_period}')
    # Evaluate on at least one batch; previously a period below 3 divided by zero.
    eval_batches = max(1, min(len(test_loader), checkpoint_period // 3))

    # Create the checkpoint folder up front so a missing directory cannot
    # abort the run at the first checkpoint.
    checkpoint_dir = os.path.join(WORKDIR, 'checkpoints')
    os.makedirs(checkpoint_dir, exist_ok=True)
    last_checkpoint_path = os.path.join(checkpoint_dir, 'checkpoint.pt')
    best_checkpoint_path = os.path.join(checkpoint_dir, 'best.pt')

    max_f1 = 0.
    best_epoch = 0

    for epoch in tqdm(range(1, epochs+1, 1)):
        # TRAIN
        model.train()
        # Running mean of the train loss over the current checkpoint window;
        # a partial window at the end of an epoch is discarded here.
        loss_avg = 0.
        if verbose:
            print(f'Epoch={epoch}')
        for step, batch in tqdm(enumerate(train_loader), total=len(train_loader), disable = not verbose):
            image_input = batch[0].to(device)
            target = batch[1].to(device)
            output = model(image_input)
            loss = loss_fn(output, target)
            loss.backward()
            opt.step()
            opt.zero_grad()
            loss_avg += loss.item() / checkpoint_period
            if step % checkpoint_period == checkpoint_period - 1:
                if verbose:
                    print(f'Step={step+1}, Train loss={loss_avg:.6f}')
                loss_avg = 0.
                torch.save(model.state_dict(), last_checkpoint_path)

                # EVAL (on a prefix of test_loader to keep checkpoints cheap)
                weighted_f1, eval_loss = _evaluate_subset(model, test_loader, loss_fn, eval_batches)
                if weighted_f1 > max_f1:
                    max_f1 = weighted_f1
                    best_epoch = epoch
                    torch.save(model.state_dict(), best_checkpoint_path)
                if verbose:
                    print(f"F1={weighted_f1:.5f}")
                    print(f'Eval loss={eval_loss:.5f}\n')
                model.train()
                scheduler.step()
                if verbose:
                    print(f'Lr: {scheduler.get_last_lr()[0]:.9f}')
    return max_f1, best_epoch

Инициализация модели.

In [7]:
# One output class per distinct category; only the first 40 CLIP features are kept.
model = CLIP_CLF(CFG, len(cls2id), cut_emb=40).to(device)
# Optionally start from weights supplied through the configuration.
if CFG.state_dict is not None:
    model.load_state_dict(CFG.state_dict)
# Free cached GPU memory and collect garbage before training starts.
torch.cuda.empty_cache()
gc.collect()

27

Обучаем модель:

In [8]:
# Evaluate and checkpoint every 600 steps. The scheduler is stepped once per
# checkpoint, so warmup_epochs=7 means seven checkpoints of warm-up.
train(model, train_loader, valid_loader, checkpoint_period=600,
      optimizer=None, warmup_epochs=7, epochs=20, lr=0.00001, gamma=0.96)

Lr: 0.000001331


  0%|          | 0/20 [00:00<?, ?it/s]

Epoch=1


  0%|          | 0/3702 [00:00<?, ?it/s]

Step=600, Train loss=5.987983
F1=0.20190
Eval loss=5.47663

Lr: 0.000002920
Step=1200, Train loss=4.966136
F1=0.32365
Eval loss=4.54845

Lr: 0.000004380
Step=1800, Train loss=4.223240
F1=0.41503
Eval loss=3.88285

Lr: 0.000005719
Step=2400, Train loss=3.709109
F1=0.47864
Eval loss=3.40966

Lr: 0.000006943
Step=3000, Train loss=3.297177
F1=0.51518
Eval loss=3.03627

Lr: 0.000008061
Step=3600, Train loss=3.010670
F1=0.54084
Eval loss=2.80314

Lr: 0.000009077
Epoch=2


  0%|          | 0/3702 [00:00<?, ?it/s]

Step=600, Train loss=2.421363
F1=0.57048
Eval loss=2.57067

Lr: 0.000010000
Step=1200, Train loss=2.301376
F1=0.59912
Eval loss=2.35871

Lr: 0.000009600
Step=1800, Train loss=2.179178
F1=0.61606
Eval loss=2.28136

Lr: 0.000009216
Step=2400, Train loss=2.097841
F1=0.62628
Eval loss=2.13938

Lr: 0.000008847
Step=3000, Train loss=1.944514
F1=0.64871
Eval loss=2.01130

Lr: 0.000008493
Step=3600, Train loss=1.876980
F1=0.65448
Eval loss=1.93320

Lr: 0.000008154
Epoch=3


  0%|          | 0/3702 [00:00<?, ?it/s]

Step=600, Train loss=1.166594
F1=0.68168
Eval loss=1.79811

Lr: 0.000007828
Step=1200, Train loss=1.134427
F1=0.68043
Eval loss=1.83395

Lr: 0.000007514
Step=1800, Train loss=1.131554
F1=0.68128
Eval loss=1.77260

Lr: 0.000007214
Step=2400, Train loss=1.085534
F1=0.68090
Eval loss=1.76795

Lr: 0.000006925
Step=3000, Train loss=1.062244
F1=0.69995
Eval loss=1.69791

Lr: 0.000006648
Step=3600, Train loss=1.045661
F1=0.69690
Eval loss=1.72919

Lr: 0.000006382
Epoch=4


  0%|          | 0/3702 [00:00<?, ?it/s]

Step=600, Train loss=0.522503
F1=0.72322
Eval loss=1.58967

Lr: 0.000006127
Step=1200, Train loss=0.490513
F1=0.72020
Eval loss=1.62255

Lr: 0.000005882
Step=1800, Train loss=0.511207
F1=0.71559
Eval loss=1.67291

Lr: 0.000005647
Step=2400, Train loss=0.509138
F1=0.71125
Eval loss=1.62463

Lr: 0.000005421
Step=3000, Train loss=0.490567
F1=0.70900
Eval loss=1.63183

Lr: 0.000005204
Step=3600, Train loss=0.486236
F1=0.71659
Eval loss=1.64753

Lr: 0.000004996
Epoch=5


  0%|          | 0/3702 [00:00<?, ?it/s]

Step=600, Train loss=0.192422
F1=0.72208
Eval loss=1.61230

Lr: 0.000004796
Step=1200, Train loss=0.175703
F1=0.72422
Eval loss=1.66686

Lr: 0.000004604
Step=1800, Train loss=0.200432
F1=0.72104
Eval loss=1.70886

Lr: 0.000004420
Step=2400, Train loss=0.187584
F1=0.71552
Eval loss=1.68251

Lr: 0.000004243
Step=3000, Train loss=0.205775
F1=0.72205
Eval loss=1.71206

Lr: 0.000004073
Step=3600, Train loss=0.195540
F1=0.72499
Eval loss=1.71598

Lr: 0.000003911
Epoch=6


  0%|          | 0/3702 [00:00<?, ?it/s]

Step=600, Train loss=0.082378
F1=0.72363
Eval loss=1.69552

Lr: 0.000003754
Step=1200, Train loss=0.073777
F1=0.72386
Eval loss=1.71780

Lr: 0.000003604
Step=1800, Train loss=0.079297
F1=0.72107
Eval loss=1.72934

Lr: 0.000003460
Step=2400, Train loss=0.077488
F1=0.71748
Eval loss=1.73873

Lr: 0.000003321
Step=3000, Train loss=0.092139
F1=0.71486
Eval loss=1.76727

Lr: 0.000003189
Step=3600, Train loss=0.078627
F1=0.72239
Eval loss=1.73462

Lr: 0.000003061
Epoch=7


  0%|          | 0/3702 [00:00<?, ?it/s]

KeyboardInterrupt: 

Выбираем лучший чекпоинт:

In [9]:
# NOTE: train() writes its best weights to 'checkpoints/best.pt'; this cell expects
# that file to have been copied or renamed to 'checkpoints/best_clip.pt' beforehand.
# map_location lets a checkpoint saved on a GPU also load on a CPU-only machine.
model.load_state_dict(torch.load(os.path.join(WORKDIR, 'checkpoints/best_clip.pt'), map_location=device))

<All keys matched successfully>

Функции для генерации и проверки эмбеддингов.

In [31]:
def get_full_embeddings_eval(model) -> np.ndarray:
    """Embed every row of `data_full` and report the classifier's weighted F1.

    Iterates over `data_full` in its stored order (no shuffling), collects the
    image embeddings from ``model.get_emb`` and, as a sanity check, scores the
    predictions of ``model.clf`` on those embeddings against the true classes.

    Args:
        model: Classifier exposing ``get_emb`` and ``clf``.

    Returns:
        Array with one embedding row per row of `data_full`.
    """
    full_loader = DataLoader(
        doc_dataset(data_full.category_id.tolist(),
                    id2cls, IMAGES_FOLDER, product_ids=data_full.product_id.tolist(), cfg=CFG),
        batch_size = 24,
        shuffle = False,
        num_workers = CFG.num_workers,
        pin_memory = True,
        drop_last = False
    )
    embeddings = []
    ground_truth = []
    predicted = []
    model.eval()
    with torch.no_grad():
        for batch in tqdm(full_loader, total=len(full_loader)):
            image_input = batch[0].to(device)
            emb = model.get_emb(image_input)
            # Predictions come from the classifier logits, not from the raw embedding.
            logits = model.clf(emb)
            # The returned array holds embeddings, as the function name promises.
            embeddings.append(emb.cpu().numpy())
            # Targets are only needed on the CPU, so they are never moved to the device.
            ground_truth.append(batch[1].cpu().numpy())
            predicted.append(logits.argmax(dim=1).cpu().numpy())
    weighted_f1 = f1_score(np.concatenate(ground_truth), np.concatenate(predicted), average='weighted')
    print(f"F1={weighted_f1:.5f}")
    return np.concatenate(embeddings)

def get_embeddings(model, test_loader) -> np.array:
    """Compute image embeddings for every batch of `test_loader`.

    Only the first element of each batch (the image tensor) is used; any
    targets in the batch are ignored.

    Args:
        model: Classifier exposing ``get_emb``; it is switched to eval mode.
        test_loader: Loader whose batches start with the image tensor.

    Returns:
        All batch embeddings concatenated along the first axis, in loader order.
    """
    model.eval()
    collected = []
    progress = tqdm(enumerate(test_loader), total=len(test_loader))
    for _, batch in progress:
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            pixel_batch = batch[0].to(device)
            batch_emb = model.get_emb(pixel_batch)
            collected.append(batch_emb.cpu().numpy())
    return np.concatenate(collected)

Проверяем размерности и воспроизводимость генерации.

In [None]:
# Embed the full training table with the loaded checkpoint; also prints the weighted F1.
embeddings_new = get_full_embeddings_eval(model)

  0%|          | 0/3797 [00:00<?, ?it/s]

In [12]:
# The row count should match data_full; the width should match cut_emb (40).
embeddings_new.shape

(91124, 40)

In [13]:
# Row count to compare against the embedding matrix above.
data_full.shape

(91124, 2)

In [15]:
# Reference embeddings loaded from disk.
# NOTE(review): this file is not written anywhere in the visible notebook - presumably saved by an earlier run; confirm.
embeddings_old = np.load('embeddings_clip.np.npy')

In [17]:
# Largest absolute element-wise difference; 0.0 means the embeddings were reproduced exactly.
# (The standard deviation of the difference would also be 0 for any constant offset.)
np.abs(embeddings_old - embeddings_new).max()

0.0

Читаем данные из тестового датасета

In [20]:
# Read the test table from parquet.
data_test = pd.read_parquet(os.path.join(WORKDIR, 'row_data/test.parquet'))
# Drop the columns this notebook does not use (single non-mutating call).
data_test = data_test.drop(
    columns=['shop_id', 'rating', 'text_fields', 'shop_title', 'sale']
).reset_index(drop=True)

# The test set has no labels, but doc_dataset looks every target up in id2cls.
# Use a category id that is guaranteed to be a valid key instead of a hard-coded
# one; get_embeddings() ignores the resulting label.
placeholder_category = cls2id[0]
test_loader = DataLoader(
    doc_dataset([placeholder_category] * len(data_test), id2cls,
                IMAGES_FOLDER_TEST, product_ids=data_test.product_id.tolist(), cfg=CFG),
    batch_size = 24,
    shuffle = False,
    num_workers = CFG.num_workers,
    pin_memory = True,
    drop_last = False
)

Генерируем эмбеддинги для тестового сета.

In [25]:
# Embed the test images in data_test order (the loader does not shuffle).
embeddings_test = get_embeddings(model, test_loader)

  0%|          | 0/703 [00:00<?, ?it/s]

In [26]:
# The row count should match data_test; the width should match cut_emb (40).
embeddings_test.shape

(16860, 40)

In [27]:
# Row count to compare against the embedding matrix above.
data_test.shape

(16860, 1)

Сохраняем:

In [28]:
# Persist the test embeddings in the current working directory.
# NOTE(review): np.save is expected to append '.npy', giving 'embeddings_clip_test.np.npy'
# (this matches the '.np.npy' name loaded for the train embeddings) - confirm.
np.save('embeddings_clip_test.np', embeddings_test)