In [1]:
import os
import gc
import random
import multiprocessing
import warnings

from tqdm.auto import tqdm
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from torch.optim.lr_scheduler import ChainedScheduler, LinearLR, ExponentialLR
from torch.nn import CrossEntropyLoss

from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import DataCollatorWithPadding

from utils import get_title, preprocess_text_field, MeanPooling

def seed_everything(seed=42, deterministic=False):
    """Seed every random number generator used by this notebook.

    Args:
        seed: Integer seed applied to `random`, NumPy and PyTorch (CPU and all
            visible CUDA devices); it is also exported as PYTHONHASHSEED.
        deterministic: Value assigned to `torch.backends.cudnn.deterministic`.
    """
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU rather than only the current one;
    # the call is silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = deterministic
    # Autotuning may pick different kernels between runs, so keep it off.
    torch.backends.cudnn.benchmark = False

#### Основные настройки: seed, модель, рабочий каталог, warnings.

In [2]:
# Global seed, reused for RNG seeding and for the train/validation split.
SEED = 42
# Root directory of the Kaggle dataset; later cells also write model files under it.
# NOTE(review): the leading double slash looks unintentional - confirm.
WORKDIR = '//kaggle/input/kazan-exress-1/'
# Silence Python warnings and the advisory messages emitted by transformers.
warnings.filterwarnings("ignore")
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'
# Seed all RNGs and request deterministic cuDNN behaviour.
seed_everything(SEED, deterministic=True)

%env TOKENIZERS_PARALLELISM=false

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device: ", device)
print('CPU cores: ', multiprocessing.cpu_count())

# =========================================================================================
# Configurations
# =========================================================================================
class CFG:
    """Shared configuration for the tokenizer, the encoder and the data loaders."""
    # Number of DataLoader worker processes: one per CPU core.
    num_workers = multiprocessing.cpu_count()
    # model = "sentence-transformers/all-mpnet-base-v2"
    # Checkpoint name used for both the tokenizer and the encoder.
    model = 'cointegrated/rubert-tiny2' # os.path.join(WORKDIR, 'LearningEquality/cosineloss_new32')
    # Loaded once, when the class body is executed.
    tokenizer = AutoTokenizer.from_pretrained(model)
    # Optional state dict loaded into the classifier after construction; None skips loading.
    state_dict = None
    # Maximum number of tokens per document; longer inputs are truncated.
    max_length = 256

env: TOKENIZERS_PARALLELISM=false
Device:  cuda
CPU cores:  2


#### Преобразование входных данных.

In [3]:
# Read the raw training table from parquet
data_full = pd.read_parquet("/kaggle/input/kazan-exress-1/row_data/train.parquet")
# Drop columns that are not used as model input
data_full = data_full.drop(columns=['shop_id', 'rating'])
# Extract the title first: it is read from the raw `text_fields` value,
# which is replaced by its preprocessed form on the next line
data_full['title'] = data_full.text_fields.apply(get_title)
data_full.text_fields = data_full.text_fields.apply(preprocess_text_field)
# Turn the "sale" flag into a text marker
# NOTE(review): a NaN flag is truthy and would also be marked as a sale -
# confirm that the column has no missing values
data_full['sale'] = data_full['sale'].apply(lambda x: "Распродажа!" if x else "")
data_full = data_full.fillna(value='')
# Concatenate everything into one string: "<shop_title>: <title>. <text_fields>. <sale>. "
data_full = data_full.assign(Document=[
    str(shop) + ': ' + str(title) + '. ' + str(text) + '. ' + str(sale) + '. '
    for title, shop, text, sale in zip(data_full['title'], data_full['shop_title'],
                                       data_full['text_fields'], data_full['sale'])
])

data_full = data_full.drop(columns=['text_fields', 'shop_title', 'sale', 'title']).reset_index(drop=True)
# Drop "product_id" column - only for train
data_full = data_full.drop(columns=['product_id'])
# Drop categories with a single sample: a stratified split needs at least two per class
category_counts = data_full.category_id.value_counts()
drop_ids = set(category_counts[category_counts < 2].index)
data_full = data_full[~data_full['category_id'].isin(drop_ids)]
# Train/validation split
data, data_valid = train_test_split(data_full, test_size=0.2, random_state=SEED,
                                    shuffle=True, stratify=data_full.category_id)
data = data.reset_index(drop=True)
data_valid = data_valid.reset_index(drop=True)
# Map raw category ids to contiguous class indices.
# NOTE: despite the names, `cls2id` is the array of category ids ordered by class
# index, and `id2cls` maps a category id to its class index.
cls2id = data_full.category_id.unique()
id2cls = {k : v for v, k in enumerate(cls2id)}

del data_full

#### Классы модели и датасета.

In [4]:
# =========================================================================================
# Prepare input, tokenize
# =========================================================================================
def prepare_input(text, cfg):
    """Tokenize `text` with `cfg.tokenizer`, truncating to `cfg.max_length` tokens.

    Special tokens are added; no tensor conversion and no padding are requested
    here. The tokenizer's result is returned unchanged.
    """
    tokenizer_kwargs = dict(
        return_tensors=None,
        add_special_tokens=True,
        max_length=cfg.max_length,
        truncation=True,
    )
    return cfg.tokenizer(text, **tokenizer_kwargs)

# =========================================================================================
# Dataset
# =========================================================================================
class doc_dataset(Dataset):
    """Map-style dataset yielding (tokenized document, class index) pairs."""

    def __init__(self, documents:list, targets: list, id2cls: dict, cfg):
        """Store the raw documents, their category ids and the tokenizer config.

        Args:
            documents: Document strings, one per sample.
            targets: Category id of each document, aligned with `documents`.
            id2cls: Lookup applied to a category id to obtain the class index.
            cfg: Configuration object handed to `prepare_input`.
        """
        self.data, self.targets = documents, targets
        self.id2cls = id2cls
        self.cfg = cfg

    def __len__(self):
        """Number of documents in the dataset."""
        return len(self.data)

    def __getitem__(self, item):
        """Tokenize document `item` lazily and look up its class index."""
        encoded = prepare_input(self.data[item], self.cfg)
        label = self.id2cls[self.targets[item]]
        return encoded, label

# =========================================================================================
# Supervised classification model (encoder + linear head)
# =========================================================================================
class BERT_CLF(nn.Module):
    """Transformer encoder with pooling and a linear classification head.

    Pipeline: encoder -> `MeanPooling` -> optional embedding truncation ->
    optional BatchNorm + activation -> optional bottleneck Linear + activation ->
    linear classifier.
    """
    def __init__(self, cfg, n_classes, cut_emb=None, emb_act=None, bottleneck_size=None, bottleneck_act=None):
        """
        Args:
            cfg: Configuration object; `cfg.model` names the pretrained checkpoint.
            n_classes: Number of outputs of the classification head.
            cut_emb: If given, only the first `cut_emb` embedding components are kept.
            emb_act: Activation applied to the embedding. When given, a BatchNorm1d
                is applied before it; when None, both steps are identities.
            bottleneck_size: If given, width of an extra Linear layer before the classifier.
            bottleneck_act: Activation after the bottleneck layer; None means no activation.

        Raises:
            ValueError: If `cut_emb` is not in the range [1, encoder hidden size].
        """
        super().__init__()
        self.cfg = cfg
        self.config = AutoConfig.from_pretrained(cfg.model)
        self.model = AutoModel.from_pretrained(cfg.model, config = self.config)
        self.pool = MeanPooling()
        hidden_size = self.model.config.hidden_size
        # Cutting of BERT embeddings
        if cut_emb is not None:
            # A larger value would only fail later with a shape mismatch in forward().
            if not 0 < cut_emb <= hidden_size:
                raise ValueError(f"cut_emb must be in [1, {hidden_size}], got {cut_emb}")
            self.in_size = cut_emb
            self.cut_emb = True
        else:
            self.in_size = hidden_size
            self.cut_emb = False
        # Activation for BERT embeddings
        if emb_act is None:
            self.emb_bn = nn.Identity()
            self.emb_act = nn.Identity()
        else:
            self.emb_bn = nn.BatchNorm1d(self.in_size)
            self.emb_act = emb_act
        # Additional bottleneck layer
        if bottleneck_size is None:
            bott_out_size = self.in_size
            self.bott_act = nn.Identity()
            self.bott_fc = nn.Identity()
        else:
            bott_out_size = bottleneck_size
            # Without this fallback a bottleneck with no activation stored None
            # here and forward() failed with "'NoneType' object is not callable".
            self.bott_act = bottleneck_act if bottleneck_act is not None else nn.Identity()
            self.bott_fc = nn.Linear(self.in_size, bott_out_size)

        # Classifier
        self.clf = nn.Linear(bott_out_size, n_classes)

    def forward(self, inputs):
        """Return the classifier outputs for a tokenized batch.

        Args:
            inputs: Mapping unpacked into the encoder call; it must contain
                'attention_mask', which is also passed to the pooling layer.
        """
        emb = self.pool(self.model(**inputs).last_hidden_state, inputs['attention_mask'])

        if self.cut_emb:
            # Keep only the leading `in_size` components of the pooled embedding.
            emb = emb[:, :self.in_size]

        x = self.emb_act(self.emb_bn(emb))
        x = self.bott_act(self.bott_fc(x))

        cls  = self.clf(x)
        return cls

#### Даталоадер и функция collate_fn

In [5]:
# Pads the tokenized documents of a batch to the longest one in that batch.
transformers_collator = DataCollatorWithPadding(tokenizer=CFG.tokenizer, padding='longest')

def custom_collate(batch):
    """Collate (tokenized document, class index) pairs.

    Returns the padded document batch produced by `transformers_collator`
    together with a tensor holding the class indices.
    """
    doc_batch = [pair[0] for pair in batch]
    targets_batch = [pair[1] for pair in batch]
    return transformers_collator(doc_batch), torch.tensor(targets_batch)

def _make_loader(frame, shuffle):
    """Build a DataLoader over the `Document` / `category_id` columns of `frame`."""
    return DataLoader(
        doc_dataset(frame.Document.tolist(), frame.category_id.tolist(), id2cls, CFG),
        batch_size=384,
        shuffle=shuffle,
        collate_fn=custom_collate,
        num_workers=CFG.num_workers,
        pin_memory=True,
        drop_last=False,
    )

# Training data is shuffled; the validation order stays fixed.
train_loader = _make_loader(data, shuffle=True)

valid_loader = _make_loader(data_valid, shuffle=False)

#### Train loop.

In [6]:
def _evaluate(model, loader, loss_fn, max_batches):
    """Score `model` on at most `max_batches` batches of `loader` (at least one).

    The model is switched to eval mode for scoring and back to train mode afterwards.

    Returns:
        Tuple (weighted F1, mean loss) over the evaluated batches.
    """
    # Never fewer than one batch, so the mean below cannot divide by zero.
    n_batches = max(1, min(len(loader), max_batches))
    ground_truth, predicted = [], []
    eval_loss = 0.
    model.eval()
    with torch.no_grad():
        for eval_step, eval_batch in enumerate(loader):
            inputs = eval_batch[0].to(device)
            target = eval_batch[1].to(device)
            output = model(inputs)
            eval_loss += loss_fn(output, target).item() / n_batches
            ground_truth.append(target.cpu())
            predicted.append(output.argmax(dim=1).cpu())
            # Stop after the last wanted batch without fetching another one.
            if eval_step + 1 >= n_batches:
                break
    model.train()
    weighted_f1 = f1_score(np.concatenate(ground_truth), np.concatenate(predicted), average='weighted')
    return weighted_f1, eval_loss


def train(model, train_loader, test_loader, weight_decay=1e-6,
          epochs=2, lr=0.0001, checkpoint_period=None, 
          warmup_epochs=3, gamma=0.925, verbose=True):
    """Train `model` with AdamW and periodically evaluate it on `test_loader`.

    Every `checkpoint_period` training steps the weights are written to
    '/kaggle/working/checkpoint.pt', the model is scored on up to
    `checkpoint_period // 2` validation batches, the best-scoring weights are
    written to `WORKDIR/best.pt`, and the LR scheduler advances by one step.

    Args:
        model: Module mapping a tokenized batch to class scores.
        train_loader: Loader yielding (inputs, targets) training batches.
        test_loader: Loader yielding (inputs, targets) validation batches.
        weight_decay: AdamW weight decay.
        epochs: Number of passes over `train_loader`.
        lr: Learning rate reached when the warm-up finishes.
        checkpoint_period: Steps between checkpoints; None means once per epoch.
        warmup_epochs: Number of scheduler steps of linear warm-up.
        gamma: Multiplicative learning-rate decay per scheduler step.
        verbose: Print per-checkpoint metrics and show the inner progress bar.

    Returns:
        Tuple (best weighted F1, epoch in which it was reached).
    """
    # The exponential decay is already active during warm-up, so start from a
    # higher base rate that decays to exactly `lr` when the warm-up ends.
    opt = AdamW(model.parameters(), lr=lr * gamma ** -warmup_epochs, weight_decay=weight_decay)

    model.to(device)
    loss_fn = torch.nn.CrossEntropyLoss(label_smoothing=0.02)
    opt.zero_grad()
    torch.cuda.empty_cache()
    gc.collect()
    scheduler = ChainedScheduler([LinearLR(opt, start_factor=0.01, total_iters=warmup_epochs),
                                  ExponentialLR(opt, gamma=gamma)])
    if checkpoint_period is None:
        checkpoint_period = len(train_loader)

    max_f1 = 0.
    best_epoch = 0

    for epoch in tqdm(range(1, epochs+1, 1)):
        # TRAIN
        model.train()
        loss_avg = 0.
        if verbose:
            print(f'Epoch={epoch}')
            print(f'Lr: {scheduler.get_last_lr()[0]:.9f}')
        for step, batch in tqdm(enumerate(train_loader), total=len(train_loader), disable = not verbose):
            inputs = batch[0].to(device)
            target = batch[1].to(device)
            loss = loss_fn(model(inputs), target)
            loss.backward()
            opt.step()
            opt.zero_grad()
            loss_avg += loss.item() / checkpoint_period
            if step % checkpoint_period == checkpoint_period - 1:
                if verbose:
                    print(f'Step={step+1}, Train loss={loss_avg:.6f}')
                loss_avg = 0.
                torch.save(model.state_dict(), '/kaggle/working/checkpoint.pt')
                # EVALUATE
                weighted_f1, eval_loss = _evaluate(model, test_loader, loss_fn, checkpoint_period // 2)
                if weighted_f1 > max_f1:
                    max_f1 = weighted_f1
                    best_epoch = epoch
                    torch.save(model.state_dict(), os.path.join(WORKDIR, 'best.pt'))
                if verbose:
                    print(f"F1={weighted_f1:.5f}")
                    print(f'Eval loss={eval_loss:.5f}\n')
                # The schedule advances once per checkpoint, i.e. once per epoch
                # with the default checkpoint_period.
                scheduler.step()
                if verbose:
                    print(f'Lr: {scheduler.get_last_lr()[0]:.9f}')
    return max_f1, best_epoch

In [7]:
# Build the classifier with one output per category and move it to the selected device.
model = BERT_CLF(CFG, len(cls2id)).to(device)
# Optionally restore previously saved weights.
if CFG.state_dict is not None:
    model.load_state_dict(CFG.state_dict)
# Release cached GPU memory and run garbage collection before training.
torch.cuda.empty_cache()
gc.collect()

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


148

#### Обучение модели.

In [8]:
# Fine-tune for 14 epochs with 2 warm-up steps and a post-warm-up learning rate of 8e-4.
train(model, train_loader, valid_loader, warmup_epochs=2, epochs=14, lr=0.0008)

  0%|          | 0/14 [00:00<?, ?it/s]

Epoch=1
Lr: 0.000009350


  0%|          | 0/190 [00:00<?, ?it/s]

Step=190, Train loss=6.317847
F1=0.03371
Eval loss=5.91598

Lr: 0.000436757
Epoch=2
Lr: 0.000436757


  0%|          | 0/190 [00:00<?, ?it/s]

Step=190, Train loss=2.062790
F1=0.81480
Eval loss=0.93279

Lr: 0.000800000
Epoch=3
Lr: 0.000800000


  0%|          | 0/190 [00:00<?, ?it/s]

Step=190, Train loss=0.780800
F1=0.85778
Eval loss=0.76492

Lr: 0.000740000
Epoch=4
Lr: 0.000740000


  0%|          | 0/190 [00:00<?, ?it/s]

Step=190, Train loss=0.500762
F1=0.86809
Eval loss=0.75219

Lr: 0.000684500
Epoch=5
Lr: 0.000684500


  0%|          | 0/190 [00:00<?, ?it/s]

Step=190, Train loss=0.367809
F1=0.87432
Eval loss=0.76172

Lr: 0.000633163
Epoch=6
Lr: 0.000633163


  0%|          | 0/190 [00:00<?, ?it/s]

Step=190, Train loss=0.309968
F1=0.87982
Eval loss=0.77572

Lr: 0.000585675
Epoch=7
Lr: 0.000585675


  0%|          | 0/190 [00:00<?, ?it/s]

Step=190, Train loss=0.282066
F1=0.87982
Eval loss=0.79210

Lr: 0.000541750
Epoch=8
Lr: 0.000541750


  0%|          | 0/190 [00:00<?, ?it/s]

Step=190, Train loss=0.270308
F1=0.88205
Eval loss=0.78881

Lr: 0.000501118
Epoch=9
Lr: 0.000501118


  0%|          | 0/190 [00:00<?, ?it/s]

Step=190, Train loss=0.263380
F1=0.88186
Eval loss=0.80188

Lr: 0.000463535
Epoch=10
Lr: 0.000463535


  0%|          | 0/190 [00:00<?, ?it/s]

Step=190, Train loss=0.259345
F1=0.88305
Eval loss=0.80466

Lr: 0.000428769
Epoch=11
Lr: 0.000428769


  0%|          | 0/190 [00:00<?, ?it/s]

Step=190, Train loss=0.256448
F1=0.88479
Eval loss=0.80578

Lr: 0.000396612
Epoch=12
Lr: 0.000396612


  0%|          | 0/190 [00:00<?, ?it/s]

Step=190, Train loss=0.253435
F1=0.88260
Eval loss=0.81456

Lr: 0.000366866
Epoch=13
Lr: 0.000366866


  0%|          | 0/190 [00:00<?, ?it/s]

Step=190, Train loss=0.251561
F1=0.88406
Eval loss=0.81530

Lr: 0.000339351
Epoch=14
Lr: 0.000339351


  0%|          | 0/190 [00:00<?, ?it/s]

Step=190, Train loss=0.249865
F1=0.88487
Eval loss=0.81790

Lr: 0.000313900


(0.8848723999530534, 14)

In [9]:
# Drop the trained model, then release cached GPU memory and collect garbage.
del model
torch.cuda.empty_cache()
gc.collect()

1000

In [None]:
# `model` has already been deleted by the preceding cell, so rebuild the
# classifier from the best checkpoint written by `train` and export its encoder.
best_model = BERT_CLF(CFG, len(cls2id))
best_model.load_state_dict(torch.load(os.path.join(WORKDIR, 'best.pt'), map_location='cpu'))
best_model.model.save_pretrained(os.path.join(WORKDIR, 'ruberttiny2_pretrained_08data'))
CFG.tokenizer.save_pretrained(os.path.join(WORKDIR, "ruberttiny2_pretrained_08data/tokenizer/"))
del best_model

### Подбор гиперпараметров.

In [None]:
def try_parameters(lr=0.0008, epochs=20, warmup_epochs=2, cut_emb=None, weight_decay=1e-6,
                   emb_act=None, bottleneck_size=None, bottleneck_act=None,
                   verbose=False):
    """Train a fresh `BERT_CLF` with the given hyperparameters and report its best score.

    Args:
        lr, epochs, warmup_epochs, weight_decay, verbose: Forwarded to `train`.
        cut_emb, emb_act, bottleneck_size, bottleneck_act: Forwarded to `BERT_CLF`.

    Returns:
        Tuple (best weighted F1, epoch in which it was reached), so callers can
        collect and compare runs instead of reading the printed summary.
    """
    model = BERT_CLF(CFG, len(cls2id), cut_emb=cut_emb,
                     emb_act=emb_act, bottleneck_size=bottleneck_size, bottleneck_act=bottleneck_act
                     ).to(device)

    # Release memory left over from the previous run before training starts.
    torch.cuda.empty_cache()
    gc.collect()

    f1, epoch = train(model, train_loader, valid_loader, weight_decay=weight_decay,
               warmup_epochs=warmup_epochs, epochs=epochs, lr=lr,
               verbose=verbose)
    print(f"lr={lr}, epochs={epochs}, warmup_epochs={warmup_epochs}, emb_act={emb_act}, " 
          f"bottleneck_size={bottleneck_size}, bottleneck_act={bottleneck_act}, weight_decay={weight_decay}")
    print(f"Best F1={f1:.5f}, Best epoch={epoch}\n")
    return f1, epoch

In [None]:
# Grid search over weight decay (one value per decade) and learning rate.
# The second entry used to be 0.001, which duplicated 1e-3 and trained the same
# configuration twice.
for weight_decay in tqdm([0.1, 0.01, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7]):
    for lr in [0.0008, 0.0015, 0.0004]:
        try_parameters(lr=lr, epochs=20, warmup_epochs=2, weight_decay=weight_decay, verbose=False)