In [77]:
#!g1.4
%pip install transformers
%pip install wandb
%pip install optuna

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.[0m
Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.[0m
Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.[0m


In [83]:
#!g1.4
import pandas as pd
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModel
from transformers import BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
import matplotlib.pyplot as plt

from sklearn.metrics import f1_score, roc_auc_score
from tqdm.notebook import trange, tqdm
from IPython.display import clear_output
import wandb
import os
# import optuna

In [84]:
#!g1.4
class ConfigDict(dict):
    __getattr__ = dict.get
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__

In [85]:
#!g1.4
config = ConfigDict({
    'seed': 0,
    'device': torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    'epochs': 10,
    'batch_size': 16,
    'max_length': 256,
    'lr': 3e-4,
    'eps': 1e-8,
    'pretrained_model': 'cointegrated/rubert-tiny2',
    'tokenizer': 'cointegrated/rubert-tiny2',
    'info': 'no freeeze',
    'test_size': 0.15,
    'optimizer': AdamW,
    'scheduler': get_linear_schedule_with_warmup,
    'test_size': 0.15,
    'random_state': 0,
    'truncation': True,
    'add_special_tokens': True,
    'return_attention_mask': True,
    'pad_to_max_length': True,
    'do_lower_case': False,
    'return_tensors': 'pt'
})

In [86]:
#!g1.4
random.seed(config.seed)
np.random.seed(config.seed)
torch.manual_seed(config.seed)
torch.backends.cudnn.deterministic = True

In [87]:
#!g1.4
def seed_everything(seed):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(config.seed)

# Preprocessing

In [88]:
#!g1.4
train = pd.read_csv('/home/jupyter/mnt/s3/newbacketttt/train.csv', encoding = 'utf-8')

ctg_encoder = LabelEncoder()
train['1category'] = ctg_encoder.fit_transform(train['1category'])
snt_encoder = LabelEncoder()
train['sentiment'] = snt_encoder.fit_transform(train['sentiment'])

ctg_labels = train['1category'].unique()
snt_labels = train['sentiment'].unique()

x_ctg = train['sentence'].values
y_ctg = train['1category'].values
x_snt = train['sentence'].values
y_snt = train['sentiment'].values

In [89]:
#!g1.4
class Preprocessor():

    def __init__(self, tokenizer, config=config):
        self.config=config
        self.tokenizer = tokenizer
        
    def create_test_dataloader(self, x_test_final):
        
        x_test_final_enc = self.tokenizer.batch_encode_plus(
            x_test_final.tolist(),
            add_special_tokens=self.config.add_special_tokens, 
            return_attention_mask=self.config.return_attention_mask, 
            pad_to_max_length=self.config.pad_to_max_length, 
            truncation=self.config.truncation,
            max_length=self.config.max_length, 
            return_tensors=self.config.return_tensors
        ).to(self.config.device)

        
        input_ids_test_final = x_test_final_enc['input_ids']
        attention_masks_test_final = x_test_final_enc['attention_mask']

        dataset_test_final = TensorDataset(
            input_ids_test_final.to(config.device),
            attention_masks_test_final.to(self.config.device),
        )

        dataloader_test_final = DataLoader(
            dataset_test_final,
            batch_size = self.config.batch_size
        )

        return dataloader_test_final
    

    def create_dataloaders(self, x, y, x_test_final):
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, 
            test_size=self.config.test_size, 
            random_state=self.config.random_state, 
            stratify=y
        )

        x_train_enc = self.tokenizer.batch_encode_plus(
            x_train.tolist(),
            add_special_tokens=self.config.add_special_tokens, 
            return_attention_mask=self.config.return_attention_mask, 
            pad_to_max_length=self.config.pad_to_max_length, 
            truncation=self.config.truncation,
            max_length=self.config.max_length, 
            return_tensors=self.config.return_tensors
        ).to(self.config.device)

        x_test_enc = self.tokenizer.batch_encode_plus(
            x_test.tolist(),
            add_special_tokens=self.config.add_special_tokens, 
            return_attention_mask=self.config.return_attention_mask, 
            pad_to_max_length=self.config.pad_to_max_length, 
            truncation=self.config.truncation,
            max_length=self.config.max_length, 
            return_tensors=self.config.return_tensors
        ).to(self.config.device)
        
        x_test_final_enc = self.tokenizer.batch_encode_plus(
            x_test_final.tolist(),
            add_special_tokens=self.config.add_special_tokens, 
            return_attention_mask=self.config.return_attention_mask, 
            pad_to_max_length=self.config.pad_to_max_length, 
            truncation=self.config.truncation,
            max_length=self.config.max_length, 
            return_tensors=self.config.return_tensors
        ).to(self.config.device)

        input_ids_train = x_train_enc['input_ids']
        attention_masks_train = x_train_enc['attention_mask']
        labels_train = torch.tensor(y_train)

        input_ids_test = x_test_enc['input_ids']
        attention_masks_test = x_test_enc['attention_mask']
        labels_test = torch.tensor(y_test)
        
        indup_ids_test_final = x_test_final_enc['input_ids']
        attention_masks_test_final = x_test_final_enc['attention_mask']

        dataset_train = TensorDataset(
            input_ids_train.to(self.config.device), 
            attention_masks_train.to(self.config.device),
            labels_train.to(self.config.device)
        )

        dataset_test = TensorDataset(
            input_ids_test.to(self.config.device), 
            attention_masks_test.to(self.config.device),
            labels_test.to(self.config.device)
        )
        
        dataset_test_final = TensorDataset(
            input_ids_test_final.to(config.device),
            attention_masks_test_final.to(self.config.device),
        )

        dataloader_train = DataLoader(
            dataset_train,
            sampler=RandomSampler(dataset_train),
            batch_size=self.config.batch_size
        )

        dataloader_test = DataLoader(
            dataset_test,
            sampler=RandomSampler(dataset_test),
            batch_size=self.config.batch_size
        )
        
        dataloader_test_final = DataLoder(
            dataset_test_final,
            batch_size = self.config.batch_size
        )

        return dataloader_train, dataloader_test

# Training

In [90]:
#!g1.4
class Trainer():
    
    def __init__(self,  model, optimizer, scheduler, config=config):
        self.model = model
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.config = config

    def evaluate(self, dataloader):

        self.model.eval()

        with torch.no_grad(): 
            test_loss_total = 0
            y_pred_logits, y_pred_probs, y_true = [], [], []
            
            for batch in dataloader:
                
                batch = tuple(x.to(self.config.device) for x in batch)
                
                inputs = {
                    'input_ids':      batch[0],
                    'attention_mask': batch[1],
                    'labels':         batch[2],
                }
    
                outputs = self.model(**inputs)
                    
                batch_loss = outputs[0]
                logits = outputs[1]
                test_loss_total += batch_loss.item()

                probs = torch.softmax(logits, dim=1).cpu().numpy()
                logits = logits.cpu().numpy()
                label_ids = inputs['labels'].cpu().numpy()
                y_pred_probs.append(probs)
                y_pred_logits.append(logits)
                y_true.append(label_ids)
                
            test_loss = test_loss_total / len(dataloader) 
            y_pred_probs = np.concatenate(y_pred_probs, axis=0)
            y_pred_logits = np.concatenate(y_pred_logits, axis=0)
            y_true = np.concatenate(y_true, axis=0)

            return test_loss, y_pred_probs, y_pred_logits, y_true


    def train(self, dataloader_train, dataloader_test, save=False):

        step = 0
        for epoch in trange(self.config.epochs, desc=f"Traning Model on {self.config.epochs} Epochs"):

            self.model.train()
            train_loss_total = 0

            for batch in tqdm(dataloader_train, desc=f'Epoch {epoch}'):

                batch = tuple(b.to(self.config.device) for b in batch)
                
                inputs = {
                    'input_ids': batch[0],
                    'attention_mask': batch[1],
                    'labels': batch[2],
                }       

                outputs = self.model(**inputs)
                
                loss = outputs[0]
                train_loss_total += loss.item()
                
                self.optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1)
                self.optimizer.step()
                self.scheduler.step()

                if step % 100 == 0:
                    wandb.log({
                        'batch_loss': loss.item(), 
                        'learning_rate': self.scheduler.get_last_lr()[0]
                    })
                step += 1

            if save:
                torch.save(self.model.state_dict(), f'_BERT_epoch_{epoch}_{self.config.model_id}.model')
                
            train_loss = train_loss_total / len(dataloader_train)
            test_loss, y_pred_probs, y_pred_logits, y_true = self.evaluate(dataloader_test)
            test_roc_auc = roc_auc_score(y_true, y_pred_probs, multi_class='ovr', labels=range(self.model.num_labels))
            wandb.log({
                    'train_loss': train_loss, 
                    'test_loss': test_loss,
                    'test_roc_auc': test_roc_auc
                })

In [91]:
#!g1.4
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mhalaction[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

# Sentiment models

## 1

In [100]:
#!g1.4
config_0 = ConfigDict({
    'model_id': 'snt_0',
    'seed': 0,
    'device': torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    'epochs': 10,
    'batch_size': 16,
    'max_length': 256,
    'lr': 3e-4,
    'eps': 1e-8,
    'pretrained_model': 'cointegrated/rubert-tiny2',
    'tokenizer': 'cointegrated/rubert-tiny2',
    'info': 'no freeeze',
    'test_size': 0.15,
    'optimizer': AdamW,
    'scheduler': get_linear_schedule_with_warmup,
    'test_size': 0.15,
    'random_state': 0,
    'truncation': True,
    'add_special_tokens': True,
    'return_attention_mask': True,
    'pad_to_max_length': True,
    'do_lower_case': False,
    'return_tensors': 'pt'
})

In [66]:
#!g1.4
config = config_0

wandb.init(project='hse-hack', config=config)

model_snt = BertForSequenceClassification.from_pretrained(
    config.pretrained_model, 
    num_labels = len(snt_labels),
    output_attentions = False,
    output_hidden_states = False
).to(config.device)
wandb.watch(model_snt)

tokenizer_snt = AutoTokenizer.from_pretrained(config.tokenizer)
preprocessor = Preprocessor(tokenizer_snt)
dataloader_train, dataloader_test = preprocessor.create_dataloaders(x_snt, y_snt)

optimizer_snt = AdamW(
    model_snt.parameters(),
    lr=config.lr, 
    eps=config.eps
)
                  
scheduler_snt = get_linear_schedule_with_warmup(
    optimizer_snt, 
    num_warmup_steps=0,
    num_training_steps=len(dataloader_train)*config.epochs 
)

Trainer(model_snt, optimizer_snt, scheduler_snt, config=config).train(dataloader_train, dataloader_test)

## 2

In [68]:
#!g1.4
config_snt_1 = ConfigDict({
    'model_id': 'snt_1',
    'seed': 0,
    'device': torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    'epochs': 100,
    'batch_size': 16,
    'max_length': 256,
    'lr': 3e-4,
    'eps': 1e-8,
    'pretrained_model': 'cointegrated/rubert-tiny2',
    'tokenizer': 'cointegrated/rubert-tiny2',
    'info': 'embedding freeze',
    'test_size': 0.15,
    'optimizer': AdamW,
    'scheduler': get_linear_schedule_with_warmup,
    'test_size': 0.15,
    'random_state': 0,
    'truncation': True,
    'add_special_tokens': True,
    'return_attention_mask': True,
    'pad_to_max_length': True,
    'do_lower_case': False,
    'return_tensors': 'pt'
})

In [69]:
#!g1.4
config = config_snt_1

wandb.init(project='hse-hack', config=config)

model_snt = BertForSequenceClassification.from_pretrained(
    config.pretrained_model, 
    num_labels = len(snt_labels),
    output_attentions = False,
    output_hidden_states = False
).to(config.device)
wandb.watch(model_snt)

for param in model_snt.bert.parameters():
    param.requires_grad = False

tokenizer_snt = AutoTokenizer.from_pretrained(config.tokenizer)
preprocessor = Preprocessor(tokenizer_snt)
dataloader_train, dataloader_test = preprocessor.create_dataloaders(x_snt, y_snt)

optimizer_snt = AdamW(
    model_snt.parameters(),
    lr=config.lr, 
    eps=config.eps
)
                  
scheduler_snt = get_linear_schedule_with_warmup(
    optimizer_snt, 
    num_warmup_steps=0,
    num_training_steps=len(dataloader_train)*config.epochs 
)

Trainer(model_snt, optimizer_snt, scheduler_snt, config=config).train(dataloader_train, dataloader_test)

## 3

In [None]:
#!g1.4
config_snt_2 = ConfigDict({
    'model_id': 'snt_2',
    'seed': 0,
    'device': torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    'epochs': 100,
    'batch_size': 8,
    'max_length': 256,
    'lr': 3e-4,
    'eps': 1e-8,
    'pretrained_model': 'cointegrated/rubert-tiny2',
    'tokenizer': 'cointegrated/rubert-tiny2',
    'info': 'embedding freeze',
    'test_size': 0.25,
    'optimizer': AdamW,
    'scheduler': get_linear_schedule_with_warmup,
    'random_state': 0,
    'truncation': True,
    'add_special_tokens': True,
    'return_attention_mask': True,
    'pad_to_max_length': True,
    'do_lower_case': False,
    'return_tensors': 'pt'
})

In [None]:
#!g1.4
config = config_snt_2

wandb.init(project='hse-hack', config=config)

model_snt = BertForSequenceClassification.from_pretrained(
    config.pretrained_model, 
    num_labels = len(snt_labels),
    output_attentions = False,
    output_hidden_states = False
).to(config.device)
wandb.watch(model_snt)

for param in model_snt.bert.parameters():
    param.requires_grad = False

tokenizer_snt = AutoTokenizer.from_pretrained(config.tokenizer)
preprocessor = Preprocessor(tokenizer_snt)
dataloader_train, dataloader_test = preprocessor.create_dataloaders(x_snt, y_snt)

optimizer_snt = AdamW(
    model_snt.parameters(),
    lr=config.lr, 
    eps=config.eps
)
                  
scheduler_snt = get_linear_schedule_with_warmup(
    optimizer_snt, 
    num_warmup_steps=0,
    num_training_steps=len(dataloader_train)*config.epochs 
)

Trainer(model_snt, optimizer_snt, scheduler_snt, config=config).train(dataloader_train, dataloader_test)

## 4



In [91]:
#!g1.4
config_snt_3 = ConfigDict({
    'model_id': 'snt_3',
    'seed': 0,
    'device': torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    'epochs': 250,
    'batch_size': 8,
    'max_length': 256,
    'lr': 3e-4,
    'eps': 1e-8,
    'pretrained_model': 'cointegrated/rubert-tiny2',
    'tokenizer': 'cointegrated/rubert-tiny2',
    'info': 'no freeze',
    'test_size': 0.2,
    'optimizer': AdamW,
    'scheduler': get_linear_schedule_with_warmup,
    'random_state': 0,
    'truncation': True,
    'add_special_tokens': True,
    'return_attention_mask': True,
    'pad_to_max_length': True,
    'do_lower_case': False,
    'return_tensors': 'pt'
})

In [None]:
#!g1.4
config = config_snt_3

wandb.init(project='hse-hack', config=config)

model_snt = BertForSequenceClassification.from_pretrained(
    config.pretrained_model, 
    num_labels = len(snt_labels),
    output_attentions = False,
    output_hidden_states = False
).to(config.device)
wandb.watch(model_snt)

tokenizer_snt = AutoTokenizer.from_pretrained(config.tokenizer)
preprocessor = Preprocessor(tokenizer_snt)
dataloader_train, dataloader_test = preprocessor.create_dataloaders(x_snt, y_snt)

optimizer_snt = AdamW(
    model_snt.parameters(),
    lr=config.lr, 
    eps=config.eps
)
                  
scheduler_snt = get_linear_schedule_with_warmup(
    optimizer_snt, 
    num_warmup_steps=200,
    num_training_steps=len(dataloader_train)*config.epochs 
)

Trainer(model_snt, optimizer_snt, scheduler_snt, config=config).train(dataloader_train, dataloader_test)

wandb.finish()

In [None]:
Larger dropout (up to 0.5), in low-resource setups word dropout (i.e., randomly masking input tokens) also sometimes help (0.1-0.3 might be reasonable values).
If you have many input classes, label smoothing can help.

We may set different lrs for layers

## 5

In [119]:
#!g1.4
torch.device

torch.device

In [144]:
#!g1.4
config_snt_4 = ConfigDict({
    'model_id': 'snt_4',
    'seed': 0,
    'device': torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    'epochs': 250,
    'batch_size': 8,
    'max_length': 256,
    'lr': 3e-4,
    'eps': 1e-6,
    'pretrained_model': 'ai-forever/sbert_large_mt_nlu_ru',
    'tokenizer': 'ai-forever/sbert_large_mt_nlu_ru',
    'info': 'no freeze',
    'test_size': 0.2,
    'optimizer': AdamW,
    'scheduler': get_linear_schedule_with_warmup,
    'num_warmup_steps': 100,
    'random_state': 0,
    'truncation': True,
    'add_special_tokens': True,
    'return_attention_mask': True,
    'pad_to_max_length': True,
    'do_lower_case': False,
    'return_tensors': 'pt'
})

In [145]:
#!g1.4
config = config_snt_4

wandb.init(project='hse-hack', config=config)

model_snt = BertForSequenceClassification.from_pretrained(
    config.pretrained_model, 
    num_labels = len(snt_labels),
    output_attentions = False,
    output_hidden_states = False
).to(config.device)
wandb.watch(model_snt)

tokenizer_snt = AutoTokenizer.from_pretrained(config.tokenizer)
preprocessor = Preprocessor(tokenizer_snt)
dataloader_train, dataloader_test = preprocessor.create_dataloaders(x_snt, y_snt)

optimizer_snt = AdamW(
    model_snt.parameters(),
    lr=config.lr, 
    eps=config.eps
)
                  
scheduler_snt = get_linear_schedule_with_warmup(
    optimizer_snt, 
    num_warmup_steps=config.num_warmup_steps,
    num_training_steps=len(dataloader_train)*config.epochs 
)

Trainer(model_snt, optimizer_snt, scheduler_snt, config=config).train(dataloader_train, dataloader_test)

wandb.finish()

0,1
batch_loss,█▅▇▇▂▇▂▂▂▆▁▄▁▄▃▁▅▆▆▅▃▄▂▅▇▄▁▅▄▇▅▃█▄▅▇▆▃▇▆
learning_rate,███▇▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
test_loss,▂▁█▂▂▃▁▃▃▂▂▄▃▄▃▂▃▃▃▂▂▃▂▃▂▂▁▃▃▃▂▄▃▃▂▃▃▂▃▃
test_roc_auc,▆▇▃▆██▇█▁▇▇▅▆▆▇▇▆▇▆█▇▆▇▇▆█▇▇▇▇█▇█▇█▆▇█▇▇
train_loss,█▃▂▂▂▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▂▂▂▁▂▁▁▂▁▁▁▂

0,1
batch_loss,0.26138
learning_rate,0.0004
test_loss,0.52878
test_roc_auc,0.91427
train_loss,0.34712


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016668719283340277, max=1.0…

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ai-forever/sbert_large_mt_nlu_ru and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


HBox(children=(HTML(value='Traning Model on 250 Epochs'), FloatProgress(value=0.0, max=250.0), HTML(value=''))…

HBox(children=(HTML(value='Epoch 0'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 1'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))

Exception ignored in: <generator object tqdm.__iter__ at 0x7f6bc077add0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/tqdm/std.py", line 1227, in __iter__
    self.close()
  File "/usr/local/lib/python3.8/dist-packages/tqdm/notebook.py", line 257, in close
    super(tqdm_notebook, self).close(*args, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/tqdm/std.py", line 1332, in close
    fp_write('')
  File "/usr/local/lib/python3.8/dist-packages/tqdm/std.py", line 1329, in fp_write
    self.fp.write(_unicode(s))
  File "/usr/local/lib/python3.8/dist-packages/tqdm/utils.py", line 226, in inner
    return func(*args, **kwargs)
  File "/home/jupyter/.local/lib/python3.8/site-packages/wandb/sdk/lib/redirect.py", line 643, in write
    cb(data)
  File "/home/jupyter/.local/lib/python3.8/site-packages/wandb/sdk/wandb_run.py", line 2081, in <lambda>
    lambda data: self._console_raw_callback("stdout", data),
  File "/home/jupyter/.local/lib/python3.




KeyboardInterrupt: 

In [None]:
#!g1.4


## 6

In [None]:
#!g1.4


In [125]:
#!g1.4
config_snt_5 = ConfigDict({
    'model_id': 'snt_5',
    'seed': 0,
    'device': torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    'epochs': 250,
    'batch_size': 8,
    'max_length': 256,
    'lr': 5e-4,
    'eps': 1e-8,
    'pretrained_model': 'sismetanin/sbert-ru-sentiment-rusentiment',
    'tokenizer': 'sismetanin/sbert-ru-sentiment-rusentiment',
    'info': 'no freeze',
    'test_size': 0.2,
    'optimizer': 'AdamW',
    'scheduler': get_linear_schedule_with_warmup,
    'num_warmup_steps': 100,
    'random_state': 0,
    'truncation': True,
    'add_special_tokens': True,
    'return_attention_mask': True,
    'pad_to_max_length': True,
    'do_lower_case': False,
    'return_tensors': 'pt'
})

In [126]:
#!g1.4
config = config_snt_5

wandb.init(project='hse-hack', config=config)

model_snt = BertForSequenceClassification.from_pretrained(
    config.pretrained_model, 
    num_labels = len(snt_labels),
    output_attentions = False,
    output_hidden_states = False
).to(config.device)
wandb.watch(model_snt)

tokenizer_snt = AutoTokenizer.from_pretrained(config.tokenizer)
preprocessor = Preprocessor(tokenizer_snt)
dataloader_train, dataloader_test = preprocessor.create_dataloaders(x_snt, y_snt)

optimizer_snt = AdamW(
    model_snt.parameters(),
    lr=config.lr, 
    eps=config.eps
)
                  
scheduler_snt = get_linear_schedule_with_warmup(
    optimizer_snt, 
    num_warmup_steps=config.num_warmup_steps,
    num_training_steps=len(dataloader_train)*config.epochs 
)

Trainer(model_snt, optimizer_snt, scheduler_snt, config=config).train(dataloader_train, dataloader_test)

wandb.finish()

0,1
batch_loss,▆▆▁▁█▃▂▅
learning_rate,▁███████

0,1
batch_loss,1.1795
learning_rate,0.0003


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01666930216666363, max=1.0)…

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1147.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1707820745.0), HTML(value='')))




RuntimeError: Error(s) in loading state_dict for BertForSequenceClassification:
	size mismatch for classifier.weight: copying a param with shape torch.Size([5, 1024]) from checkpoint, the shape in current model is torch.Size([3, 1024]).
	size mismatch for classifier.bias: copying a param with shape torch.Size([5]) from checkpoint, the shape in current model is torch.Size([3]).

In [None]:
#!g1.4


In [None]:
#!g1.4


## 7

In [131]:
#!g1.4
config_snt_6 = ConfigDict({
    'model_id': 'snt_6',
    'seed': 0,
    'device': torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    'epochs': 250,
    'batch_size': 8,
    'max_length': 256,
    'lr': 5e-4,
    'eps': 1e-8,
    'pretrained_model': 'cointegrated/rubert-tiny-sentiment-balanced',
    'tokenizer': 'cointegrated/rubert-tiny-sentiment-balanced',
    'info': 'no freeze',
    'test_size': 0.2,
    'optimizer': 'AdamW',
    'scheduler': get_linear_schedule_with_warmup,
    'num_warmup_steps': 100,
    'random_state': 0,
    'truncation': True,
    'add_special_tokens': True,
    'return_attention_mask': True,
    'pad_to_max_length': True,
    'do_lower_case': False,
    'return_tensors': 'pt'
})

In [None]:
#!g1.4
Adam(
    [
        {"params": model.fc.parameters(), "lr": 1e-3},
        {"params": model.agroupoflayer.parameters()},
        {"params": model.lastlayer.parameters(), "lr": 4e-2},
    ],
    lr=5e-4,
)

In [132]:
#!g1.4
config = config_snt_6

wandb.init(project='hse-hack', config=config)

model_snt = BertForSequenceClassification.from_pretrained(
    config.pretrained_model, 
    num_labels = len(snt_labels),
    output_attentions = False,
    output_hidden_states = False
).to(config.device)
wandb.watch(model_snt)

tokenizer_snt = AutoTokenizer.from_pretrained(config.tokenizer)
preprocessor = Preprocessor(tokenizer_snt)
dataloader_train, dataloader_test = preprocessor.create_dataloaders(x_snt, y_snt)

optimizer_snt = AdamW(
    model_snt.parameters(),
    lr=config.lr, 
    eps=config.eps
)
                  
scheduler_snt = get_linear_schedule_with_warmup(
    optimizer_snt, 
    num_warmup_steps=config.num_warmup_steps,
    num_training_steps=len(dataloader_train)*config.epochs 
)

Trainer(model_snt, optimizer_snt, scheduler_snt, config=config).train(dataloader_train, dataloader_test)

wandb.finish()

VBox(children=(Label(value='0.004 MB of 0.006 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.740722…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01666936411666029, max=1.0)…

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=884.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=47166007.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=377.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=241082.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=468029.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=112.0), HTML(value='')))






HBox(children=(HTML(value='Traning Model on 250 Epochs'), FloatProgress(value=0.0, max=250.0), HTML(value=''))…

HBox(children=(HTML(value='Epoch 0'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 1'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 2'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 3'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 4'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 5'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 6'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 7'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 8'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 9'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 10'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 11'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 12'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 13'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 14'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 15'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 16'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 17'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 18'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 19'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 20'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 21'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 22'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 23'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 24'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 25'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 26'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 27'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 28'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 29'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 30'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 31'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 32'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 33'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 34'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 35'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 36'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 37'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 38'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 39'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 40'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 41'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 42'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 43'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 44'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 45'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 46'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 47'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 48'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))





KeyboardInterrupt: 

In [None]:
#!g1.4


## 8

In [11]:
#!g1.4
config_snt_7 = ConfigDict({
    'model_id': 'snt_7',
    'seed': 0,
    'device': torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    'epochs': 100,
    'batch_size': 16,
    'max_length': 256,
    'pretrained_lr': 5e-6,
    'classifier_lr': 5e-4,
    'eps': 1e-8,
    'pretrained_model': 'cointegrated/rubert-tiny-sentiment-balanced',
    'tokenizer': 'cointegrated/rubert-tiny-sentiment-balanced',
    'info': 'freeze embedding + diff lr + dropout',
    'test_size': 0.2,
    'optimizer': 'AdamW',
    'scheduler': 'get_linear_schedule_with_warmup',
    'num_warmup_steps': 100,
    'random_state': 0,
    'truncation': True,
    'add_special_tokens': True,
    'return_attention_mask': True,
    'pad_to_max_length': True,
    'do_lower_case': False,
    'return_tensors': 'pt'
})

config = config_snt_7

In [12]:
#!g1.4
config = config_snt_7

wandb.init(project='hse-hack', config=config)

model_snt = BertForSequenceClassification.from_pretrained(
    config.pretrained_model, 
    num_labels = len(snt_labels),
    output_attentions = False,
    output_hidden_states = False, 
    attention_probs_dropout_prob = 0.2,
    classifier_dropout = 0.2
).to(config.device)
wandb.watch(model_snt)

for param in model_snt.bert.embeddings.parameters():
    param.requires_grad = False

tokenizer_snt = AutoTokenizer.from_pretrained(config.tokenizer)
preprocessor = Preprocessor(tokenizer_snt)
dataloader_train, dataloader_test = preprocessor.create_dataloaders(x_snt, y_snt)

optimizer_snt = AdamW(
    [
        {"params": model_snt.bert.parameters(), "lr": config.pretrained_lr},
        {"params": model_snt.dropout.parameters(), "lr": config.classifier_lr},
        {"params": model_snt.classifier.parameters(), "lr": config.classifier_lr},
    ],
    lr=config.classifier_lr, 
    eps=config.eps
)
                  
scheduler_snt = get_linear_schedule_with_warmup(
    optimizer_snt, 
    num_warmup_steps=config.num_warmup_steps,
    num_training_steps=len(dataloader_train)*config.epochs 
)

Trainer(model_snt, optimizer_snt, scheduler_snt, config=config).train(dataloader_train, dataloader_test)

wandb.finish()

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=884.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=47166007.0), HTML(value='')))




KeyboardInterrupt: 

In [159]:
#!g1.4
scheduler_snt.get_lr()



[1.801750972762646e-06, 0.0001801750972762646, 0.0001801750972762646]

In [161]:
#!g1.4
Trainer(model_snt, optimizer_snt, scheduler_snt, config=config).train(dataloader_train, dataloader_test, save=True)

HBox(children=(HTML(value='Traning Model on 100 Epochs'), FloatProgress(value=0.0), HTML(value='')))

HBox(children=(HTML(value='Epoch 0'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 1'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 2'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 3'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 4'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 5'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 6'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 7'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 8'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 9'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))





KeyboardInterrupt: 

Добились нужного результата, сохраняем последние несколько моделей, берем среднее по их весам (распространенная техника для трансформеров)

In [92]:
#!g1.4

config_snt_7 = ConfigDict({
    'model_id': 'snt_7',
    'seed': 0,
    'device': torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    'epochs': 100,
    'batch_size': 16,
    'max_length': 256,
    'pretrained_lr': 5e-6,
    'classifier_lr': 5e-4,
    'eps': 1e-8,
    'pretrained_model': 'cointegrated/rubert-tiny-sentiment-balanced',
    'tokenizer': 'cointegrated/rubert-tiny-sentiment-balanced',
    'info': 'freeze embedding + diff lr + dropout',
    'test_size': 0.2,
    'optimizer': 'AdamW',
    'scheduler': 'get_linear_schedule_with_warmup',
    'num_warmup_steps': 100,
    'random_state': 0,
    'truncation': True,
    'add_special_tokens': True,
    'return_attention_mask': True,
    'pad_to_max_length': True,
    'do_lower_case': False,
    'return_tensors': 'pt'
})

config = config_snt_7

final_snt_model = BertForSequenceClassification.from_pretrained(
    config.pretrained_model, 
    num_labels = len(snt_labels),
    output_attentions = False,
    output_hidden_states = False, 
    attention_probs_dropout_prob = 0.2,
    classifier_dropout = 0.2
).to(config.device)

for i in range(3):
    
    model = BertForSequenceClassification.from_pretrained(
        config.pretrained_model, 
        num_labels = len(snt_labels),
        output_attentions = False,
        output_hidden_states = False, 
        attention_probs_dropout_prob = 0.2,
        classifier_dropout = 0.2
    ).to(config.device)
    
    PATH = f'_BERT_epoch_{8-i}_snt_7.model'
    model.load_state_dict(torch.load(PATH))
    
    for key in final_snt_model.state_dict():
        final_snt_model.state_dict()[key] += model.state_dict()[key]
        
        if i==4:
            final_snt_model.state_dict()[key] = final_snt_model.state_dict()[key] / 3
            
            
final_snt_model.to(config.device)

final_snt_model.load_state_dict(torch.load('/home/jupyter/mnt/s3/newbacketttt/snt-models/_BERT_epoch_8_snt_7.model'))

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=884.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=47166007.0), HTML(value='')))




<All keys matched successfully>

In [93]:
#!g1.4
config.device

device(type='cuda', index=0)

In [94]:
#!g1.4

test = pd.read_csv('/home/jupyter/mnt/s3/newbacketttt/1sentencenewtest.csv', encoding = 'utf-8')

test = test['0'].values

tokenizer_snt = AutoTokenizer.from_pretrained(config.tokenizer)
preprocessor = Preprocessor(tokenizer_snt)

dataloader = preprocessor.create_test_dataloader(test)

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=377.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=241082.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=468029.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=112.0), HTML(value='')))






In [95]:
#!g1.4

def evaluate_test(model, dataloader):

    model.eval()

    with torch.no_grad(): 
        y_pred_logits, y_pred_probs = [], []

        for batch in dataloader:

            batch = tuple(x.to(config.device) for x in batch)

            inputs = {
                'input_ids':      batch[0].to(config.device),
                'attention_mask': batch[1].to(config.device),
            }

            outputs = final_snt_model(**inputs) 

            logits = outputs.logits

            probs = torch.softmax(logits, dim=1).cpu().numpy()
            y_pred_probs.append(probs)
            
        y_pred_probs = np.concatenate(y_pred_probs, axis=0)

        return y_pred_probs

In [96]:
#!g1.4

probs = evaluate_test(final_snt_model, dataloader)

In [97]:
#!g1.4

probs

array([[3.8141543e-03, 9.9555308e-01, 6.3277158e-04],
       [6.6593086e-05, 8.1703179e-03, 9.9176311e-01],
       [1.5810962e-01, 8.1000364e-01, 3.1886775e-02],
       ...,
       [1.7978348e-02, 9.8169965e-01, 3.2200234e-04],
       [7.6743891e-06, 2.5914246e-04, 9.9973327e-01],
       [2.5256628e-02, 3.9994091e-01, 5.7480240e-01]], dtype=float32)

In [98]:
#!g1.4
result = pd.DataFrame(probs, columns=snt_encoder.inverse_transform(range(3)))

result

Unnamed: 0,+,?,−
0,0.003814,0.995553,0.000633
1,0.000067,0.008170,0.991763
2,0.158110,0.810004,0.031887
3,0.003214,0.081713,0.915073
4,0.998501,0.000986,0.000513
...,...,...,...
944,0.000032,0.000805,0.999163
945,0.007853,0.991695,0.000452
946,0.017978,0.981700,0.000322
947,0.000008,0.000259,0.999733


In [99]:
#!g1.4
result 

Unnamed: 0,+,?,−
0,0.003814,0.995553,0.000633
1,0.000067,0.008170,0.991763
2,0.158110,0.810004,0.031887
3,0.003214,0.081713,0.915073
4,0.998501,0.000986,0.000513
...,...,...,...
944,0.000032,0.000805,0.999163
945,0.007853,0.991695,0.000452
946,0.017978,0.981700,0.000322
947,0.000008,0.000259,0.999733


In [102]:
#!g1.4
result.to_csv('submit.csv')

In [None]:
#!g1.4
