# BERT Partition

## Imports

In [None]:
# Colab setup: install the third-party packages this notebook imports.
# NOTE(review): consider `%pip` (targets the running kernel's environment) and
# pinned versions so a fresh runtime reproduces the same setup.
!pip install transformers datasets evaluate wandb -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/510.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━[0m [32m266.2/510.5 kB[0m [31m7.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m195.4/195.4 kB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━

In [None]:
import datasets

from transformers import (
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    AutoModelForMaskedLM,
    get_scheduler
)

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

from tqdm.auto import trange, tqdm
import pandas as pd

import wandb

In [None]:
# Colab-only: mount Google Drive so that DATA_PATH (under /content/drive) is readable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Parameters

In [None]:
# Release cached CUDA memory left over from earlier runs in this kernel.
torch.cuda.empty_cache()

In [None]:
# wandb.login()

In [None]:
# wandb.init(
#     project='grammar-bert-model1',
#     entity='grammar-bert'
# )

In [None]:
# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [None]:
# Every text is padded/truncated by the tokenizer to this many tokens.
SEQ_LEN = 64
# Number of sentence pairs per DataLoader batch.
BATCH_SIZE = 32
# Fraction of the rows held out as the test split.
TEST_SIZE = 0.3
# Masking probability handed to DataCollatorForLanguageModeling.
MLM_PROB = 0.15

# CSV on the mounted Google Drive; read with pandas in the data-preparation section.
DATA_PATH = '/content/drive/MyDrive/UD_converted/Biblioteka_prikluchenij_bpa.csv'
# Hugging Face checkpoint used for both the tokenizer and the masked-LM model.
MODEL_NAME = 'DeepPavlov/rubert-base-cased'

## Data Preparation

In [None]:
# Load the pair corpus and split it into train/test partitions.
df = pd.read_csv(DATA_PATH)
data = datasets.Dataset.from_pandas(df)
# A fixed seed keeps the split identical across kernel restarts, so that
# train/test membership (and therefore every downstream number) is reproducible.
data = data.train_test_split(test_size=TEST_SIZE, seed=42)

# The DataFrame is not needed once the dataset has been built; free its memory.
del df

In [None]:
# Tokenizer matching the MODEL_NAME checkpoint (downloaded from the Hugging Face Hub on first use).
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
# Display the split sizes and column names of the dataset.
data

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'id', 'base', 'poly_agr', 'same'],
        num_rows: 1608938
    })
    test: Dataset({
        features: ['Unnamed: 0', 'id', 'base', 'poly_agr', 'same'],
        num_rows: 689545
    })
})

In [None]:
class PairsDataset(Dataset):
    """Torch dataset yielding tokenized ``(base, poly_agr)`` text pairs.

    Args:
        data: Mapping from split name to an indexable collection of rows; each
            row must expose the ``'base'`` and ``'poly_agr'`` fields.
        tokenizer: Callable applied to each text with ``padding='max_length'``,
            ``truncation=True`` and ``max_length=<length>``.
        split: Key of ``data`` to read rows from.
        max_length: Fixed token length of every encoded text. ``None`` (the
            default) falls back to the module-level ``SEQ_LEN`` at access time,
            which is the behaviour callers had before this parameter existed.
    """

    def __init__(self, data, tokenizer, split='train', max_length=None):
        self.dataset = data[split]
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the selected split."""
        return len(self.dataset)

    def _encode(self, text):
        """Tokenize one text, padded/truncated to the configured length."""
        max_length = SEQ_LEN if self.max_length is None else self.max_length
        return self.tokenizer(text,
                              padding='max_length',
                              truncation=True,
                              max_length=max_length)

    def __getitem__(self, idx):
        """Return ``(encoded_base, encoded_poly_agr)`` for row ``idx``."""
        # Look the row up once and reuse it for both columns instead of
        # indexing the underlying dataset twice per item.
        row = self.dataset[idx]
        return self._encode(row['base']), self._encode(row['poly_agr'])

In [None]:
# Reuse '[SEP]' as both the padding and the end-of-sequence token, then build
# the collator that applies random masking with probability MLM_PROB.
# NOTE(review): BERT-style vocabularies normally ship a dedicated '[PAD]' token;
# confirm that padding with '[SEP]' is intentional.
tokenizer.pad_token = '[SEP]'
tokenizer.eos_token = '[SEP]'
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=MLM_PROB)

In [None]:
def collate_func(batch):
    """Collate a list of per-example tuples into one masked batch per tuple position.

    ``batch`` is a list of equally long tuples of tokenized texts. The tuples
    are transposed so that all first elements form one group, all second
    elements the next, and so on; each group is passed through
    ``data_collator.torch_call`` on its own.

    Returns:
        A list with one collated batch per tuple position.
    """
    per_position_groups = zip(*batch)
    return list(map(data_collator.torch_call, per_position_groups))

In [None]:
# Training split (PairsDataset's default) wrapped as a torch dataset, then served in
# shuffled batches; collate_func runs the MLM collator on each side of the pairs.
dt = PairsDataset(data, tokenizer)
dl = DataLoader(dt, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_func)

## Model

In [None]:
def save_gradients(model, division_layer):
    """Snapshot the gradients of every parameter below encoder layer ``division_layer``.

    Parameters are visited in ``model.named_parameters()`` order and the walk
    stops at the first parameter of ``bert.encoder.layer.<division_layer>``, so
    only what precedes that layer is captured.

    Args:
        model: Object exposing ``named_parameters()`` with BERT-style names.
        division_layer: Index of the first encoder layer that is NOT captured.

    Returns:
        Dict mapping parameter name to a detached copy of its gradient.
        Parameters that do not require grad, or whose ``grad`` is ``None``,
        are omitted.
    """
    # Trailing dot so that e.g. layer 1 can never be confused with layers 10, 11, ...
    boundary_prefix = f'bert.encoder.layer.{division_layer}.'
    layers = {}

    for name, param in model.named_parameters():
        if name.startswith(boundary_prefix):
            break
        if param.requires_grad and param.grad is not None:
            # clone(): detach() alone shares storage with param.grad, so an
            # in-place zeroing of the gradients (optimizer.zero_grad with
            # set_to_none=False) would silently wipe the snapshot as well.
            layers[name] = param.grad.detach().clone()

    return layers

In [None]:
def change_gradients(model, layers, w_mlm = 0.5, w_cos_diff = 1, w_cos_sim=None, diff_grads=None):
    """Overwrite ``param.grad`` with a weighted mix of saved and current gradients.

    Only parameters whose name is a key of ``layers`` are touched.

    * ``w_cos_sim is None`` (two-loss mode):
      ``grad = w_cos_diff * current_grad + w_mlm * layers[name]``
    * otherwise (three-loss mode):
      ``grad = w_cos_sim * current_grad + w_mlm * layers[name]
      + w_cos_diff * diff_grads[name]``

    Args:
        model: Object exposing ``named_parameters()``.
        layers: Saved MLM gradients, keyed by parameter name.
        w_mlm: Weight of the saved MLM gradient.
        w_cos_diff: Weight of the "difference" cosine gradient.
        w_cos_sim: Weight of the "similarity" cosine gradient; ``None`` selects
            the two-loss mode.
        diff_grads: Saved "difference" cosine gradients, keyed by parameter
            name; required in three-loss mode.

    Raises:
        ValueError: If ``w_cos_sim`` is given without ``diff_grads``.
    """
    if w_cos_sim is not None and diff_grads is None:
        raise ValueError("diff_grads is required when w_cos_sim is given")

    for name, param in model.named_parameters():
        # Skip (rather than stop at) parameters without a saved gradient, so a
        # single missing entry cannot silently drop all parameters after it.
        if name not in layers:
            continue

        saved_mlm = layers[name]
        # A parameter untouched by the latest backward pass has grad None;
        # treat that as a zero contribution instead of failing on `w * None`.
        current = param.grad if param.grad is not None else torch.zeros_like(saved_mlm)

        if w_cos_sim is None:
            param.grad = w_cos_diff * current + w_mlm * saved_mlm
        else:
            param.grad = w_cos_sim * current + w_mlm * saved_mlm + w_cos_diff * diff_grads[name]

In [None]:
# Scratch check of `not in` on a list; not used by the rest of the notebook.
1 not in [3,2]

True

In [None]:
def freeze_layers(model, division_layer):
    """Freeze every parameter from encoder layer ``division_layer`` onwards.

    Parameters are visited in ``model.named_parameters()`` order. Everything
    before the first parameter of ``bert.encoder.layer.<division_layer>``
    (the embeddings and the lower encoder layers) is left untouched; that
    layer and everything after it (upper layers, pooler, LM head) gets
    ``requires_grad = False``.

    The lower part must stay trainable: the cosine losses are computed on
    ``hidden_states[division_layer]``, which only depends on those parameters.
    The previous substring test (``str(division_layer) not in name``) froze
    exactly that part, so the cosine loss had no ``grad_fn`` and ``backward()``
    raised "element 0 of tensors does not require grad".

    Args:
        model: Object exposing ``named_parameters()`` with BERT-style names.
        division_layer: Index of the first encoder layer to freeze. If no
            parameter belongs to such a layer, nothing is frozen.
    """
    # Trailing dot so that e.g. layer 1 can never be confused with layers 10, 11, ...
    boundary_prefix = f'bert.encoder.layer.{division_layer}.'
    reached_boundary = False

    for name, param in model.named_parameters():
        reached_boundary = reached_boundary or name.startswith(boundary_prefix)
        if reached_boundary:
            param.requires_grad = False


In [None]:
def train(model, criteria, optimizer, lr_scheduler, data, n_epochs=1,
          n_mlm=10, n_cosine=10, division_layer=4, w_mlm=1,
          w_cos_diff=1, w_cos_sim=1, cat_target=None):
    """Train ``model`` on a per-parameter mix of MLM and cosine-loss gradients.

    Each step runs two forward passes, one per side of the batch pair, and
    mixes (via ``change_gradients``) for the parameters captured by
    ``save_gradients(model, division_layer)``:

    * the gradient of the MLM loss of the first side (weight ``w_mlm``);
    * the gradient of ``criteria(hid_ref, hid_cur, -1)`` on the mean-pooled
      ``hidden_states[division_layer]`` of both sides (weight ``w_cos_diff``);
    * when ``cat_target`` is given, the gradient of
      ``criteria(hid_cur, cat_target, +1)`` (weight ``w_cos_sim``).

    ``hidden_states[k]`` of a Hugging Face encoder is the input to encoder
    layer ``k`` (index 0 is the embedding output), which is why the same
    ``division_layer`` index is used for the hidden state and for the
    parameter partition.

    Loss values are appended to the module-level lists ``mlm_losses``,
    ``cos_diff_losses`` and ``cos_sim_losses``, which must exist beforehand.

    Args:
        model: Masked-LM model returning ``.loss`` and ``.hidden_states``.
        criteria: Loss called as ``criteria(x1, x2, target)``
            (e.g. ``nn.CosineEmbeddingLoss``).
        optimizer: Optimizer over the model parameters.
        lr_scheduler: Scheduler stepped once per batch; ``None`` disables it.
        data: Iterable of ``[first_side_batch, second_side_batch]`` dicts of
            tensors; must support ``len()``.
        n_epochs: Number of passes over ``data``.
        n_mlm, n_cosine: Currently unused. They belong to an alternating
            MLM-only / combined schedule that is not implemented; every step
            is a combined step. Kept for interface compatibility.
        division_layer: Hidden-state index / first frozen encoder layer.
        w_mlm, w_cos_diff, w_cos_sim: Gradient mixing weights.
        cat_target: Optional tensor the second side's pooled representation
            is pulled towards.
    """
    # TODO: inertia scheduler on w_cos, freeze
    global mlm_losses, cos_diff_losses, cos_sim_losses

    def running_mean(history, window=100):
        """Mean of the last ``window`` entries as a float, or None if empty."""
        recent = history[-window:]
        return float(sum(recent) / len(recent)) if recent else None

    freeze_layers(model, division_layer)
    model.train()
    tq_epoch = trange(n_epochs, desc='Epochs: ')
    tq_batch = tqdm(total=len(data))

    for _ in tq_epoch:
        tq_batch.reset()

        for batch in data:
            ref_inputs = {k: v.to(model.device) for k, v in batch[0].items()}
            cur_inputs = {k: v.to(model.device) for k, v in batch[1].items()}

            # --- MLM pass on the first side -------------------------------
            pred = model(**ref_inputs, output_hidden_states=True)
            # retain_graph: the hidden states of this pass are reused by the
            # cosine loss below, so its graph must survive this backward().
            pred.loss.backward(retain_graph=True)
            mlm_grads = save_gradients(model, division_layer)
            optimizer.zero_grad()

            # --- Cosine loss(es) on the pooled intermediate states --------
            pred_new = model(**cur_inputs, output_hidden_states=True)

            hid_ref = torch.mean(pred.hidden_states[division_layer], dim=1)
            hid_cur = torch.mean(pred_new.hidden_states[division_layer], dim=1)

            # Sized from the actual batch: the last batch of an epoch is
            # usually smaller than BATCH_SIZE and a fixed-size target would
            # fail with a shape mismatch there.
            target = torch.ones(hid_ref.size(0), device=hid_ref.device)

            cos_diff_loss = criteria(hid_ref, hid_cur, -target)
            cos_sim_loss = None

            if cat_target is None:
                cos_diff_loss.backward()

                change_gradients(model, mlm_grads, w_mlm=w_mlm,
                                 w_cos_diff=w_cos_diff)
            else:
                cos_diff_loss.backward(retain_graph=True)
                diff_grads = save_gradients(model, division_layer)
                optimizer.zero_grad()

                cos_sim_loss = criteria(hid_cur, cat_target, target)
                cos_sim_loss.backward()

                change_gradients(model, mlm_grads, w_mlm=w_mlm,
                                 w_cos_diff=w_cos_diff, w_cos_sim=w_cos_sim,
                                 diff_grads=diff_grads)

            optimizer.step()
            # The scheduler is built for one step per batch; without this call
            # the learning rate would never leave its initial value.
            if lr_scheduler is not None:
                lr_scheduler.step()
            optimizer.zero_grad()

            # --- Bookkeeping ------------------------------------------------
            mlm_losses.append(pred.loss.detach().cpu())
            cos_diff_losses.append(cos_diff_loss.detach().cpu())
            if cos_sim_loss is not None:
                cos_sim_losses.append(cos_sim_loss.detach().cpu())

            # wandb.log({"MLM loss": ..., "Cos diff loss": ..., "Cos sim loss": ...})
            tq_batch.set_postfix({
                    'MLM loss': running_mean(mlm_losses),
                    'Cos diff loss': running_mean(cos_diff_losses),
                    'Cos sim loss': running_mean(cos_sim_losses) if cat_target is not None else None
                })

            tq_batch.update(1)

    tq_batch.close()
    model.eval()

In [None]:
# Load the pretrained masked-LM checkpoint and place it on the selected device.
# Ending the cell with an assignment keeps the model repr out of the cell output.
model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME).to(device)

pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


In [None]:
# Peek at the first (name, parameter) pair to see the naming scheme.
next(model.named_parameters())


('bert.embeddings.word_embeddings.weight',
 Parameter containing:
 tensor([[ 0.0041,  0.0138, -0.0080,  ...,  0.0170,  0.0175,  0.0226],
         [ 0.0010,  0.0184,  0.0111,  ..., -0.0029, -0.0008,  0.0019],
         [ 0.0025,  0.0105,  0.0073,  ..., -0.0028, -0.0003, -0.0066],
         ...,
         [ 0.0035,  0.0057,  0.0148,  ..., -0.0182, -0.0150, -0.0011],
         [-0.0342,  0.0004, -0.0860,  ..., -0.0019, -0.0255, -0.0084],
         [ 0.0178,  0.0073, -0.0294,  ...,  0.0108,  0.0266,  0.0045]],
        device='cuda:0', requires_grad=True))

In [None]:
# Scratch check: for every parameter, print whether its name contains the character '4'.
for i,j in model.named_parameters():
  print('4' in  i)

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
Fa

In [None]:
# Optimizer over all model parameters and the cosine criterion handed to train().
optimizer = AdamW(model.parameters(), lr=5e-5)
criterion = nn.CosineEmbeddingLoss()

# Linear schedule without warm-up, sized for one scheduler step per batch of `dl`.
num_epochs = 1
num_training_steps = num_epochs * len(dl)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0,
    num_training_steps=num_training_steps
)

Вопросы:
- Разные шедулеры и оптимайзер для лоссов
- Замораживать начало модели, когда учим млм?
- Нужно подбирать кол-во шагов на обучении и веса лоссов
- Какой слой? Чем дальше, тем больше будет занимать памяти
- Нужно ревью, было бы круто оптимизировать по памяти

In [None]:
# Loss histories; train() accesses these as module-level globals and appends to them.
mlm_losses = []
cos_diff_losses = []
cos_sim_losses = []

n_mlm = 10
n_cosine = 10
division_layer = 4
weight_mlm = 1
weight_cos = 0.5

# n_epochs is passed explicitly so that the loop length always matches
# num_training_steps, from which the LR scheduler was sized.
train(
    model, criterion, optimizer, lr_scheduler, dl,
    n_epochs=num_epochs,
    n_mlm=n_mlm,
    n_cosine=n_cosine,
    division_layer=division_layer,
    w_mlm=weight_mlm,
    w_cos_diff=weight_cos,
)

Epochs:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/50280 [00:00<?, ?it/s]

tensor(0.9545, device='cuda:0')


RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn