# Bert Partition

## Imports

In [1]:
!pip install transformers datasets evaluate wandb -q

In [2]:
import datasets

from transformers import (
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    AutoModelForMaskedLM,
    get_scheduler
)

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

from tqdm.auto import trange, tqdm
import pandas as pd

import wandb

In [3]:
# Colab only: mount Google Drive so that files under /content/drive (see
# DATA_PATH) become readable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Parameters

In [4]:
# Authenticate with Weights & Biases before a run is created.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mtokubetsu01[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [5]:
# Start the W&B run that train() reports its losses to via wandb.log.
wandb.init(
    project='grammar-bert-model1',
    entity='grammar-bert'
)

[34m[1mwandb[0m: Currently logged in as: [33mtokubetsu01[0m ([33mgrammar-bert[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [6]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [7]:
# Length every text is padded/truncated to by the tokenizer in PairsDataset.
SEQ_LEN = 64
# Number of sentence pairs per DataLoader batch.
BATCH_SIZE = 32
# Fraction of the rows held out by train_test_split.
TEST_SIZE = 0.3
# Masking probability handed to DataCollatorForLanguageModeling.
MLM_PROB = 0.15

# CSV with the sentence pairs (lives on the mounted Google Drive).
DATA_PATH = '/content/drive/MyDrive/nnlp/bert/biblioteka_prikluchenij_both_agr.csv'
# Hugging Face checkpoint used for both the tokenizer and the model.
MODEL_NAME = 'DeepPavlov/rubert-base-cased'

## Data Preparation

In [8]:
# Load the sentence pairs from the CSV on Drive.
df = pd.read_csv(DATA_PATH)

# PairsDataset reads exactly these two columns; fail early with a clear message
# instead of a KeyError deep inside a DataLoader worker.
missing_columns = {'initial', 'polypers'} - set(df.columns)
assert not missing_columns, f'missing expected columns: {sorted(missing_columns)}'

data = datasets.Dataset.from_pandas(df)
# A fixed seed keeps the train/test partition identical across kernel restarts.
data = data.train_test_split(test_size=TEST_SIZE, seed=42)

# The DataFrame is no longer needed once its rows live in the datasets.Dataset.
del df

In [9]:
# Load the tokenizer published with the MODEL_NAME checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [10]:
class PairsDataset(Dataset):
    """Map-style dataset yielding a tokenized ``(initial, polypers)`` text pair.

    Args:
        data: Mapping from split name to an indexable collection of rows; each
            row must provide the ``'initial'`` and ``'polypers'`` fields.
        tokenizer: Callable invoked as
            ``tokenizer(text, padding='max_length', truncation=True, max_length=...)``.
        split: Key of ``data`` to read rows from.
        max_length: Value forwarded to the tokenizer as ``max_length``. ``None``
            (the default) falls back to the module-level ``SEQ_LEN``, looked up
            at item-access time.
    """

    def __init__(self, data, tokenizer, split='train', max_length=None):
        self.dataset = data[split]
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the selected split."""
        return len(self.dataset)

    def _encode(self, text):
        """Tokenize a single text with the dataset's padding/truncation settings."""
        max_length = SEQ_LEN if self.max_length is None else self.max_length
        return self.tokenizer(text,
                              padding='max_length',
                              truncation=True,
                              max_length=max_length)

    def __getitem__(self, idx):
        """Return ``(initial_encoding, polypers_encoding)`` for row ``idx``."""
        # Fetch the row once and reuse it for both fields.
        row = self.dataset[idx]
        return self._encode(row['initial']), self._encode(row['polypers'])

In [11]:
# Use '[SEP]' as both the padding token and the end-of-sequence token.
# NOTE(review): this replaces whatever pad token the checkpoint's tokenizer
# already defines — confirm that padding with '[SEP]' is intentional.
tokenizer.pad_token = '[SEP]'
tokenizer.eos_token = '[SEP]'
# Collator used by collate_func; configured with mlm_probability=MLM_PROB.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=MLM_PROB)

In [12]:
def collate_func(batch):
    """Collate a list of dataset samples into one collated batch per sample slot.

    The samples are regrouped position-wise (all first elements together, all
    second elements together, ...) and every group is passed on its own to
    ``data_collator.torch_call``. The results are returned as a list in slot
    order.
    """
    return list(map(data_collator.torch_call, zip(*batch)))

In [13]:
# Wrap the split selected by PairsDataset's default (split='train').
dt = PairsDataset(data, tokenizer)
# Shuffled loader; collate_func builds one collated batch per element of the pair.
dl = DataLoader(dt, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_func)

## Model

In [14]:
def save_gradients(model, division_layer):
    """Snapshot the gradients of every parameter that precedes an encoder layer.

    Parameters are visited in ``model.named_parameters()`` order and the walk
    stops at the first parameter of ``bert.encoder.layer.{division_layer}``.
    Parameters that do not require grad or currently have no gradient are
    skipped.

    Args:
        model: Object exposing ``named_parameters()``.
        division_layer: Index of the first encoder layer NOT to snapshot.

    Returns:
        dict mapping parameter name to an independent copy of its gradient.
    """
    # Trailing dot so that e.g. layer 1 can never match layers 10, 11, ...
    stop_prefix = f'bert.encoder.layer.{division_layer}.'
    layers = {}

    for name, param in model.named_parameters():
        if name.startswith(stop_prefix):
            break
        if param.requires_grad and param.grad is not None:
            # detach() alone shares storage with param.grad, so an in-place
            # zero_grad() afterwards would silently zero the snapshot too.
            layers[name] = param.grad.detach().clone()

    return layers

In [15]:
def change_gradients(model, layers, weight_mlm = 0.5, weight_cos = 1):
    """Blend previously saved gradients into the gradients now on the model.

    For every parameter whose name is a key of ``layers`` the gradient becomes
    ``weight_cos * current_grad + weight_mlm * layers[name]``. When the
    parameter has no current gradient, only the saved term is used. Parameters
    absent from ``layers`` are left untouched.

    Args:
        model: Object exposing ``named_parameters()``.
        layers: dict of parameter name -> saved gradient tensor.
        weight_mlm: Multiplier applied to the saved gradient.
        weight_cos: Multiplier applied to the gradient currently on the parameter.
    """
    for name, param in model.named_parameters():
        saved = layers.get(name)
        if saved is None:
            # Keep scanning: a gap in the snapshot (e.g. a parameter that had no
            # gradient when it was taken) must not drop the entries after it.
            continue

        if param.grad is None:
            # No fresh gradient for this parameter; multiplying None would raise.
            param.grad = weight_mlm * saved
        else:
            param.grad = weight_cos * param.grad + weight_mlm * saved

In [16]:
def _running_mean(history, window=100):
    """Return the mean of the last ``window`` recorded losses, or None if empty."""
    recent = history[-window:]
    if not recent:
        return None
    return (sum(recent) / len(recent)).item()


def train(model, criteria, optimizer, lr_scheduler, data, n_epochs=1,
          n_mlm=10, n_cosine=10, division_layer=4, weight_mlm=0.5,
          weight_cos=1):
    """Alternate MLM-only steps with combined MLM + cosine-loss steps.

    Batches are processed in cycles of ``n_mlm + n_cosine``. The first ``n_mlm``
    batches of a cycle take a step on ``pred.loss`` of ``batch[0]`` only. The
    remaining batches:

    1. back-propagate ``pred.loss`` of ``batch[0]`` and snapshot the gradients
       via ``save_gradients(model, division_layer)``;
    2. run ``batch[1]`` through the model and compute ``criteria`` between the
       sequence-averaged ``hidden_states[division_layer]`` of both inputs, with
       an all-ones target;
    3. back-propagate that loss, merge the snapshot back in with
       ``change_gradients`` and take an optimizer step.

    Args:
        model: Called as ``model(**tensors, output_hidden_states=True)``; the
            result must expose ``loss`` and ``hidden_states``. ``model.device``
            is used to place the inputs.
        criteria: Called as ``criteria(hid_ref, hid_cur, target)``.
        optimizer: Optimizer stepped once per batch.
        lr_scheduler: Stepped once per batch after the optimizer; may be None.
        data: Sized iterable of batches; ``batch[0]`` and ``batch[1]`` are
            mappings of keyword name -> tensor.
        n_epochs: Number of passes over ``data``.
        n_mlm: MLM-only batches per cycle.
        n_cosine: Combined batches per cycle.
        division_layer: Index into ``hidden_states`` used for the cosine loss;
            also forwarded to ``save_gradients``.
        weight_mlm: Forwarded to ``change_gradients``.
        weight_cos: Forwarded to ``change_gradients``.

    Raises:
        ValueError: If ``n_mlm + n_cosine`` is not positive.

    Side effects:
        Appends to the module-level lists ``mlm_losses`` and ``cosine_losses``,
        logs running means to wandb, and leaves the model in eval mode.
    """
    global mlm_losses, cosine_losses

    cycle = n_mlm + n_cosine
    if cycle <= 0:
        raise ValueError('n_mlm + n_cosine must be positive')

    model.train()

    tq_epoch = trange(n_epochs, desc='Epochs: ')
    tq_batch = tqdm(total=len(data))

    for _ in tq_epoch:
        tq_batch.reset()

        for i, batch in enumerate(data):

            pred = model(**{k: v.to(model.device) for k, v in batch[0].items()},
                         output_hidden_states=True)

            if i % cycle < n_mlm:
                pred.loss.backward()
                optimizer.step()
                optimizer.zero_grad(set_to_none=True)
                mlm_losses.append(pred.loss.detach().cpu())

            else:
                # The graph of `pred` is needed again by the cosine loss below.
                pred.loss.backward(retain_graph=True)
                grads = save_gradients(model, division_layer)
                # set_to_none drops the gradient tensors instead of zeroing them
                # in place, so the snapshot above cannot be overwritten and
                # parameters untouched by the cosine loss are skipped by the
                # optimizer, independent of the installed PyTorch version.
                optimizer.zero_grad(set_to_none=True)

                pred_new = model(**{k: v.to(model.device) for k, v in batch[1].items()},
                                 output_hidden_states=True)

                hid_ref = torch.mean(pred.hidden_states[division_layer], dim=1)
                hid_cur = torch.mean(pred_new.hidden_states[division_layer], dim=1)

                # Size the target from the actual batch: the last batch of an
                # epoch can hold fewer samples than BATCH_SIZE.
                target = torch.ones(hid_ref.size(0), device=hid_ref.device)

                cos_loss = criteria(hid_ref, hid_cur, target)
                cos_loss.backward()

                change_gradients(model, grads, weight_mlm=weight_mlm,
                                 weight_cos=weight_cos)

                optimizer.step()
                optimizer.zero_grad(set_to_none=True)

                cosine_losses.append(cos_loss.detach().cpu())

            # The scheduler was sized for one step per batch, so advance it here
            # (after optimizer.step(), which both branches have just called).
            if lr_scheduler is not None:
                lr_scheduler.step()

            mlm_avg = _running_mean(mlm_losses)
            cos_avg = _running_mean(cosine_losses)

            wandb.log({"MLM loss": mlm_avg, "Cosine loss": cos_avg})
            tq_batch.set_postfix({
                    'MLM loss': mlm_avg,
                    'Cosine loss': cos_avg
                })

            tq_batch.update(1)

    tq_batch.close()
    model.eval()

In [17]:
# Load the pretrained masked-LM checkpoint and move it to the selected device.
# Ending the cell with an assignment keeps the model repr out of the output,
# which the previous version achieved with a trailing `pass`.
model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME).to(device)

In [18]:
# One AdamW optimizer over all model parameters.
optimizer = AdamW(model.parameters(), lr=5e-5)
# Handed to train() as `criteria`, where it is called with an all-ones target.
criterion = nn.CosineEmbeddingLoss()

num_epochs = 1
# Schedule length: one scheduler step per batch of `dl`, for every epoch.
num_training_steps = num_epochs * len(dl)
# Linear schedule with no warm-up steps.
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0,
    num_training_steps=num_training_steps
)

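Открытые вопросы:
- Нужны ли отдельные шедулеры и оптимайзеры для MLM-лосса и косинусного лосса?
- Нужно ли замораживать начало модели, когда учим MLM?
- Нужно подобрать количество шагов обучения для каждого лосса (`n_mlm`, `n_cosine`) и веса лоссов (`weight_mlm`, `weight_cos`).
- Какой слой выбрать в качестве `division_layer`? Чем он дальше, тем больше памяти потребуется.
- Нужно ревью кода; было бы здорово оптимизировать потребление памяти.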

In [19]:
# Loss histories that train() appends to (it declares them as globals).
mlm_losses = []
cosine_losses = []

# Per cycle: n_mlm MLM-only batches followed by n_cosine combined batches.
n_mlm = 10
n_cosine = 10
# Hidden-state index compared by the cosine loss; also bounds the saved gradients.
division_layer = 4
# Multipliers for the saved MLM gradient and the cosine-loss gradient.
weight_mlm = 1
weight_cos = 0.5

# n_epochs is passed explicitly so the loop runs for the same number of epochs
# that num_training_steps (and therefore lr_scheduler) was sized for.
train(model, criterion, optimizer, lr_scheduler, dl, n_epochs=num_epochs,
      n_mlm=n_mlm, n_cosine=n_cosine, division_layer=division_layer,
      weight_mlm=weight_mlm, weight_cos=weight_cos)

Epochs:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/50280 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


KeyboardInterrupt: 