# BERT Partition

## Imports

In [1]:
# Install the notebook's third-party dependencies quietly (-q); no versions are pinned.
!pip install transformers[torch] datasets evaluate wandb -q

In [2]:
import datasets
import pickle as pkl

from transformers import (
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    BertForMaskedLM,
    get_scheduler,
    TrainingArguments,
    Trainer
)
import evaluate
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam, SGD

from tqdm.auto import trange, tqdm
import pandas as pd

import wandb

In [3]:
# Colab only: mount Google Drive so that paths under /content/drive
# (see DATA_PATH below) are readable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Parameters

In [4]:
# Authenticate with Weights & Biases before a run is started.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mtokubetsu01[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [5]:
# Start the W&B run that train() later logs "Cosine loss" to.
wandb.init(
    project='grammar-bert-model1',
    entity='grammar-bert',
    group='nika'
)

[34m[1mwandb[0m: Currently logged in as: [33mtokubetsu01[0m ([33mgrammar-bert[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [6]:
# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [7]:
SEQ_LEN = 64  # max_length used by PairsDataset when padding/truncating each text
BATCH_SIZE = 16  # DataLoader batch size; also sizes the loss targets and `vec` below
TEST_SIZE = 0.3  # not referenced in the visible cells
MLM_PROB = 0.15  # mlm_probability handed to DataCollatorForLanguageModeling

# CSV expected to provide the 'initial', 'polypers' and 'was_changed' columns
# that PairsDataset reads.
DATA_PATH = '/content/drive/MyDrive/nnlp/bert/biblioteka_prikluchenij_both_agr.csv'
# DATA_PATH = "data/biblioteka_prikluchenij_both_agr.csv"
MODEL_NAME = 'DeepPavlov/rubert-base-cased'  # checkpoint for both tokenizer and model

### Data Preparation

In [8]:
# df = pd.read_csv(DATA_PATH)
# df = df.drop(columns=['Unnamed: 0'])

In [9]:
# tqdm.pandas()

# idx_init = df.initial.progress_apply(lambda x: x.replace(' ', ''))
# idx_pol = df.polypers.progress_apply(lambda x: x.replace(' ', ''))
# idx = -(idx_init == idx_pol)
# df['was_changed'] = idx

In [10]:
# df.to_csv(DATA_PATH, index=False)

## Dataset and collator

In [11]:
class PairsDataset(Dataset):
    """Dataset of sentence pairs read from the 'initial' and 'polypers' columns.

    Rows are taken from ``data`` when given, otherwise from the CSV at ``path``.
    With ``filter_same=True`` only rows whose ``was_changed`` flag is truthy are
    kept; with ``filter_same=False`` every row is kept. The index is reset so
    items can be addressed by position.

    Each item is a ``(initial_encoding, polypers_encoding)`` tuple, both produced
    by ``tokenizer`` with padding/truncation to the module-level ``SEQ_LEN``.
    """

    def __init__(self,
                 tokenizer,
                 path=None,
                 data=None,
                 filter_same=True):
        frame = pd.read_csv(path) if data is None else data
        # OR-ing with a scalar: when filter_same is False the mask is all True.
        keep_mask = (frame.was_changed) | (not filter_same)
        self.dataset = frame[keep_mask].reset_index(drop=True)
        self.tokenizer = tokenizer

    def __len__(self):
        """Number of pairs left after filtering."""
        return len(self.dataset)

    def _encode(self, text):
        """Tokenize one text, padded and truncated to SEQ_LEN tokens."""
        return self.tokenizer(text,
                              padding='max_length',
                              truncation=True,
                              max_length=SEQ_LEN)

    def __getitem__(self, idx):
        """Return the tokenized ('initial', 'polypers') pair stored at row ``idx``."""
        row = self.dataset.loc[idx]
        return self._encode(row['initial']), self._encode(row['polypers'])

In [12]:
def collate_func(batch):
    """Collate a list of (first, second) encoding pairs into two batches.

    The samples are transposed so that all first elements form one group and
    all second elements another; each group is passed separately to the
    module-level ``data_collator.torch_call``. Returns the collated groups as
    a list, in the same order as the elements of each sample.
    """
    collated = []
    for group in zip(*batch):
        collated.append(data_collator.torch_call(group))
    return collated

In [13]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# NOTE(review): padding and EOS both reuse '[SEP]' instead of dedicated tokens —
# confirm this is intended for this checkpoint.
tokenizer.pad_token = '[SEP]'
tokenizer.eos_token = '[SEP]'
# Module-level collator; collate_func reads it as a global.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=MLM_PROB)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


## MLM Train

In [14]:
# dt = datasets.Dataset.from_csv(DATA_PATH)
# dt = dt.remove_columns(['polypers', 'was_changed']).rename_column('initial', 'text')

In [15]:
# N_samples = 10**5

In [16]:
# def tokenize_function(example):
#     return tokenizer(example['text'], truncation=True)

# tok_dt = dt.select(range(N_samples)).map(tokenize_function, batched=True)
# tok_dt = tok_dt.train_test_split(test_size=100,
#                          shuffle=True,
#                          seed=42)

In [17]:
# training_args = TrainingArguments(
#     report_to = 'wandb',
#     output_dir='part1-model',
#     learning_rate=1e-3,
#     per_device_train_batch_size=16,
#     num_train_epochs=1,
#     # evaluation_strategy='steps',
#     # eval_steps=20,
#     logging_steps=20,
#     logging_first_step=True
# )

In [18]:
# model = BertForMaskedLM.from_pretrained(MODEL_NAME)
# model.to(device)
# pass

In [19]:
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tok_dt['train'],
#     # eval_dataset=tok_dt['test'],
#     tokenizer=tokenizer,
#     data_collator=data_collator
# )

In [20]:
# trainer.train()

## Model

In [21]:
def save_gradients(model, division_layer):
    """Snapshot the gradients of every parameter that precedes an encoder layer.

    Parameters are visited in ``model.named_parameters()`` order and the walk
    stops at the first parameter belonging to
    ``bert.encoder.layer.<division_layer>``. Only parameters that require
    gradients and currently hold one are recorded.

    Args:
        model: module whose parameter names follow the ``bert.encoder.layer.N.``
            scheme.
        division_layer: index of the first encoder layer that is NOT saved.

    Returns:
        dict mapping parameter name to an independent copy of its gradient.
    """
    # The trailing dot makes the match exact: layer "1" must not match "10".
    stop_prefix = f'bert.encoder.layer.{division_layer}.'
    layers = {}

    for name, param in model.named_parameters():
        if name.startswith(stop_prefix):
            break
        if param.requires_grad and param.grad is not None:
            # clone(): detach() alone shares storage with param.grad, so an
            # in-place zero_grad() would silently zero the saved snapshot too.
            layers[name] = param.grad.detach().clone()

    return layers

In [22]:
def change_gradients(model, layers, weight_mlm = 0.5, weight_cos = 1):
    """Blend previously saved gradients into the model's current gradients.

    For every parameter whose name is a key of ``layers`` the gradient becomes
    ``weight_cos * param.grad + weight_mlm * layers[name]``. Parameters without
    a saved gradient are left untouched.

    Args:
        model: module whose ``.grad`` fields are rewritten in place.
        layers: dict of parameter name -> saved gradient (as produced by
            ``save_gradients``); ``None`` or empty makes the call a no-op.
        weight_mlm: coefficient applied to the saved gradient.
        weight_cos: coefficient applied to the current gradient. Anything that
            supports ``weight * number`` is accepted.
    """
    if not layers:
        return

    # Resolve both coefficients once per call. A stateful weight (one whose
    # ``*`` changes its own state) is then consumed once per optimisation step,
    # so all parameters in the step share the same scale.
    cos_scale = weight_cos * 1
    mlm_scale = weight_mlm * 1

    for name, param in model.named_parameters():
        saved = layers.get(name)
        if saved is None:
            # Skip instead of stopping: parameters with no saved gradient
            # (e.g. frozen ones) may come before the ones that have one.
            continue

        mlm_part = mlm_scale * saved
        if param.grad is None:
            # No gradient from the current backward pass: use the saved part.
            param.grad = mlm_part
        else:
            param.grad = cos_scale * param.grad + mlm_part

In [23]:
class CosWeight:
    """Multiplicative weight that shrinks itself every time it is used.

    ``weight * x`` returns ``cur_state * x`` and afterwards replaces
    ``cur_state`` with ``cur_state * step``. Only left-multiplication
    (``weight * x``) is defined.
    """

    def __init__(self, init_state=1, step=0.5):
        self.step = step
        self.cur_state = init_state

    def __mul__(self, other):
        # Compute the product with the current state first, then update it.
        scaled = self.cur_state * other
        self.cur_state = self.cur_state * self.step
        return scaled

    def __repr__(self):
        current = self.cur_state
        return str(current)

In [24]:
class CosLoss:
    """Cosine-embedding loss with an optional direction term.

    The base term is ``CosineEmbeddingLoss(hid_ref, hid_cur, target)``. When
    ``vector`` is given, ``alpha`` times a second cosine-embedding loss is
    added, computed between ``vector`` and ``hid_ref - hid_cur`` with an
    all-ones target.

    Args:
        vector: optional tensor of shape ``(dim,)`` or ``(rows, dim)``. A 2-D
            vector is cut to the first ``batch`` rows, so it must have at least
            as many rows as the largest batch.
        alpha: weight of the direction term.
    """

    def __init__(self, vector=None, alpha=0):
        self.loss = nn.CosineEmbeddingLoss()
        # All-ones target for the direction term. It is created in __call__
        # from the actual batch, so construction needs no global model or
        # batch size and a short final batch still works.
        self.target = None
        self.alpha = alpha
        self.vector = vector

    def _ones_target(self, ref):
        """Return a cached all-ones target matching ``ref``'s batch size and device."""
        batch = ref.size(0)
        cached = self.target
        if cached is None or cached.size(0) != batch or cached.device != ref.device:
            cached = torch.ones(batch, device=ref.device)
            self.target = cached
        return cached

    def _batch_vector(self, batch):
        """Return ``self.vector`` shaped as ``(batch, dim)``."""
        vector = self.vector
        if vector.dim() == 1:
            return vector.unsqueeze(0).expand(batch, -1)
        return vector[:batch]

    def __call__(self, hid_ref, hid_cur, target):
        """Compute the loss for two ``(batch, dim)`` inputs and a ``(batch,)`` target."""
        cos_loss = self.loss(hid_ref, hid_cur, target)
        if self.vector is not None:
            direction = self.loss(self._batch_vector(hid_ref.size(0)),
                                  hid_ref - hid_cur,
                                  self._ones_target(hid_ref))
            cos_loss = cos_loss + self.alpha * direction
        return cos_loss

In [25]:
def train(model, criteria, optimizer, lr_scheduler, data, n_epochs=1,
          n_cosine=10, division_layer=4, weight_mlm=1,
          weight_cos=1):
    """Alternate MLM gradient collection with cosine-loss optimisation steps.

    For each batch ``(first, second)`` yielded by ``data``:

    * when ``i % n_cosine == 0`` the MLM loss on ``first`` is back-propagated,
      the resulting gradients are stored via ``save_gradients`` and the live
      gradients are cleared — no optimizer step is taken;
    * otherwise both inputs are encoded, ``hidden_states[division_layer]`` is
      mean-pooled over dim 1 for each, ``criteria`` is applied with an all
      ``-1`` target, the stored MLM gradients are blended in via
      ``change_gradients`` and the optimizer (then the scheduler) is stepped.

    The running mean of the last 100 cosine losses is logged to W&B and shown
    on the progress bar. Losses are appended to the module-level list
    ``cosine_losses``, which must exist before this function is called.

    Args:
        model: masked-LM model returning ``loss`` and ``hidden_states``.
        criteria: callable ``(hid_ref, hid_cur, target) -> loss``.
        optimizer: optimizer over the model parameters.
        lr_scheduler: scheduler stepped after every optimizer step; may be None.
        data: iterable of two-element batches of tensor dicts, with ``len()``.
        n_epochs: number of passes over ``data``.
        n_cosine: period of the MLM-only batches.
        division_layer: hidden-state index used for the cosine loss; also the
            encoder layer at which gradient saving stops.
        weight_mlm: weight of the stored MLM gradients.
        weight_cos: weight of the cosine-loss gradients.
    """
    global cosine_losses

    model.train()

    tq_epoch = trange(n_epochs, desc='Epochs: ')
    tq_batch = tqdm(total=len(data))

    grads = None

    try:
        for epoch in tq_epoch:
            tq_batch.reset()
            for i, batch in enumerate(data):
                pred = model(**{k: v.to(model.device) for k, v in batch[0].items()},
                             output_hidden_states=True)

                if i % n_cosine == 0:
                    # MLM-only batch: remember its gradients, do not step.
                    pred.loss.backward()
                    grads = save_gradients(model, division_layer)
                    optimizer.zero_grad()

                else:
                    pred_new = model(**{k: v.to(model.device) for k, v in batch[1].items()},
                            output_hidden_states=True)

                    hid_ref = torch.mean(pred.hidden_states[division_layer], dim=1)
                    hid_cur = torch.mean(pred_new.hidden_states[division_layer], dim=1)

                    # Size the target from the actual batch: the last batch of
                    # an epoch can be smaller than the nominal batch size.
                    target = -torch.ones(hid_ref.size(0), device=hid_ref.device)

                    cos_loss = criteria(hid_ref, hid_cur, target)
                    cos_loss.backward()

                    change_gradients(model, grads, weight_mlm=weight_mlm,
                                     weight_cos=weight_cos)

                    optimizer.step()
                    # The scheduler was previously accepted but never advanced.
                    if lr_scheduler is not None:
                        lr_scheduler.step()
                    optimizer.zero_grad()

                    cosine_losses.append(cos_loss.detach().cpu())

                    # Smooth the reported value over the last 100 steps.
                    cos_loss = (sum(cosine_losses[-100:]) / len(cosine_losses[-100:])).item()

                    wandb.log({"Cosine loss": cos_loss})
                    tq_batch.set_postfix({
                            'Cosine loss': cos_loss
                        })

                tq_batch.update(1)
    finally:
        # Runs on interruption too, so the model never stays in train mode and
        # the progress bar is always released.
        tq_batch.close()
        model.eval()

In [26]:
# Read the pairs from DATA_PATH; the default filter_same=True keeps only rows
# whose was_changed flag is set.
dt = PairsDataset(tokenizer, path=DATA_PATH)
# drop_last is not set, so the final batch may hold fewer than BATCH_SIZE pairs.
dl = DataLoader(dt, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_func)

In [27]:
# Load the pretrained masked-LM checkpoint and move it to the selected device.
model = BertForMaskedLM.from_pretrained(MODEL_NAME)
model.to(device)
pass  # ends the cell on a statement so the model repr is not echoed

In [28]:
n_mlm = 2  # not referenced in the visible cells
n_cosine = 8  # train(): batches with i % n_cosine == 0 only collect MLM gradients
division_layer = 3  # encoder layer left trainable; train() is called with division_layer + 1
weight_mlm = 1  # coefficient of the saved MLM gradients in change_gradients
# Stateful weight: every `weight_cos * x` multiplies its state by `step` (default 0.5).
weight_cos = CosWeight(0.5)

In [29]:
# Freeze every parameter except those of encoder layer `division_layer`.
# The trailing dot makes the prefix match exactly one layer: without it,
# division_layer=1 would also unfreeze layers 10 and 11.
for name, param in model.named_parameters():
    param.requires_grad = name.startswith(f"bert.encoder.layer.{division_layer}.")

In [33]:
# Draw one random 768-element vector (mean 0.5, std 0.1) and tile it to
# BATCH_SIZE identical rows; it is passed to CosLoss as `vector` below.
# No seed is set in the visible cells, so the vector differs between runs.
# NOTE(review): 768 is presumably the model's hidden size — confirm.
vec = torch.normal(0.5,
                   0.1,
                   size=(768, ),
                   requires_grad=False).repeat(BATCH_SIZE, 1)

In [34]:
# All model parameters are handed to SGD, including those frozen above.
optimizer = SGD(model.parameters(), lr=5e-4)
criterion = CosLoss(vector=vec.to(model.device), alpha=0.5)

num_epochs = 1
# Counts every batch in `dl`, including the MLM-only batches on which train()
# does not call optimizer.step().
num_training_steps = num_epochs * len(dl)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0,
    num_training_steps=num_training_steps
)

In [35]:
# Module-level list that train() appends to through `global cosine_losses`.
cosine_losses = []

# division_layer + 1 is used by train() both as the hidden_states index and as
# the encoder layer at which gradient saving stops.
train(model, criterion, optimizer, lr_scheduler, dl,
      n_epochs=num_epochs,
      n_cosine=n_cosine, division_layer=division_layer + 1,
      weight_mlm=weight_mlm, weight_cos=weight_cos)

Epochs:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/37379 [00:00<?, ?it/s]

KeyboardInterrupt: 