# Bert Analytics

## Imports

In [1]:
!pip install transformers datasets evaluate wandb umap-learn hdbscan minicons seaborn

In [None]:
import datasets
import transformers

from transformers import (
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    AutoModelForMaskedLM,
)

import torch
from torch.utils.data import Dataset, DataLoader

from tqdm.auto import trange, tqdm
import pandas as pd
import numpy as np
import minicons.scorer
from mlm_scoring import ppl_mlm_score
from functools import partial, partialmethod
import pickle


import umap
from matplotlib import pyplot as plt
import hdbscan
import seaborn as sns

In [2]:
import sys
from pathlib import Path
sys.path.append(str(Path('.').absolute().parent))

from MinioHandler import MinioHandler

In [3]:
minio = MinioHandler()

In [3]:
# from google.colab import drive
# drive.mount('/content/drive')

## Parameters

In [4]:
# Prefer the GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [5]:
# Tokenization length and batch size used by `PairsDataset` / the `dl` loader.
# NOTE: both names are assigned again in the "MLM scoring" section further down.
SEQ_LEN = 64
BATCH_SIZE = 32

# Input CSV, Hugging Face model id, and checkpoint key (fetched through `minio`).
DATA_PATH = '../data/test_dataset.csv'
MODEL_NAME = 'DeepPavlov/rubert-base-cased'
WEIGHTS_PATH = "ckpt/pretrained_bert/model_epoch_10.pt"

## Data Preparation

In [6]:
# Read the sentence pairs (first CSV column is the index) and drop the
# `was_changed` column in one chained expression rather than mutating in place.
df = pd.read_csv(DATA_PATH, index_col=0).drop(columns=["was_changed"])
data = datasets.Dataset.from_pandas(df, preserve_index=False)

# data = data.train_test_split(test_size=TEST_SIZE)

In [7]:
data

Dataset({
    features: ['base', 'polypers'],
    num_rows: 229849
})

In [8]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [9]:
class PairsDataset(Dataset):
    """Map-style dataset yielding a tokenized (`base`, `polypers`) text pair.

    Args:
        data: indexable collection whose items expose the keys ``'base'`` and
            ``'polypers'``.
        tokenizer: callable applied to each text with ``padding='max_length'``,
            ``truncation=True``, ``max_length=...`` and ``return_tensors='pt'``.
        max_length: padding/truncation length. ``None`` (the default) falls
            back to the notebook-level ``SEQ_LEN``, read at lookup time, which
            is the original behaviour.
    """

    def __init__(self, data, tokenizer, max_length=None):
        self.dataset = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of pairs in the underlying collection."""
        return len(self.dataset)

    def _encode(self, text):
        """Tokenize one text, padded/truncated to the configured length."""
        limit = SEQ_LEN if self.max_length is None else self.max_length
        return self.tokenizer(text,
                              padding='max_length',
                              truncation=True,
                              max_length=limit,
                              return_tensors='pt')

    def __getitem__(self, idx):
        """Return ``(encoded_base, encoded_polypers)`` for row ``idx``."""
        # Fetch the row once; indexing the dataset separately for each column
        # repeated the lookup for every item.
        row = self.dataset[idx]
        return self._encode(row['base']), self._encode(row['polypers'])

In [None]:
# tokenizer.pad_token = '[SEP]'
# tokenizer.eos_token = '[SEP]'
# data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=MLM_PROB)

## Model

In [None]:
def _to_model_inputs(encoded, device):
    """Move one batch of tokenizer output to ``device`` and drop the extra axis.

    Only a singleton axis at position 1 of a 3-D tensor is removed. A bare
    ``squeeze()`` would also collapse the batch axis whenever a batch holds a
    single sample (e.g. the last, incomplete batch).
    """
    inputs = {}
    for key, tensor in encoded.items():
        tensor = tensor.to(device)
        if tensor.dim() == 3 and tensor.size(1) == 1:
            tensor = tensor.squeeze(1)
        inputs[key] = tensor
    return inputs


def _mean_hidden_state(model, encoded, layer):
    """Run ``model`` on ``encoded`` and average ``hidden_states[layer]`` over dim 1."""
    outputs = model(**_to_model_inputs(encoded, model.device),
                    output_hidden_states=True)
    # NOTE(review): the mean runs over every position on dim 1; if the inputs
    # are padded, padded positions contribute as well — confirm this is intended.
    return torch.mean(outputs.hidden_states[layer], dim=1)


def vectorize(model, data, metric, division_layer=3):
    """Compare mean-pooled hidden states of paired inputs, batch by batch.

    Args:
        model: callable module exposing ``eval()``, ``device`` and, when called
            with ``output_hidden_states=True``, an output with ``hidden_states``.
        data: iterable of batches; ``batch[0]`` and ``batch[1]`` are mappings of
            keyword name -> tensor for the reference and the compared text.
        metric: callable ``metric(ref, cur)`` applied to the two pooled tensors.
        division_layer: index into ``hidden_states`` selecting the layer to pool.

    Returns:
        The per-batch metric results stacked with ``torch.vstack``; an empty
        tensor when ``data`` yields no batches.
    """
    model.eval()

    res = []

    with torch.no_grad():
        for batch in tqdm(data):
            ref = _mean_hidden_state(model, batch[0], division_layer)
            cur = _mean_hidden_state(model, batch[1], division_layer)
            res.append(metric(ref, cur))

    if not res:
        # torch.vstack raises on an empty list; return an empty result instead.
        return torch.empty(0, device=model.device)

    return torch.vstack(res)

In [None]:
# Load the pretrained masked-LM and move it to the selected device.
# `.to(device)` returns the module itself, so `model` is bound to the same
# object as before; the assignment also keeps the cell free of output.
model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME).to(device)

In [None]:
# Use the `tokenizer` loaded in "Data Preparation". `mlm_model` is only created
# further down (MLM scoring section), so referencing it here raises NameError
# on a fresh Restart & Run All.
N_PAIRS = 10_000  # number of leading rows used for the embedding analysis
dt = PairsDataset(data.select(range(min(N_PAIRS, len(data)))), tokenizer)
dl = DataLoader(dt, batch_size=BATCH_SIZE, shuffle=False)

### MLM scoring

In [10]:
# Overrides the BATCH_SIZE = 32 set in the "Parameters" section.
BATCH_SIZE = 3
# NOTE: `override_padding` below forwards this to the tokenizer as
# `max_length=SEQ_LEN + 2`, so it bounds the tokenized length, not a word count
# (the word-based cut in the next cell is disabled).
# Presumably the +2 leaves room for the special tokens — TODO confirm.
SEQ_LEN = 64

In [8]:
# not needed anymore
# data_cut = data.map(lambda x: {k: " ".join(v.split()[:SEQ_LEN]) for k, v in x.items()})

In [11]:
# Loader over the untokenized dataset rows, in dataset order, for MLM scoring.
token_dl = DataLoader(data, batch_size=BATCH_SIZE, shuffle=False)

In [12]:
# Reuse the device selected in "Parameters" instead of hard-coding 'cuda',
# so the notebook also runs on CPU-only machines.
mlm_model = minicons.scorer.MaskedLMScorer(
    MODEL_NAME, device.type, torch_dtype=torch.float32)

  return self.fget.__get__(instance, owner)()


In [13]:
type(mlm_model.tokenizer)

transformers.models.bert.tokenization_bert_fast.BertTokenizerFast

In [14]:
def override_padding(func):
    """Wrap a callable so it is always invoked with a fixed truncation length.

    Any ``max_length`` / ``truncation`` keyword supplied by the caller is
    discarded and replaced with ``max_length=SEQ_LEN + 2`` and
    ``truncation=True``; all other positional and keyword arguments are
    forwarded unchanged. ``SEQ_LEN`` is read when the wrapper is called, not
    when ``func`` is wrapped.

    Args:
        func: the callable to wrap.

    Returns:
        The wrapping function.
    """
    # Local import: the file-level functools import does not include `wraps`.
    from functools import wraps

    @wraps(func)
    def wrapper(*args, **kwargs):
        # Remove each key independently. With two `del` statements in one
        # `try`, a missing "max_length" skipped the removal of "truncation",
        # and the call below then failed with a duplicate-keyword TypeError.
        kwargs.pop("max_length", None)
        kwargs.pop("truncation", None)
        return func(*args,
                    max_length=SEQ_LEN + 2,
                    truncation=True,
                    **kwargs)
    return wrapper

In [15]:
# This step is crucial: otherwise the kernel crashes with OOM when a long sentence is passed.
#
# Build a subclass of the scorer tokenizer's class whose `__call__`, `encode` and
# `batch_encode_plus` are wrapped by `override_padding`, then re-point the existing
# tokenizer instance at that subclass. The class (not the instance) is patched
# because Python looks up `__call__` on the type.

SEQ_LEN_tokenizer = type('SEQ_LEN_tokenizer', (type(mlm_model.tokenizer), ), 
                         {"__call__": override_padding(type(mlm_model.tokenizer).__call__,),
                         "encode": override_padding(type(mlm_model.tokenizer).encode),
                         "batch_encode_plus": override_padding(type(mlm_model.tokenizer).batch_encode_plus),
                         })

# Swap the class of the live instance so every later call goes through the wrappers.
mlm_model.tokenizer.__class__ = SEQ_LEN_tokenizer

# mlm_model.tokenizer.encode = types.MethodType(partial(mlm_model.tokenizer.encode, max_length=SEQ_LEN), mlm_model.tokenizer, type(mlm_model.tokenizer))
# mlm_model.tokenizer.batch_encode_plus = types.MethodType(partial(mlm_model.tokenizer.batch_encode_plus, max_length=SEQ_LEN), mlm_model.tokenizer, type(mlm_model.tokenizer))

In [16]:
# Fetch the checkpoint and load its weights into the scorer's model.
# NOTE(security): torch.load unpickles the file — only load trusted checkpoints.
ckpt = minio.get_object(WEIGHTS_PATH, type="model")
# map_location="cpu" lets a checkpoint saved on a GPU load on any machine;
# load_state_dict then copies the values into the model's existing parameters.
model_dict = torch.load(ckpt, map_location="cpu")
mlm_model.model.load_state_dict(model_dict["model_state_dict"])
mlm_model.model.eval();

#### Examples

In [None]:
stimuli = ['Маша переигрываетют команду галактических велоцирапторов на их же бурлящем магмой поле.']

In [None]:
# un-normalized sequence score
print(mlm_model.sequence_score(stimuli, reduction = lambda x: -x.sum(0).item(), PLL_metric='within_word_l2r'))

In [None]:
# sequence score with the scorer's default reduction (no custom `reduction` is passed)
# NOTE(review): presumably the default is a per-token mean, i.e. length-normalized —
# confirm against the minicons documentation.
print(mlm_model.sequence_score(stimuli,  PLL_metric='within_word_l2r'))

In [None]:
# original metric, for comparison:
print(mlm_model.sequence_score(stimuli, reduction = lambda x: -x.sum(0).item(), PLL_metric='original'))

In [None]:
# per token mlm score
print(mlm_model.token_score(stimuli, PLL_metric='within_word_l2r'))

In [None]:
# per-token scores with the 'original' PLL metric, for comparison with 'within_word_l2r':
print(mlm_model.token_score(stimuli, PLL_metric='original'))

Test dataset scoring

In [None]:
# Pair each of the first 20 `base` sentences with its score under the
# negated-sum reduction.
base_sample = data["base"][:20]
base_sample_scores = mlm_model.sequence_score(
    base_sample,
    reduction=lambda x: -x.sum(0).item(),
    PLL_metric='within_word_l2r',
)
list(zip(base_sample, base_sample_scores))

In [None]:
# Pair each of the first 20 `polypers` sentences with its score under the
# negated-sum reduction.
polypers_sample = data["polypers"][:20]
polypers_sample_scores = mlm_model.sequence_score(
    polypers_sample,
    reduction=lambda x: -x.sum(0).item(),
    PLL_metric='within_word_l2r',
)
list(zip(polypers_sample, polypers_sample_scores))

#### Tests

In [15]:
# def print_tokens(func):
#     def wrapper(*args, **kwargs):
#         print([item.shape for item in list(args[0])[0]])
#         print()
#         print(kwargs)
#         return func(*args, **kwargs)
#     return wrapper

# mlm_model.compute_stats = print_tokens(mlm_model.compute_stats)

In [16]:
# len(mlm_model.tokenizer.batch_encode_plus([data["base"][170621]])["input_ids"][0])

300

In [17]:
# mlm_model.tokenizer.batch_encode_plus([data["base"][170621]],
#                     add_special_tokens=False,
#                     padding="longest",
#                     return_attention_mask=True,
#                     return_tensors="pt",
#                 )["input_ids"].shape

torch.Size([1, 298])

#### Score test dataset

In [None]:
# Score every batch of `token_dl` with the project helper `ppl_mlm_score`.
# NOTE: this is slow — the recorded progress bar shows 76617 iterations at
# under 3 it/s, i.e. an estimated run time of more than 7 hours.
base_scores, poly_scores, tkn_dict_base, tkn_dict_poly = ppl_mlm_score(mlm_model, token_dl, top_k=5)

  1%|          | 918/76617 [08:43<7:24:18,  2.84it/s] 

In [None]:
# NOTE(review): the original cell stopped at `pickle.dump(` (a syntax error).
# The payload below — the four results of `ppl_mlm_score` — is an assumption;
# confirm it before relying on the file's contents.
with open("./logs/model_epoch_10.pt", "wb") as f:
    pickle.dump(
        {
            "base_scores": base_scores,
            "poly_scores": poly_scores,
            "tkn_dict_base": tkn_dict_base,
            "tkn_dict_poly": tkn_dict_poly,
        },
        f,
    )

Metrics:
- Norm of the difference vector
- Euclidean distance

In [None]:
def metric(x, y):
    """Return the absolute difference ``abs(x - y)`` of the two arguments."""
    return abs(x - y)

In [None]:
vecs = vectorize(model, dl, metric, division_layer=4)
vecs.shape

In [None]:
# Histogram of the L2 norm of each row of `vecs`; labelled so the figure
# can be read on its own.
fig, ax = plt.subplots()
sns.histplot(torch.linalg.vector_norm(vecs, ord=2, dim=1).cpu().numpy(), bins=10, ax=ax)
ax.set(title="L2 norm of per-pair difference vectors",
       xlabel="L2 norm",
       ylabel="Count");

In [None]:
torch.where(vecs.sum(dim=1) != 0)[0].shape

In [None]:
reducer = umap.UMAP(n_neighbors=15,
                    min_dist=0.2,
                    n_components=2,
                    metric='euclidean',
                    random_state=40)

In [None]:
embedding = reducer.fit_transform(vecs.cpu().numpy())

In [None]:
# 2-D UMAP projection of the difference vectors.
fig, ax = plt.subplots()
ax.scatter(embedding[:, 0], embedding[:, 1])
ax.set(title="UMAP projection of difference vectors",
       xlabel="UMAP 1",
       ylabel="UMAP 2");

In [None]:
# Keep only the rows whose components do not sum to zero, then take their mean.
nonzero_rows = vecs.sum(dim=1) != 0
vecs_diff = vecs[nonzero_rows].cpu()
# `vecs_diff` already lives on the CPU, so its mean does too.
mn_vec = vecs_diff.mean(dim=0)

In [None]:
# Cosine similarity of every kept row to the mean row, and each row's L2 norm.
mean_row = mn_vec.unsqueeze(0)
cos = torch.nn.functional.cosine_similarity(vecs_diff, mean_row, dim=1)
norm = torch.linalg.vector_norm(vecs_diff, ord=2, dim=1)

In [None]:
# Direction (cosine to the mean difference vector) against magnitude (L2 norm).
fig, ax = plt.subplots()
ax.scatter(cos, norm)
ax.set(title="Difference vectors: direction vs. magnitude",
       xlabel="Cosine similarity to mean difference vector",
       ylabel="L2 norm");

In [None]:
clusterer = hdbscan.HDBSCAN()

In [None]:
# NOTE(review): the original fitted on `blobs`, a name that is never defined in
# this notebook (NameError). Clustering the UMAP `embedding` is an assumption —
# confirm the intended input.
clusterer.fit(embedding)