# BERT Model (almost)

### Imports

In [None]:
!pip install datasets transformers sentencepiece -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m55.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os
import time
from itertools import chain

from tqdm.notebook import tqdm, trange
import numpy as np
from matplotlib import pyplot as plt

from datasets import load_dataset, load_from_disk
from tokenizers import decoders, models, normalizers, pre_tokenizers, processors, trainers, Tokenizer
from transformers import BertTokenizerFast

from gensim.models import Word2Vec, KeyedVectors
import gensim

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
# from transformers import DataCollatorForLanguageModeling

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# notebook can read and write files stored on Drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [None]:
# Work from the data folder on the mounted Drive so that relative paths used
# later in the notebook resolve inside it.
os.chdir('/content/drive/MyDrive/nnlp/bert/data')

In [None]:
!pwd

/content/drive/MyDrive/nnlp/bert/data


### Functions

In [None]:
def form_emb_tesor(w2v):
    """Stack the vectors stored in ``w2v`` into a single embedding tensor.

    Vectors are looked up by integer key, from ``0`` up to and including the
    largest key found in ``w2v.key_to_index``, so row ``i`` of the result is
    ``w2v[i]``.

    Args:
        w2v: object exposing ``key_to_index`` and ``w2v[key]`` lookup. The
            keys are presumably integer token ids -- the ``max(...) + 1``
            arithmetic below requires that.

    Returns:
        A ``torch.Tensor`` made from the vertically stacked vectors.
    """
    n_rows = max(w2v.key_to_index.keys()) + 1
    rows = [w2v[token_id] for token_id in tqdm(range(n_rows))]
    return torch.from_numpy(np.vstack(rows))

### Dataset

In [None]:
class CustomDataset(Dataset):
    """Sliding-window next-token dataset over one flat stream of token ids.

    Every text of the underlying dataset is tokenized, all resulting id lists
    are concatenated into one long list, and item ``idx`` is the pair
    ``(ids[idx:idx + pad], ids[idx + pad])``: a window of ``pad`` tokens and
    the token that immediately follows it.

    Args:
        tokenizer: callable applied to a list of strings with
            ``padding='max_length'`` and ``max_length=pad + 1``; its result
            must expose ``.data['input_ids']``.
        path: directory holding the dataset saved with ``save_to_disk``. When
            it does not exist, the Russian Wikipedia dump is downloaded and
            its ``train`` split is saved there.
        line: if not ``None``, only the first ``line`` rows are used.
        bsize: number of texts passed to the tokenizer per call.
        pad: length of each input window.
    """

    def __init__(self, tokenizer, path='dataset',
                 line=None, bsize=10, pad=256):

        self.__dataset = None
        self.__text_encodings = None
        self.__pad = pad
        self.__bsize = bsize
        self.__tokenizer = tokenizer

        self.__read_data(line, path)
        self.__prepare_dataset()

    def __read_data(self, line, path):
        """Load the dataset from ``path``, downloading and caching it first if needed."""
        # os.path.exists also works for nested paths, unlike a lookup in
        # os.listdir('.').
        if os.path.exists(path):
            dataset = load_from_disk(path)
        else:
            # load_dataset without a split returns a mapping of splits; keep
            # the same 'train' split that is written to disk so both branches
            # leave `dataset` as the same kind of object.
            dataset = load_dataset('wikimedia/wikipedia', '20231101.ru')['train']
            dataset.save_to_disk(path)

        if line is not None:
            # Clamp so that asking for more rows than exist is not an error.
            dataset = dataset.select(range(min(line, len(dataset))))

        self.__dataset = dataset

    def __batch_iter(self):
        """Yield tokenized id lists for consecutive groups of ``bsize`` texts."""
        for start in range(0, len(self.__dataset), self.__bsize):
            texts = self.__dataset[start:start + self.__bsize]['text']
            # NOTE(review): no truncation argument is passed, so how texts
            # longer than pad + 1 tokens are handled is up to the tokenizer.
            yield self.__tokenizer(texts, padding='max_length',
                                   max_length=self.__pad + 1).data['input_ids']

    def __prepare_dataset(self):
        """Flatten all tokenized texts into one list of token ids."""
        self.__text_encodings = []
        for batch_ids in self.__batch_iter():
            self.__text_encodings.extend(chain.from_iterable(batch_ids))

    def __len__(self):
        # One item per position that still has `pad` inputs plus one target;
        # never negative, otherwise len() itself would raise.
        return max(0, len(self.__text_encodings) - self.__pad)

    def __getitem__(self, idx):
        """Return ``(window, next_token)`` tensors for position ``idx``."""
        end = idx + self.__pad
        # The target is the token right after the window. Indexing `end + 1`
        # skipped a token and ran past the list on the last valid index.
        return (
            torch.tensor(self.__text_encodings[idx:end]),
            torch.tensor(self.__text_encodings[end])
        )

### Model

In [None]:
class GrammarModel(nn.Module):
    """Thin wrapper around a batch-first ``nn.LSTM``.

    Args:
        input_size: feature size of each input step.
        layers: number of stacked LSTM layers.
        lin_hidden: hidden size of the LSTM.
    """

    def __init__(self, input_size, layers=2, lin_hidden=256):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=lin_hidden,
            num_layers=layers,
            batch_first=True,
        )

    def forward(self, inp):
        """Run the LSTM and return its raw result ``(output, (h_n, c_n))``."""
        lstm_result = self.lstm(inp)
        return lstm_result

In [None]:
class BaseEncoderModel(nn.Module):
    """Transformer-encoder model that predicts one token from a fixed-length window.

    Pipeline: pretrained embedding lookup -> two-layer, two-head Transformer
    encoder -> one-layer, single-head Transformer encoder -> flatten ->
    two-layer MLP producing one score per vocabulary entry.

    Args:
        linear_size: input width of the first linear layer. Because the
            encoder output is flattened from dimension 1 on, this has to equal
            ``sequence_length * input_size``.
        output_size: number of output scores (vocabulary size).
        embeddings: 2-D float tensor of pretrained embeddings. It is loaded
            with ``nn.Embedding.from_pretrained`` using its default settings,
            which keep the weights frozen.
        input_size: embedding width, used as ``d_model`` of both encoders.
        lin_hidden: hidden width of the output MLP.
        dim_feed: feed-forward width inside the Transformer layers.
    """

    def __init__(self,
                 linear_size,
                 output_size,
                 embeddings,
                 input_size=300,
                 lin_hidden=512,
                 dim_feed=2048):

        super().__init__()

        self.embedding = nn.Embedding.from_pretrained(embeddings)

        first_layer = nn.TransformerEncoderLayer(d_model=input_size,
            dim_feedforward=dim_feed, nhead=2, batch_first=True)
        self.encoder_first = nn.TransformerEncoder(first_layer, num_layers=2)

        second_layer = nn.TransformerEncoderLayer(d_model=input_size,
            dim_feedforward=dim_feed, nhead=1, batch_first=True)
        self.encoder_second = nn.TransformerEncoder(second_layer, num_layers=1)

        # The head emits raw logits. A trailing ReLU here would clamp every
        # score to >= 0 before CrossEntropyLoss, which expects unnormalized
        # logits, so the final activation is intentionally absent. Module
        # indices of the two Linear layers (1 and 3) are unchanged.
        self.lm_head = nn.Sequential(
            nn.Flatten(start_dim=1),

            nn.Linear(linear_size, lin_hidden),
            nn.ReLU(),

            nn.Linear(lin_hidden, output_size)
        )

    def forward(self, inp):
        """Map token ids ``(batch, seq)`` to logits ``(batch, output_size)``."""
        emb = self.embedding(inp)
        tr_emb1 = self.encoder_first(emb)
        tr_emb2 = self.encoder_second(tr_emb1)
        return self.lm_head(tr_emb2)

In [None]:
class EncoderModel(BaseEncoderModel):
    """``BaseEncoderModel`` with an LSTM branch added between the two encoders.

    The output of the first encoder is passed through ``GrammarModel`` (an
    LSTM) and the LSTM output sequence is added to that encoder output before
    the second encoder runs.

    Args:
        linear_size: input width of the first linear layer of the head
            (``sequence_length * input_size``).
        output_size: number of output scores (vocabulary size).
        embeddings: 2-D float tensor of pretrained embeddings.
        input_size: embedding width / ``d_model`` of the encoders.
        lin_hidden: hidden width of the output MLP.
        dim_feed: feed-forward width inside the Transformer layers.
    """

    def __init__(self,
                 linear_size,
                 output_size,
                 embeddings,
                 input_size=300,
                 lin_hidden=512,
                 dim_feed=2048):

        # Forward the caller's values; hard-coding 300/512/2048 here would
        # silently ignore the constructor arguments.
        super().__init__(linear_size,
                         output_size,
                         embeddings,
                         input_size=input_size,
                         lin_hidden=lin_hidden,
                         dim_feed=dim_feed)

        # The LSTM hidden size must equal input_size so its output can be
        # added to the encoder output. Pass it by keyword: the second
        # positional parameter of GrammarModel is the layer count.
        self.grammar_part = GrammarModel(input_size, lin_hidden=input_size)

    def forward(self, inp):
        """Map token ids ``(batch, seq)`` to logits ``(batch, output_size)``."""
        emb = self.embedding(inp)
        tr_emb1 = self.encoder_first(emb)
        # GrammarModel returns the LSTM tuple (output, (h_n, c_n)); only the
        # per-step output takes part in the residual sum.
        gram_emb, _ = self.grammar_part(tr_emb1)
        tr_emb2 = self.encoder_second(tr_emb1 + gram_emb)
        return self.lm_head(tr_emb2)

### Train loop

In [None]:
def train(model, optimizer, loss_func, data, n_epochs=1):
    """Train ``model`` on ``data`` and plot the loss curve after every epoch.

    Relies on two module-level names: ``device`` (where the model and batches
    are moved) and ``losses`` (a list that receives the detached CPU loss of
    every step, so the history survives across calls and interruptions).

    Args:
        model: module called as ``model(seq)``.
        optimizer: optimizer over ``model``'s parameters.
        loss_func: callable ``loss_func(pred, target)`` returning a scalar
            tensor.
        data: sized iterable yielding ``(seq, target)`` tensor pairs.
        n_epochs: number of passes over ``data``.

    The model is left in eval mode when training finishes.
    """
    global losses

    model.to(device)
    model.train()

    for _ in trange(n_epochs):
        # The context manager closes the per-epoch bar even when the loop is
        # interrupted.
        with tqdm(leave=False, total=len(data)) as tq:
            for seq, target in data:
                seq = seq.to(device)
                target = target.to(device)

                pred = model(seq)
                # Use the loss passed by the caller rather than a global.
                loss = loss_func(pred, target)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                losses.append(loss.detach().cpu())

                # Running mean over the last (up to) 100 steps.
                recent = losses[-100:]
                tq.set_postfix({'Loss': (sum(recent) / len(recent)).item()})
                tq.update(1)

        plt.plot(losses)
        plt.yscale('log')
        plt.show()

    model.eval()

### Training

In [None]:
# Number of (window, target) pairs per DataLoader batch.
batch_size = 256
# Length of each input window; passed to CustomDataset as `pad`.
seq_len = 35

# Tokenizer loaded from 'small-bert-tokenizer' (relative to the working directory).
tok = BertTokenizerFast.from_pretrained('small-bert-tokenizer')
# Build the dataset from the first 10000 rows only.
dt = CustomDataset(tok, line=10000, pad=seq_len)
dl = DataLoader(dt, batch_size=batch_size, shuffle=True)

In [None]:
# Load the saved word vectors (memory-mapped, read-only) and stack them into
# a single tensor for the model's embedding layer.
w2v = KeyedVectors.load("word2vec.wordvectors", mmap='r')
embs = form_emb_tesor(w2v)
embs.shape

  0%|          | 0/25000 [00:00<?, ?it/s]

torch.Size([25000, 300])

In [None]:
# Output size of the model: one score per tokenizer vocabulary entry.
ntokens = len(tok.vocab)
# Embedding width; has to match the second dimension of `embs`.
emsize = 300
# Feed-forward width used inside the Transformer layers (passed as dim_feed).
d_hid = 128
# The head flattens the (seq_len, emsize) encoder output, hence
# linear_size = emsize * seq_len.
model = BaseEncoderModel(emsize * seq_len,
                     ntokens,
                     embs,
                     input_size=emsize,
                     lin_hidden=256,
                     dim_feed=d_hid)



In [None]:
# model = EncoderModel(emsize * seq_len,
#                      ntokens,
#                      embs,
#                      input_size=emsize,
#                      lin_hidden=256,
#                      dim_feed=d_hid)

In [None]:
# Cross-entropy over the vocabulary scores; plain SGD over all model parameters.
criterion = nn.CrossEntropyLoss()
lr = 0.5  # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)

In [None]:
# train() appends every step's loss to this module-level list (it declares
# `global losses`), so it must exist before the call.
losses = []
train(model, optimizer, criterion, dl, n_epochs=1)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/132923 [00:00<?, ?it/s]

KeyboardInterrupt: 