# Work in progress

In [1]:
!pip install datasets transformers einops sacremoses

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.10.1-py3-none-any.whl (469 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m58.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting einops
  Downloading einops-0.6.0-py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.6/41.6 KB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m880.6/880.6 KB[0m [31m45.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting responses<0.19
  Downloading res

In [24]:
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch import nn
import torch
from einops import rearrange
from datasets import load_dataset
from transformers import AutoTokenizer, BertTokenizer

tokenizer_pl = AutoTokenizer.from_pretrained("allegro/herbert-base-cased")
tokenizer_en = BertTokenizer.from_pretrained('bert-base-cased')

In [25]:
class TextDataset(Dataset):
    def __init__(self, split, transform=None, target_transform=None) -> None:
        super().__init__()
        
        dataset = load_dataset("opus_euconst", 'en-pl')

        if split == 'train':
            self.text_en = [tokenizer_en(text['en'], padding='max_length', return_tensors='pt', truncation=True, max_length=50)['input_ids'] for text in dataset['train']['translation'][:9000]]
            self.text_pl = [tokenizer_pl(text['pl'], padding='max_length', return_tensors='pt', truncation=True, max_length=50)['input_ids'] for text in dataset['train']['translation'][:9000]]

            self.mask_en = [tokenizer_en(text['en'], padding='max_length', return_tensors='pt', truncation=True, max_length=50)['attention_mask'] for text in dataset['train']['translation'][:9000]]
            self.mask_pl = [tokenizer_pl(text['pl'], padding='max_length', return_tensors='pt', truncation=True, max_length=50)['attention_mask'] for text in dataset['train']['translation'][:9000]]
            
        elif split == 'validation':
            self.text_en = [tokenizer_en(text['en'], padding='max_length', return_tensors='pt', truncation=True, max_length=50)['input_ids'] for text in dataset['train']['translation'][9000:]]
            self.text_pl = [tokenizer_pl(text['pl'], padding='max_length', return_tensors='pt', truncation=True, max_length=50)['input_ids'] for text in dataset['train']['translation'][9000:]]

            self.mask_en = [tokenizer_en(text['en'], padding='max_length', return_tensors='pt', truncation=True, max_length=50)['attention_mask'] for text in dataset['train']['translation'][9000:]]
            self.mask_pl = [tokenizer_pl(text['pl'], padding='max_length', return_tensors='pt', truncation=True, max_length=50)['attention_mask'] for text in dataset['train']['translation'][9000:]]
        else:
            raise ValueError

        del dataset

    def __len__(self):
        return len(self.text_en)

    def __getitem__(self, idx):
        return self.text_en[idx], self.text_pl[idx], self.mask_en[idx], self.mask_pl[idx]

In [4]:
train_ds = TextDataset('train')
valid_ds = TextDataset('validation')

Downloading builder script:   0%|          | 0.00/3.65k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/223k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/55.7k [00:00<?, ?B/s]

Downloading and preparing dataset opus_euconst/en-pl to /root/.cache/huggingface/datasets/opus_euconst/en-pl/1.0.0/d1e611a011f28fdda67a97024820e0a3813b4e4decca194d9a20b3207a39b908...


Downloading data:   0%|          | 0.00/480k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9938 [00:00<?, ? examples/s]

Dataset opus_euconst downloaded and prepared to /root/.cache/huggingface/datasets/opus_euconst/en-pl/1.0.0/d1e611a011f28fdda67a97024820e0a3813b4e4decca194d9a20b3207a39b908. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]

In [51]:
class Transformer(nn.Module):
    def __init__(self, vocab_size, emb_dim, output_dim, block_size) -> None:
        super().__init__()
        self.vocab_size = vocab_size
        self.emb_dim = emb_dim
        self.output_dim = output_dim

        self.emb = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_dim)
        self.output = nn.Linear(emb_dim, output_dim)
        pos_emb = self._create_pos_emb(block_size, emb_dim)
        self.register_buffer('pos_emb', pos_emb)

    @staticmethod
    def _create_pos_emb(block_size, hidden_size):
        pos_enc = []
        for pos in range(block_size):
            for i in range(hidden_size):
                if i % 2 == 0:
                    pos_enc.append(torch.sin(torch.tensor(pos / (10000 ** (2 * i / hidden_size )))))
                else:
                    pos_enc.append(torch.cos(torch.tensor(pos / (10000 ** (2 * i / hidden_size )))))
        pos_enc = torch.tensor(pos_enc).reshape(block_size, hidden_size)
        return pos_enc
    
    def forward(self, x):
        # B T
        x = self.emb(x)
        x = x + self.pos_emb
        # B T C
        x = self.output(x)
        x = x.squeeze()
        x = rearrange(x, 'B T C -> (B T) C')
        x = torch.softmax(x, dim=1)
        return x

In [52]:
train_loader = DataLoader(train_ds, batch_size=128, shuffle=True, drop_last=True)
valid_loader = DataLoader(valid_ds, batch_size=128, shuffle=False, drop_last=True)

In [53]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [54]:
@torch.no_grad()
def estimate_loss(train_loader, valid_loader, loss_func):
    model.eval()

    train_loss = 0
    for x, target, mask1, mask2 in train_loader:
        x, target, mask1, mask2 = x.to(device), target.to(device), mask1.to(device), mask2.to(device)
        preds = model(x)
        train_loss += loss_func(preds, target.ravel())
    train_loss /= len(train_loader)

    valid_loss = 0
    for x, target, mask1, mask2 in valid_loader:
        x, target, mask1, mask2 = x.to(device), target.to(device), mask1.to(device), mask2.to(device)
        preds = model(x)
        valid_loss += loss_func(preds, target.ravel())
    valid_loss /= len(valid_loader)

    model.train()
    return train_loss, valid_loss

In [55]:
def get_learning_rate(d_model, step_num, warmup_steps=4000):
    return d_model ** (-0.5) * min(step_num ** (-0.5), step_num * warmup_steps ** (-1.5))

In [56]:
d_model = 16
model = Transformer(tokenizer_en.vocab_size, d_model, tokenizer_pl.vocab_size, 50)
model.to(device)

Transformer(
  (emb): Embedding(28996, 16)
  (output): Linear(in_features=16, out_features=50000, bias=True)
)

In [57]:
%%time
loss_func = nn.CrossEntropyLoss(label_smoothing=0.1)

torch.manual_seed(42)
losses = []
for i in range(2):

    lr = get_learning_rate(d_model, i + 1)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, betas=[.9, .98], eps=1e-09)

    for x, target, mask1, mask2 in train_loader:
        x, target, mask1, mask2 = x.to(device), target.to(device), mask1.to(device), mask2.to(device)
        target = target.squeeze().flatten()
        optimizer.zero_grad(set_to_none=True)
        preds = model(x)
        loss = loss_func(preds, target)
        loss.backward()
        optimizer.step()
    train_loss, valid_loss = estimate_loss(train_loader, valid_loader, loss_func)
    print(f"epoch {str(i + 1).zfill(2)}: train loss {train_loss:.6f}, val loss {valid_loss:.6f}")

epoch 01: train loss 10.819779, val loss 10.819777
epoch 02: train loss 10.819779, val loss 10.819776
CPU times: user 30.7 s, sys: 81.4 ms, total: 30.8 s
Wall time: 33.4 s
