# Data loading and preprocessing

In [1]:
import re

import torch
from torch.utils.data import IterableDataset, Dataset, DataLoader

from tqdm import tqdm

In [2]:
!pip install sentencepiece transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 3.6 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.20.0-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 57.1 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 57.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 68.0 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████

In [3]:
from transformers import AutoTokenizer

# Hub checkpoint whose tokenizer is shared by the English and Russian sides.
model_name = 'cointegrated/LaBSE-en-ru'
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

Downloading:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/806 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/509k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [4]:
!pip install gdown
!pip install --upgrade --no-cache-dir gdown

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
import gdown

In [6]:
# Fetch the EN-RU translation corpus as data.txt in the working directory.
# -nc (no-clobber) skips the download when data.txt already exists, so the
# cell is safe to re-run.
!wget -nc https://raw.githubusercontent.com/girafe-ai/ml-mipt/master/datasets/Machine_translation_EN_RU/data.txt

--2022-06-20 12:41:00--  https://raw.githubusercontent.com/girafe-ai/ml-mipt/master/datasets/Machine_translation_EN_RU/data.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12905334 (12M) [text/plain]
Saving to: ‘data.txt’


2022-06-20 12:41:00 (83.4 MB/s) - ‘data.txt’ saved [12905334/12905334]



In [7]:
# Parse the tab-separated corpus into lower-cased [english, russian] rows.
# The encoding is pinned to UTF-8: the file contains Cyrillic text, and when
# `encoding` is omitted `open()` falls back to a platform-dependent default
# that is not UTF-8 everywhere.
with open('data.txt', 'r', encoding='utf-8') as file:
    data = [
        [field.lower() for field in line.strip().split('\t')]
        for line in file
    ]

# Peek at the first rows to sanity-check the parse.
data[:3]

[['cordelia hotel is situated in tbilisi, a 3-minute walk away from saint trinity church.',
  'отель cordelia расположен в тбилиси, в 3 минутах ходьбы от свято-троицкого собора.'],
 ['at tupirmarka lodge you will find a 24-hour front desk, room service, and a snack bar.',
  'в числе удобств лоджа tupirmarka круглосуточная стойка регистрации и снэк-бар. гости могут воспользоваться услугой доставки еды и напитков в номер.'],
 ['featuring free wifi in all areas, naigao xiaowo offers accommodation in shanghai.',
  'апартаменты naigao xiaowo расположены в городе шанхай. к услугам гостей бесплатный wi-fi во всех зонах.']]

## DataLoader

In [8]:
class TranslationDataset(Dataset):
    """Map-style dataset over rows of paired texts.

    Each row must be indexable with at least two fields; in this notebook a
    row is ``[english_text, russian_text]``.
    """

    def __init__(self, data):
        """Keep a reference to ``data`` (no copy is made)."""
        self.data = data

    def __len__(self):
        """Return the number of rows in the underlying data."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the first two fields of row ``idx`` as a 2-tuple."""
        row = self.data[idx]
        return row[0], row[1]

In [9]:
class Collator:
    """Collate function that turns a batch of text pairs into two id tensors.

    Each side of the pair is tokenized separately with ``padding=True`` and
    then extended by one extra column filled with the tokenizer's pad id.
    """

    def __init__(self, tokenizer):
        """Store the tokenizer used for both sides of every pair.

        Args:
            tokenizer: Callable invoked as
                ``tokenizer(texts, padding=True, return_tensors='pt')`` whose
                result exposes ``input_ids``; it must also provide
                ``pad_token_id``.
        """
        self.tokenizer = tokenizer

    def _encode(self, texts):
        """Tokenize ``texts`` and append one trailing column of pad ids.

        Args:
            texts: List of strings belonging to one side of the batch.

        Returns:
            Tensor with one row per text and one more column than the
            tokenizer's padded ``input_ids``.
        """
        ids = self.tokenizer(texts, padding=True, return_tensors='pt').input_ids
        # new_full inherits dtype and device from `ids`, so the concatenation
        # below can never fail on a dtype/device mismatch.
        # NOTE(review): the extra column guarantees every row ends with a pad
        # id; presumably the downstream models rely on that — confirm.
        pad_column = ids.new_full((len(texts), 1), self.tokenizer.pad_token_id)
        return torch.cat((ids, pad_column), dim=1)

    def __call__(self, batch):
        """Collate a list of ``(first_text, second_text)`` pairs.

        Args:
            batch: Sequence of pairs; element 0 is tokenized into the first
                returned tensor and element 1 into the second.

        Returns:
            Tuple ``(en_batch, ru_batch)`` of padded id tensors.
        """
        en_texts = [pair[0] for pair in batch]
        ru_texts = [pair[1] for pair in batch]
        return self._encode(en_texts), self._encode(ru_texts)

In [10]:
b_size = 32

# Wrap the parsed pairs in a dataset and serve them as shuffled mini-batches.
# Collator tokenizes each batch of raw text pairs inside the loader workers.
translation_train_dataset = TranslationDataset(data)
translation_train_loader = DataLoader(
    dataset=translation_train_dataset,
    batch_size=b_size,
    shuffle=True,
    collate_fn=Collator(tokenizer),
    num_workers=2,
)

# Models

In [11]:
# Download models.py from Google Drive into the working directory.
# NOTE(review): this file is imported as code by the following cell and is
# not pinned to a revision or checksum — confirm the source is trusted.
url = 'https://drive.google.com/uc?id=1wLKOmbDqhUiy4rqEl6Iy8tbhusnJlqor'
output = 'models.py'
gdown.download(url=url, output=output, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1wLKOmbDqhUiy4rqEl6Iy8tbhusnJlqor
To: /content/models.py
100%|██████████| 3.45k/3.45k [00:00<00:00, 7.45MB/s]


'models.py'

In [12]:
from models import Encoder, Decoder, Seq2Seq, FeedForward1

# Training

In [13]:
# Download trainers.py from Google Drive into the working directory.
# NOTE(review): this file is imported as code by the following cell and is
# not pinned to a revision or checksum — confirm the source is trusted.
url = 'https://drive.google.com/uc?id=1ZySa1Zu1PpOt9qVugDhAaoVe6FO_VLY0'
output = 'trainers.py'
gdown.download(url=url, output=output, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1ZySa1Zu1PpOt9qVugDhAaoVe6FO_VLY0
To: /content/trainers.py
100%|██████████| 5.30k/5.30k [00:00<00:00, 8.88MB/s]


'trainers.py'

In [14]:
import torch.nn as nn
from torch.optim import Adam
from trainers import Seq2SeqTrainer, FFTrainer

In [15]:
emb_size = 512
hid_size = 512

# Two separate Encoder instances built with identical hyper-parameters:
# the first is bound to en_encoder, the second to ru_encoder.
en_encoder, ru_encoder = (
    Encoder(
        voc_size=tokenizer.vocab_size,
        emb_size=emb_size,
        padding_idx=tokenizer.pad_token_id,
        hid_size=hid_size,
    )
    for _ in range(2)
)

# FeedForward1 is sized from the English encoder's hid_size attribute.
ff = FeedForward1(en_encoder.hid_size)

In [21]:
n_epochs = 1

# Mean-squared-error objective.
criterion = nn.MSELoss()

# Only `ff`'s parameters are handed to the optimizer.
# NOTE(review): en_encoder / ru_encoder parameters are not optimized here —
# confirm the encoders are meant to stay fixed during this training run.
opt = torch.optim.Adam(params=ff.parameters(), lr=8e-5)

In [22]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


In [23]:
# Number of loader batches that make up 256 samples (256 // 32 == 8 with the
# current loader). Clamped to at least 1 so that a batch size above 256 can
# never produce acc_steps == 0.
# NOTE(review): presumably FFTrainer accumulates gradients over `acc_steps`
# batches before each optimizer step — confirm in trainers.py.
acc_steps = max(1, 256 // translation_train_loader.batch_size)
ff_trainer = FFTrainer(en_encoder, ru_encoder, ff, opt, criterion, device, acc_steps)

In [None]:
# Start training with the shuffled training loader and n_epochs.
# NOTE(review): argument semantics are defined in the downloaded trainers.py,
# which is not shown here — presumably the second argument is the epoch count.
ff_trainer.train(translation_train_loader, n_epochs)