# Data loading and preprocessing

In [None]:
import re
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

import torch
from torch.utils.data import IterableDataset, Dataset, DataLoader

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Install the tokenizer dependencies; `transformers` provides the AutoTokenizer
# imported in the next cell. Versions are not pinned, so a re-run may resolve to
# different releases than the ones shown in the recorded output.
!pip install sentencepiece transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 8.3 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.20.0-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 59.8 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 49.6 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 6.3 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |██████████████████████████████

In [None]:
from transformers import AutoTokenizer

# Checkpoint id whose tokenizer is loaded; the same tokenizer is later handed to
# the collator and supplies the vocabulary size / padding id for the models.
model_name = 'cointegrated/LaBSE-en-ru'
# Loads the tokenizer that belongs to `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/806 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/509k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
!pip install gdown
!pip install --upgrade --no-cache-dir gdown

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import gdown

## Download preprocessed data

In [None]:
# Fetch the preprocessed data file from Google Drive into the working directory
# as `ru_data.txt`.
# NOTE(review): in the recorded run Drive refused this download ("Too many users
# have viewed or downloaded this file recently"); the loader cell further down
# reads the file from the mounted Drive folder instead of this local copy.
url = 'https://drive.google.com/uc?id=16tuFXIrSs3rH5k-7kGEVLwUSUltA7I-W'
output = 'ru_data.txt'
gdown.download(url, output, quiet=False)

Access denied with the following error:



 	Too many users have viewed or downloaded this file recently. Please
	try accessing the file again later. If the file you are trying to
	access is particularly large or is shared with many people, it may
	take up to 24 hours to be able to view or download the file. If you
	still can't access a file after 24 hours, contact your domain
	administrator. 

You may still be able to access the file from the browser:

	 https://drive.google.com/uc?id=16tuFXIrSs3rH5k-7kGEVLwUSUltA7I-W 



In [None]:
from google.colab import drive
# Mount Google Drive at /content/gdrive (Colab only). Later cells address it via
# the relative path 'gdrive/MyDrive/...', which resolves only while the working
# directory is /content.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Sanity check: list the project folder on the mounted Drive.
!ls gdrive/MyDrive/22spring/diploma

data		      en_ru_translation.ipynb  ru_pretraining.ipynb
en_pretraining.ipynb  files_py		       Untitled0.ipynb


## Dataloader

In [None]:
class DatasetIterable(IterableDataset):
    """Stream a text file line by line without loading it into memory.

    Every item is one raw line of the file, including its trailing newline
    (if the line has one).

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            that non-ASCII text is decoded identically regardless of the
            platform's locale; previously the locale default was used.

    Note:
        Each call to ``__iter__`` starts a fresh pass over the whole file. The
        class does not shard the file between DataLoader workers, so with
        ``num_workers > 0`` every worker would yield every line.
    """

    def __init__(self, file_path, encoding='utf-8'):
        super().__init__()
        self.file_path = file_path
        self.encoding = encoding

    def __iter__(self):
        return self.generator()

    def generator(self):
        """Yield the lines of ``file_path`` one at a time.

        The file is opened when the first item is requested and closed by the
        ``with`` block once the generator finishes or is closed.
        """
        with open(self.file_path, 'r', encoding=self.encoding) as file:
            yield from file

In [None]:
class Collator:
    """Turn a list of raw strings into a padded (source, target) tensor pair."""

    def __init__(self, tokenizer):
        # Callable tokenizer; it must accept `padding` / `return_tensors`, return
        # an object with `.input_ids`, and expose `pad_token_id`.
        self.tokenizer = tokenizer

    def __call__(self, batch):
        """Tokenize ``batch`` and append one extra padding column to every row.

        Args:
            batch: List of strings, one per sample.

        Returns:
            A tuple of two tensors with identical contents. Each has one row
            per sample and one more column than the tokenizer's padded output;
            the added last column holds ``pad_token_id``. The second tensor is
            an independent clone of the first.
        """
        encoded = self.tokenizer(batch, padding=True, return_tensors='pt')
        token_ids = encoded.input_ids

        # One column of padding ids, one entry per sample.
        pad_column = torch.full(
            size=(len(batch), 1),
            fill_value=self.tokenizer.pad_token_id,
        )
        padded_ids = torch.cat((token_ids, pad_column), dim=1)

        return padded_ids, padded_ids.clone()

In [None]:
# Number of lines per batch produced by the loader. A later cell derives the
# accumulation step count from this value (128 // batch_size).
b_size = 2

# Alternative source when the gdown download succeeded:
# file_path = 'ru_data.txt'
# Read the data file from the mounted Drive folder (relative to the working directory).
file_path = 'gdrive/MyDrive/22spring/diploma/data/ru_data.txt'
ru_dataset = DatasetIterable(file_path)
# The dataset is iterable-style, so batches follow file order; the collator
# tokenizes and pads each batch.
ru_loader = DataLoader(
    ru_dataset,
    batch_size=b_size,
    collate_fn=Collator(tokenizer)
)

# Models

In [None]:
# Download `models.py` into the working directory; the next cell imports
# Encoder, Decoder and Seq2Seq from it.
# NOTE(review): this is code fetched at run time from an unversioned Drive link —
# confirm the file's contents before relying on a re-run.
url = 'https://drive.google.com/uc?id=1wLKOmbDqhUiy4rqEl6Iy8tbhusnJlqor'
output = 'models.py'
gdown.download(url, output, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1wLKOmbDqhUiy4rqEl6Iy8tbhusnJlqor
To: /content/models.py
100%|██████████| 3.45k/3.45k [00:00<00:00, 1.06MB/s]


'models.py'

In [None]:
from models import Encoder, Decoder, Seq2Seq

# Training

In [None]:
# Download `trainers.py` into the working directory; a later cell imports
# Seq2SeqTrainer from it.
# NOTE(review): the saved output of this cell reports a different Drive id
# (1DX9zK9G3RCRQC7HWfT3Dq7CxMMsBqy1C) than the URL below, so the output predates
# this URL — confirm which revision of trainers.py is intended.
url = 'https://drive.google.com/uc?id=1ZySa1Zu1PpOt9qVugDhAaoVe6FO_VLY0'
output = 'trainers.py'
gdown.download(url, output, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1DX9zK9G3RCRQC7HWfT3Dq7CxMMsBqy1C
To: /content/trainers.py
100%|██████████| 2.17k/2.17k [00:00<00:00, 5.35MB/s]


'trainers.py'

In [None]:
import torch.nn as nn
from torch.optim import Adam
from torch.optim.lr_scheduler import ExponentialLR, LinearLR
from trainers import Seq2SeqTrainer

In [None]:
# Embedding and hidden dimensions shared by the encoder and the decoder.
emb_size = 512
hid_size = 512

# Both halves of the model take the same constructor arguments: vocabulary size
# and padding id come from the tokenizer, the dimensions from the values above.
ru_module_kwargs = dict(
    voc_size=tokenizer.vocab_size,
    emb_size=emb_size,
    padding_idx=tokenizer.pad_token_id,
    hid_size=hid_size,
)

ru_encoder = Encoder(**ru_module_kwargs)
ru_decoder = Decoder(**ru_module_kwargs)
ru_seq2seq = Seq2Seq(ru_encoder, ru_decoder)

In [None]:
# Number of lines (samples) in ru_data.txt, hard-coded to avoid a full pass over
# the file on every run. Presumably obtained with the expression below — re-run
# it if the data file changes.
# n_ru_data_sample = sum(1 for _ in open('ru_data.txt', 'r'))
n_ru_data_sample = 23718533

In [None]:
# Per-step multiplicative decay factor of a geometric learning-rate schedule:
# start_lr * gamma**n_iters = end_lr  ->  gamma = (end_lr / start_lr)**(1 / n_iters)
# n_iters is the number of batches in 1.5 passes over the data.
scheduler_n_iters = int(1.5 * n_ru_data_sample / ru_loader.batch_size)
start_lr = 5e-4
end_lr = 8e-5
# With n_iters this large gamma is only marginally below 1.
gamma = (end_lr / start_lr)**(1 / scheduler_n_iters)

In [None]:
# Epoch count handed to the trainer.
n_epochs = 5

ru_opt = Adam(ru_seq2seq.parameters(), lr=start_lr)
# `gamma` is a per-step multiplicative factor chosen so that
# start_lr * gamma**scheduler_n_iters == end_lr, which is exactly the `gamma`
# argument of ExponentialLR. It was previously passed to LinearLR as
# `start_factor`; since gamma is ~1 and LinearLR's `end_factor` defaults to 1.0,
# that left the learning rate essentially constant and never near `end_lr`.
# NOTE(review): ExponentialLR keeps decaying on every scheduler step, also past
# `scheduler_n_iters` steps — confirm how often the trainer steps the scheduler.
ru_scheduler = ExponentialLR(ru_opt, gamma=gamma)

# Positions whose target is the padding id do not contribute to the loss.
ru_criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

In [None]:
# Train on the GPU when one is visible to PyTorch, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


In [None]:
# 128 // batch_size (64 with the batch size of 2 set above).
# Presumably the number of batches whose gradients are accumulated per optimizer
# update, targeting an effective batch of 128 samples — confirm in trainers.py.
acc_steps = 128 // ru_loader.batch_size
# The trainer is given the scheduler but no optimizer argument.
trainer = Seq2SeqTrainer(ru_seq2seq, ru_scheduler, ru_criterion, device, acc_steps)

In [None]:
# Start training on `ru_loader`, passing `n_epochs` as the epoch count; the
# training loop itself is defined in trainers.py.
trainer.train(ru_loader, n_epochs)