# Data loading and preprocessing

In [1]:
import re
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

from tqdm import tqdm

import torch
from torch.utils.data import IterableDataset, Dataset, DataLoader

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
!pip install sentencepiece transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 7.3 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.20.0-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 62.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 75.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 6.2 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |██████████████████████████████

In [3]:
from transformers import AutoTokenizer

# Hugging Face Hub identifier of the checkpoint whose tokenizer is used throughout.
model_name = 'cointegrated/LaBSE-en-ru'
# Load the tokenizer published with that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/806 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/509k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [4]:
!pip install gdown
!pip install --upgrade --no-cache-dir gdown

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
import gdown

## Get data from scratch

In [6]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.3.2-py3-none-any.whl (362 kB)
[K     |████████████████████████████████| 362 kB 7.9 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.5.0-py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 66.6 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 69.4 MB/s 
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 56.0 MB/s 
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |█████████████████████████

In [7]:
from datasets import load_dataset

In [10]:
# Load the "train" split of the `cc_news` dataset through Hugging Face `datasets`.
cc_news = load_dataset('cc_news', split="train")

Downloading builder script:   0%|          | 0.00/1.75k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/932 [00:00<?, ?B/s]

Downloading and preparing dataset cc_news/plain_text (download: 805.98 MiB, generated: 1.88 GiB, post-processed: Unknown size, total: 2.67 GiB) to /root/.cache/huggingface/datasets/cc_news/plain_text/1.0.0/ae469e556251e6e7e20a789f93803c7de19d0c4311b6854ab072fecb4e401bd6...


Downloading data:   0%|          | 0.00/845M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/708241 [00:00<?, ? examples/s]

Dataset cc_news downloaded and prepared to /root/.cache/huggingface/datasets/cc_news/plain_text/1.0.0/ae469e556251e6e7e20a789f93803c7de19d0c4311b6854ab072fecb4e401bd6. Subsequent calls will reuse this data.


In [19]:
# Number of records in the loaded split.
print(len(cc_news))

708241


In [18]:
# Show the raw text of two records with a '###' marker line between them.
print(cc_news[0]['text'], '###', cc_news[10]['text'], sep='\n')

There's a surprising twist to Regina Willoughby's last season with Columbia City Ballet: It's also her 18-year-old daughter Melina's first season with the company. Regina, 40, will retire from the stage in March, just as her daughter starts her own career as a trainee. But for this one season, they're sharing the stage together.
Performing Side-By-Side In The Nutcracker
Regina and Melina are not only dancing in the same Nutcracker this month, they're onstage at the same time: Regina is doing Snow Queen, while Melina is in the snow corps, and they're both in the Arabian divertissement. "It's very surreal to be dancing it together," says Regina. "I don't know that I ever thought Melina would take ballet this far."
Left: Regina and Melina with another company member post-snow scene in 2003. Right: The pair post-snow scene in 2017 (in the same theater)
Keep reading at dancemagazine.com.
###
Though American Ballet Theatre principals James Whiteside and Isabella Boylston have long displayed 

### Preprocessing

In [22]:
# Matches a run of one or more consecutive line-feed characters.
delete_line_sep = re.compile(r'\n+')

In [28]:
# Split every article into lowercase sentences and collect them in one flat list.
data = []

for rec in tqdm(cc_news):

    # Replace line breaks with a single space instead of deleting them.
    # Headlines, captions and paragraphs in the articles are separated only by
    # '\n', so deleting the break glues the last word of one line to the first
    # word of the next and hides the sentence boundary from the tokenizer.
    text = delete_line_sep.sub(' ', rec['text']).lower()
    text = sent_tokenize(text)

    data.extend(text)

100%|██████████| 708241/708241 [07:44<00:00, 1523.84it/s]


## Download the data preprocessed as above

In [7]:
# Fetch the preprocessed corpus file from Google Drive and save it in the
# working directory under the name given by `output`.
url = 'https://drive.google.com/uc?id=1Imd9w580FKGAY4_SsJbpUvoLh_lsoHrc'
output = 'en_data.txt'
gdown.download(url, output, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1Imd9w580FKGAY4_SsJbpUvoLh_lsoHrc
To: /content/en_data.txt
100%|██████████| 1.70G/1.70G [00:06<00:00, 267MB/s]


'en_data.txt'

In [8]:
from google.colab import drive
# Mount Google Drive at /content/gdrive (this only works inside Google Colab).
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [9]:
# List the contents of the project folder on the mounted Drive.
!ls gdrive/MyDrive/22spring/diploma

data		      en_ru_translation.ipynb  ru_pretraining.ipynb
en_pretraining.ipynb  files_py		       Untitled0.ipynb


## Dataloader

In [10]:
class DatasetIterable(IterableDataset):
    """Stream a text file one line at a time without loading it into memory.

    Each yielded item is one line of the file with its trailing newline
    removed. A fresh pass over the file starts on every iteration.

    Args:
        file_path: path of the text file, one sample per line.
        encoding: text encoding used to read the file. Given explicitly so the
            result does not depend on the locale of the machine.
    """

    def __init__(self, file_path, encoding='utf-8'):
        super().__init__()
        self.file_path = file_path
        self.encoding = encoding

    def __iter__(self):
        return self.generator()

    def generator(self):
        """Yield the lines of the file that belong to the current worker."""
        # With DataLoader(num_workers > 0) every worker process gets its own copy
        # of this dataset; without sharding each worker would yield the whole
        # file and every sample would be seen num_workers times per pass.
        worker_info = torch.utils.data.get_worker_info()
        if worker_info is None:
            worker_id, num_workers = 0, 1
        else:
            worker_id, num_workers = worker_info.id, worker_info.num_workers

        with open(self.file_path, 'r', encoding=self.encoding) as file:
            for line_no, line in enumerate(file):
                if line_no % num_workers == worker_id:
                    # Drop only the line terminator; the text itself is kept as is.
                    yield line.rstrip('\n')

In [11]:
class Collator:
    """Turn a list of raw strings into a pair of padded token-id tensors.

    The wrapped tokenizer is called with padding enabled, one extra column of
    pad ids is appended on the right, and the resulting tensor is returned
    together with an independent copy of itself (presumably model input and
    target — confirm against the trainer).
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, batch):
        encoded = self.tokenizer(batch, padding=True, return_tensors='pt')
        token_ids = encoded.input_ids

        # One pad id per sample, shaped as a single trailing column.
        pad_column = torch.full(
            size=(len(batch), 1),
            fill_value=self.tokenizer.pad_token_id
        )
        padded = torch.cat((token_ids, pad_column), dim=1)

        return padded, padded.clone()

In [25]:
b_size = 8

# Read the corpus from the copy on the mounted Google Drive. The gdown cell
# earlier in the notebook saves a file named 'en_data.txt' in the working
# directory; point `file_path` there instead if Drive is not mounted.
file_path = 'gdrive/MyDrive/22spring/diploma/data/en_data.txt'
en_dataset = DatasetIterable(file_path)
# Batches of `b_size` raw lines are tokenized and padded by the Collator.
en_loader = DataLoader(
    en_dataset,
    batch_size=b_size,
    collate_fn=Collator(tokenizer)
)

# Models

In [26]:
# Fetch the module with the model definitions from Google Drive and save it as
# `models.py`; a later cell imports Encoder, Decoder and Seq2Seq from it.
# NOTE(review): this pulls executable code from an unversioned link — confirm
# the downloaded file is the expected revision before running.
url = 'https://drive.google.com/uc?id=1wLKOmbDqhUiy4rqEl6Iy8tbhusnJlqor'
output = 'models.py'
gdown.download(url, output, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1wLKOmbDqhUiy4rqEl6Iy8tbhusnJlqor
To: /content/models.py
100%|██████████| 3.45k/3.45k [00:00<00:00, 7.06MB/s]


'models.py'

In [27]:
from models import Encoder, Decoder, Seq2Seq

# Training

In [28]:
# Fetch the module with the training loop from Google Drive and save it as
# `trainers.py`; a later cell imports Seq2SeqTrainer from it.
# NOTE(review): the saved output of this cell reports a download from a
# different Drive id (1DX9zK9G3RCRQC7HWfT3Dq7CxMMsBqy1C) than the one in `url`,
# so the output is stale — confirm which revision of trainers.py is intended.
url = 'https://drive.google.com/uc?id=1ZySa1Zu1PpOt9qVugDhAaoVe6FO_VLY0'
output = 'trainers.py'
gdown.download(url, output, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1DX9zK9G3RCRQC7HWfT3Dq7CxMMsBqy1C
To: /content/trainers.py
100%|██████████| 2.17k/2.17k [00:00<00:00, 920kB/s]


'trainers.py'

In [29]:
import torch.nn as nn
from torch.optim import Adam
from torch.optim.lr_scheduler import LinearLR
from trainers import Seq2SeqTrainer

In [30]:
emb_size = 512
hid_size = 512

# Encoder and decoder are built with the same hyper-parameters: vocabulary and
# padding id come from the tokenizer, sizes from the two constants above.
shared_model_kwargs = dict(
    voc_size=tokenizer.vocab_size,
    emb_size=emb_size,
    padding_idx=tokenizer.pad_token_id,
    hid_size=hid_size,
)

en_encoder = Encoder(**shared_model_kwargs)
en_decoder = Decoder(**shared_model_kwargs)
en_seq2seq = Seq2Seq(en_encoder, en_decoder)

In [31]:
# Number of samples (lines) in the corpus, hard-coded; presumably the result of
# counting the lines of the corpus file once with the expression below — verify
# if the data file changes.
# n_en_data_sample = sum(1 for _ in open('en_data.txt', 'r'))
n_en_data_sample = 7614863

In [32]:
# Learning-rate schedule parameters.
# `gamma` is a per-step multiplicative decay factor, derived from
# start_lr * gamma**n_iters = end_lr -> gamma = (end_lr / start_lr)**(1 / n_iter)
# NOTE(review): this is the quantity an exponential-decay scheduler expects —
# check that the scheduler it is handed to interprets it that way.
# The schedule length is the number of loader batches in 1.5 passes over the data.
scheduler_n_iters = int(1.5 * n_en_data_sample / en_loader.batch_size)
start_lr = 5e-4
end_lr = 8e-5
gamma = (end_lr / start_lr)**(1 / scheduler_n_iters)

In [33]:
n_epochs = 5

en_opt = Adam(en_seq2seq.parameters(), lr=start_lr)
# LinearLR multiplies the base lr by a factor that moves linearly from
# `start_factor` to `end_factor` over `total_iters` scheduler steps and then
# stays at `end_factor`. Passing the per-step decay `gamma` (a value just below
# 1) as `start_factor`, with the default end_factor of 1.0, kept the lr
# essentially constant at `start_lr`. Explicit factors make the lr go from
# `start_lr` down to `end_lr` and hold there afterwards.
en_scheduler = LinearLR(
    en_opt,
    start_factor=1.0,
    end_factor=end_lr / start_lr,
    total_iters=scheduler_n_iters
)

# Positions holding the pad token do not contribute to the loss.
en_criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

In [34]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


In [35]:
# Target number of samples per accumulated step; `acc_steps` is how many loader
# batches are needed to reach it (presumably the trainer accumulates gradients
# over that many batches — confirm in trainers.py).
effective_batch_size = 32
# Clamp to 1 so a loader batch size above the target cannot produce 0 steps.
acc_steps = max(1, effective_batch_size // en_loader.batch_size)
trainer = Seq2SeqTrainer(en_seq2seq, en_scheduler, en_criterion, device, acc_steps)

In [None]:
# Start training; the loader and the number of epochs are handed to the trainer.
trainer.train(en_loader, n_epochs)