# Data loading and preprocessing

In [1]:
!pip install sentencepiece transformers

[0m

In [2]:
from transformers import AutoTokenizer

model_name = 'cointegrated/LaBSE-en-ru'
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/806 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/509k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [3]:
!pip install gdown

Collecting gdown
  Downloading gdown-4.4.0.tar.gz (14 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: gdown
  Building wheel for gdown (pyproject.toml) ... [?25ldone
[?25h  Created wheel for gdown: filename=gdown-4.4.0-py3-none-any.whl size=14759 sha256=961f9bbbf65dc5afca54db53bea2b7650eeb7eae002baeaa4ce43ff45838bad1
  Stored in directory: /root/.cache/pip/wheels/fb/c3/0e/c4d8ff8bfcb0461afff199471449f642179b74968c15b7a69c
Successfully built gdown
Installing collected packages: gdown
Successfully installed gdown-4.4.0
[0m

In [4]:
import gdown

## Russian wikipedia

### Dataloading

#### Get data from scratch

In [None]:
!wget https://github.com/buriy/russian-nlp-datasets/releases/download/r4/news-articles-2014.tar.bz2
!wget https://github.com/buriy/russian-nlp-datasets/releases/download/r4/news-articles-2015-part1.tar.bz2
!wget https://github.com/buriy/russian-nlp-datasets/releases/download/r4/news-articles-2015-part2.tar.bz2

In [None]:
!pip install corus

In [None]:
from corus import load_buriy_news


In [None]:
path1 = 'news-articles-2014.tar.bz2'
path2 = 'news-articles-2015-part1.tar.bz2'
path3 = 'news-articles-2015-part2.tar.bz2'

records1 = load_buriy_news(path1)
records2 = load_buriy_news(path2)
records3 = load_buriy_news(path3)

In [None]:
print(next(records1).text)
print('###')
print(next(records2).text)
print('###')
print(next(records3).text)

### Preprocessing

In [None]:
import re
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

In [None]:
delete_line_sep = re.compile('[\n]+')
change_dash_symb = re.compile('\x97')

In [None]:
data = []

for i, records in enumerate([records1, records2, records3]):

    for rec in tqdm(records):
        text = delete_line_sep.sub('', rec.text)
        text = change_dash_symb.sub('-', text).lower()
        text = sent_tokenize(text)
        data.extend(text)

#### Upload data preprocessed as above

In [39]:
url = 'https://drive.google.com/uc?id=16tuFXIrSs3rH5k-7kGEVLwUSUltA7I-W'
output = 'ru_data.txt'
gdown.download(url, output, quiet=False)

Access denied with the following error:



 	Too many users have viewed or downloaded this file recently. Please
	try accessing the file again later. If the file you are trying to
	access is particularly large or is shared with many people, it may
	take up to 24 hours to be able to view or download the file. If you
	still can't access a file after 24 hours, contact your domain
	administrator. 

You may still be able to access the file from the browser:

	 https://drive.google.com/uc?id=16tuFXIrSs3rH5k-7kGEVLwUSUltA7I-W 



### Russian Dataloader

In [10]:
import torch
from torch.utils.data import IterableDataset, Dataset, DataLoader

In [11]:
class RuDatasetIterable(IterableDataset):
    
    def __init__(self, file_path):
        super().__init__()
        self.file_path = file_path
    
    def __iter__(self):
        return self.generator()

    def generator(self):
        with open(self.file_path, 'r') as file:
            for line in file:
                yield line

In [13]:
class RuCollator:

    def __init__(self, tokenizer):

        self.tokenizer = tokenizer
    
    def __call__(self, batch):

        ids = self.tokenizer(batch, padding=True, return_tensors='pt').input_ids
        batch = torch.cat(
            (
                ids, 
                torch.full(
                    size=(len(batch), 1),
                    fill_value=self.tokenizer.pad_token_id
                )
            ),
            dim=1
        )
        return batch, batch.clone()

In [14]:
b_size = 8

ru_dataset = RuDatasetIterable('ru_data.txt')
ru_loader = DataLoader(
    ru_dataset,
    batch_size=b_size,
    collate_fn=RuCollator(tokenizer)
)

## English 

# Models

In [15]:
import torch
import torch.nn as nn

In [16]:
class Encoder(nn.Module):

    def __init__(self, voc_size, emb_size, padding_idx, hid_size):
        super().__init__()

        self.voc_size = voc_size
        self.emb_size = emb_size
        self.padding_idx = padding_idx
        self.hid_size = hid_size

        self.embedding = nn.Embedding(
            num_embeddings=voc_size,
            embedding_dim=emb_size,
            padding_idx=padding_idx
        )
        
        self.rnn = nn.GRU(
            input_size=emb_size,
            hidden_size=hid_size // 2,
            batch_first=True,
            bidirectional=True
        )
    
    def forward(self, batch): # batch: [b_size, seq_len]

        emb = self.embedding(batch) # [b_size, seq_len, emb_size]

        output, _ = self.rnn(emb)
        # output: [b_size, seq_len, hid_size]
        # h: [2, b_size, hid_size // 2]

        return output

In [17]:
class Decoder(nn.Module):

    def __init__(self, voc_size, emb_size, padding_idx, hid_size):
        super().__init__()

        self.voc_size = voc_size
        self.emb_size = emb_size
        self.padding_idx = padding_idx
        self.hid_size = hid_size

        self.embedding = nn.Embedding(
            num_embeddings=voc_size,
            embedding_dim=emb_size,
            padding_idx=padding_idx
        )

        self.rnn = nn.GRU(
            input_size=emb_size,
            hidden_size=hid_size,
            batch_first=True,
            bidirectional=False
        )

        self.to_logits = nn.Linear(hid_size, voc_size)
    
    def forward(self, batch, h): # batch: [b_size, seq_len]

        emb = self.embedding(batch) # [b_size, seq_len, emb_size]

        output, _ = self.rnn(emb) # [b_size, seq_len, hid_size]
        logits = self.to_logits(output) # [b_size, seq_len, voc_size]

        return logits

In [18]:
class Seq2Seq(nn.Module):

    def __init__(self, encoder, decoder, feed_forward=None):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.feed_forward = self.init_feed_forward(encoder.hid_size) if feed_forward is None else feed_forward
    
    def init_feed_forward(self, hid_size):

        linear = nn.Linear(hid_size, hid_size, bias=False)
        linear.W = torch.eye(hid_size)
        linear.W.requires_grad = False
        
        return linear
    
    def forward(self, src, trg):
        
        # src: [b_size, seq_len]
        # trg: [b_size, seq_len]

        src_output = self.encoder(src) # [b_size, seq_len, hid_size]
        last_hidden_state_idx = (src == self.encoder.padding_idx).float().argmax(dim=-1) - 1
        src_h = src_output[:, last_hidden_state_idx, :] # [b_size, hid_size]

        trg_h = self.feed_forward(src_h) # [b_size, hid_size]

        trg_output = self.decoder(trg, trg_h) # [b_size, seq_len, hid_size]

        return trg_output

# Training

In [20]:
import tqdm
import matplotlib.pyplot as plt
from IPython.display import clear_output
from torch.optim import Adam
from torch.optim.lr_scheduler import LinearLR

In [29]:
class Trainer:

    def __init__(self, seq2seq, scheduler, criterion, device, acc_steps=1):

        self.seq2seq = seq2seq.to(device)
        self.scheduler = scheduler
        self.criterion = criterion
        self.device = device
        self.acc_steps = acc_steps

        self.train_loss_history = []
        self.val_loss_history = []
    
    def train(self, train_loader, n_epochs, val_loader=None):

        for _ in range(n_epochs):

            self.train_loop(train_loader)

            if val_loader is not None:
                self.val_loader(val_loader)
    
    def train_loop(self, loader):

        self.seq2seq.train()

        acc_iter = 1

        for i, (src, trg) in tqdm.tqdm(enumerate(loader)):

            self.scheduler.optimizer.zero_grad()

            src, trg = src.to(self.device), trg.to(self.device)

            output = self.seq2seq(src, trg)
            loss = self.criterion(output[:, :-1, :].permute(0, 2, 1), trg[:, 1:])
            
            loss.backward()
            
            if acc_iter == self.acc_steps:
                self.scheduler.optimizer.step()
                acc_iter = 0
            
            acc_iter += 1

            self.train_loss_history.append(loss.item() / src.shape[0])

            if (i + 1) % 1 == 0:
                self.plot_loss()
        
    def val_loop(self, loader):

        self.seq2seq.eval()

        with torch.no_grad():

            for i, (src, trg) in enumerate(loader):

                src, trg = src.to(self.device), trg.to(self.device)

                output = self.seq2seq(src, trg)
                loss = self.criterion(output[:, :-1, :].permute(0, 2, 1), trg[:, 1:])

                self.val_loss_history.append(loss.item() / src.shape[0])

                if (i + 1) % 10 == 0:
                    self.plot_loss()

    def plot_loss(self):

        clear_output(wait=True)
        plt.plot(self.train_loss_history)
        plt.plot(self.val_loss_history)
        plt.show()


## Russian2Russian model

In [30]:
emb_size = 512
hid_size = 768

ru_encoder = Encoder(
    voc_size=tokenizer.vocab_size,
    emb_size=emb_size,
    padding_idx=tokenizer.pad_token_id,
    hid_size=hid_size
)
ru_decoder = Decoder(
    voc_size=tokenizer.vocab_size,
    emb_size=emb_size,
    padding_idx=tokenizer.pad_token_id,
    hid_size=hid_size
)
ru_seq2seq = Seq2Seq(ru_encoder, ru_decoder)

In [31]:
# n_ru_data_sample = sum(1 for _ in open('ru_data.txt', 'r'))
n_ru_data_sample = 23718533

In [32]:
# start_lr * gamma**n_iters = end_lr -> gamma = (end_lr / start_lr)**(1 / n_iter)
scheduler_n_iters = int(1.5 * n_ru_data_sample / ru_loader.batch_size)
start_lr = 5e-4
end_lr = 8e-5
gamma = (end_lr / start_lr)**(1 / scheduler_n_iters)

In [33]:
n_epochs = 5

ru_opt = Adam(ru_seq2seq.parameters(), lr=start_lr)
ru_scheduler = LinearLR(ru_opt, start_factor=gamma, total_iters=scheduler_n_iters)

ru_criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

In [34]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [35]:
acc_steps = 128 // ru_loader.batch_size
trainer = Trainer(ru_seq2seq, ru_scheduler, ru_criterion, device, acc_steps)

In [36]:
trainer.train(ru_loader, n_epochs)

0it [00:00, ?it/s]


FileNotFoundError: [Errno 2] No such file or directory: 'ru_data.txt'