In [None]:
! pip install razdel



In [None]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from torch import optim
import pandas
import razdel
from torchtext import transforms
from torchdata import datapipes
from torchtext.vocab import build_vocab_from_iterator
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
import seaborn as sns
import tensorflow
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import sys
from google.colab import drive
import os
from torch.utils.data import DataLoader
import pickle
from tqdm import tqdm


In [None]:
# Download NLTK's 'punkt' package; presumably required by word_tokenize, which
# tokenizes the English side of the corpus below.
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [None]:
# Mount Google Drive (Colab) and switch the working directory to "My Drive",
# so the relative file names used in later cells are resolved there.
drive.mount('/content/drive')
os.chdir('/content/drive/My Drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def _line_pipe(path):
  """Return a datapipe that opens `path` as UTF-8 text and reads it line by line."""
  source = datapipes.iter.IterableWrapper([path])
  opened = datapipes.iter.FileOpener(source, mode='r', encoding='utf-8')
  return opened.readlines()


# One pipe per language file; the two are zipped line by line further down.
data_pipe_ru = _line_pipe('corpus.en_ru.1m.ru')
data_pipe_en = _line_pipe('corpus.en_ru.1m.en')


In [None]:
# Element [1] of every datapipe item is the text line; keep (russian, english) tuples.
pairs = [
    (item_ru[1], item_en[1])
    for item_ru, item_en in zip(data_pipe_ru, data_pipe_en)
]

In [None]:
# NOTE: sys.getsizeof is shallow - this is the size of the list object itself,
# not of the tuples and strings it references.
sys.getsizeof(pairs)

8448728

In [None]:
def tokenize_sentence_ru(text):
  """Split `text` with razdel and return the token strings as a list."""
  return [tok.text for tok in razdel.tokenize(text)]

def tokenize_sentence_en(text):
  """Tokenize `text` with NLTK's word_tokenize and return its result."""
  words = word_tokenize(text)
  return words

def tokenize_data_pipe(data_pipe, tokenizer):
  """Apply `tokenizer` to element [1] of every item in `data_pipe`.

  Returns a list holding one tokenizer result per item, in iteration order.
  """
  return [tokenizer(item[1]) for item in data_pipe]

In [None]:
def yield_tokens(data_iter: datapipes.iter.IterDataPipe, tokenizer):
  """Lazily yield `tokenizer(item[1])` for each item produced by `data_iter`."""
  yield from (tokenizer(item[1]) for item in data_iter)

In [None]:
# Cap the vocabulary. With min_freq=1 and no limit this corpus produced 775,305
# entries, and layers sized to a vocabulary that large are what exhausted GPU
# memory during training. max_tokens keeps only the most frequent tokens (the
# specials count towards the limit); everything else maps to '<unk>' through
# the default index set below.
vocab_ru = build_vocab_from_iterator(
    yield_tokens(data_pipe_ru, tokenizer=tokenize_sentence_ru),
    min_freq=1,
    specials=['<pad>', '<sos>', '<eos>', '<unk>'],
    special_first=True,
    max_tokens=50_000,
)
vocab_ru.set_default_index(vocab_ru['<unk>'])

In [None]:
# NOTE: sys.getsizeof is shallow - this is the size of the Vocab object itself,
# not of the vocabulary it holds.
sys.getsizeof(vocab_ru)

48

In [None]:
# Cap the vocabulary. With min_freq=1 and no limit this corpus produced 458,671
# entries, which makes the decoder's output layer (one logit per entry, per
# step, per batch row) too large for GPU memory. max_tokens keeps only the most
# frequent tokens (the specials count towards the limit); everything else maps
# to '<unk>' through the default index set below.
vocab_en = build_vocab_from_iterator(
    yield_tokens(data_pipe_en, tokenizer=tokenize_sentence_en),
    min_freq=1,
    specials=['<pad>', '<sos>', '<eos>', '<unk>'],
    special_first=True,
    max_tokens=50_000,
)
vocab_en.set_default_index(vocab_en['<unk>'])

In [None]:
# Vocabulary sizes (Russian, English), then the index assigned to '<sos>'.
for value in (len(vocab_ru), len(vocab_en), vocab_ru['<sos>']):
  print(value)

775305
458671
1


In [None]:
# Index -> token lists from get_itos(); i2w_en is what evaluate() uses to turn ids into words.
i2w_ru, i2w_en = vocab_ru.get_itos(), vocab_en.get_itos()

In [None]:
def tokens_to_num(data, vocab, size=10000, max_len=1000):
  """Encode tokenized sentences as id lists wrapped in '<sos>' ... '<eos>'.

  Sentences with more than `max_len - 2` tokens are skipped. Iteration stops as
  soon as the number of encoded sentences equals `size`; otherwise every
  sentence in `data` is visited.
  """
  encoded = []
  for sentence in data:
    fits = len(sentence) <= max_len - 2
    if fits:
      encoded.append(
          [vocab['<sos>'], *(vocab[token] for token in sentence), vocab['<eos>']])
    if len(encoded) == size:
      break
  return encoded


In [None]:
# Tokenize both sides of the corpus, Russian first, each with its own tokenizer.
tokens_ru, tokens_en = (
    tokenize_data_pipe(data_pipe_ru, tokenize_sentence_ru),
    tokenize_data_pipe(data_pipe_en, tokenize_sentence_en),
)


In [None]:
# NOTE: sys.getsizeof is shallow - this is the size of the outer list only,
# not of the per-sentence token lists it references.
sys.getsizeof(tokens_en)

8448728

In [None]:
MAX_LEN = 20
N_PAIRS = 30000  # number of sentence pairs kept for the train/test split

# Select sentence pairs jointly. Length-filtering each language on its own would
# drop different sentences on each side, so row i of the Russian array would no
# longer be the translation of row i of the English array.
max_words = MAX_LEN - 2  # leave room for '<sos>' and '<eos>'
kept_ru, kept_en = [], []
for sent_ru, sent_en in zip(tokens_ru, tokens_en):
  if len(sent_ru) <= max_words and len(sent_en) <= max_words:
    kept_ru.append(sent_ru)
    kept_en.append(sent_en)
    if len(kept_ru) == N_PAIRS:
      break

num_tokens_ru = tokens_to_num(kept_ru, vocab_ru, size=N_PAIRS, max_len=MAX_LEN)
num_tokens_en = tokens_to_num(kept_en, vocab_en, size=N_PAIRS, max_len=MAX_LEN)
assert len(num_tokens_ru) == len(num_tokens_en), "source/target rows are misaligned"

# maxlen pins both arrays to exactly MAX_LEN columns (the decoder always emits
# MAX_LEN steps); token ids are integers, so store them as int64, not float32.
num_tokens_ru = pad_sequences(
    num_tokens_ru, maxlen=MAX_LEN, padding="post", dtype='int64', value=vocab_ru['<pad>'])
num_tokens_en = pad_sequences(
    num_tokens_en, maxlen=MAX_LEN, padding="post", dtype='int64', value=vocab_en['<pad>'])

In [None]:
# Size in bytes that sys.getsizeof reports for the padded Russian id array.
sys.getsizeof(num_tokens_ru)

2400128

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    num_tokens_ru, num_tokens_en, test_size=0.2, random_state=42)

BATCH_SIZE = 64


def _to_device_long(array):
  """Convert `array` to a torch.long tensor and move it to `device`."""
  return torch.tensor(array).type(torch.long).to(device)


# The full datasets are placed on `device` up front; both loaders use shuffle=True.
train_ds = TensorDataset(_to_device_long(X_train), _to_device_long(y_train))
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)

test_ds = TensorDataset(_to_device_long(X_test), _to_device_long(y_test))
test_dl = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=True)

In [None]:

# Write both DataLoaders to disk with pickle; presumably a cache so that the
# preprocessing cells above do not have to be re-run - TODO confirm.
with open('train_dl_30k.pkl', 'wb') as file:
    pickle.dump(train_dl, file)
with open('test_dl_30k.pkl', 'wb') as file:
    pickle.dump(test_dl, file)


---

**Model definition and training.** The next cell reloads the DataLoaders that were pickled above.

In [None]:
# Read the two DataLoaders back from the .pkl files in the working directory.
# NOTE(review): pickle.load can execute arbitrary code - only load files you created yourself.
with open('train_dl_30k.pkl', 'rb') as file:
    train_dl = pickle.load(file)
with open('test_dl_30k.pkl', 'rb') as file:
    test_dl = pickle.load(file)

In [None]:
# Embedding / output layer sizes. Take them from the vocabularies when those are
# in the kernel, so the model can never be sized differently from the ids it is
# fed; the literals are the sizes recorded earlier in this notebook and serve
# as a fallback when only the pickled DataLoaders have been loaded.
VOCAB_RU_LEN = len(vocab_ru) if 'vocab_ru' in globals() else 775305
VOCAB_EN_LEN = len(vocab_en) if 'vocab_en' in globals() else 458671
# Indices of the special tokens: they follow the order of the `specials` list
# ('<pad>', '<sos>', '<eos>', ...) passed with special_first=True.
PAD, SOS, EOS = 0, 1, 2
BATCH_SIZE = 64
MAX_LEN = 20

In [None]:
class Encoder(nn.Module):
  """Embedding layer followed by a single batch-first GRU of the same width."""

  def __init__(self, inp_size, hid_size):
    """Create an `inp_size`-entry embedding of width `hid_size` and a matching GRU."""
    super().__init__()
    self.hid_size = hid_size
    self.embedding = nn.Embedding(num_embeddings=inp_size, embedding_dim=hid_size)
    self.rnn = nn.GRU(input_size=hid_size, hidden_size=hid_size, batch_first=True)

  def forward(self, x):
    """Embed the ids in `x`, run the GRU, and return its `(output, hidden)` pair."""
    embedded = self.embedding(x)
    gru_out, gru_hidden = self.rnn(embedded)
    return gru_out, gru_hidden

In [None]:
class Decoder(nn.Module):
    """GRU decoder that greedily generates MAX_LEN steps, starting from the SOS token.

    Reads the module-level constants SOS and MAX_LEN.
    """

    def __init__(self, hidden_size, output_size):
        """Build the embedding, GRU and output projection.

        hidden_size: width of the embeddings and of the GRU hidden state.
        output_size: number of entries in the embedding table and of logits per step.
        """
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, encoder_outputs, encoder_hidden):
        """Decode MAX_LEN tokens, feeding each step's most likely token into the next.

        encoder_outputs: only its batch size (dim 0) and device are used.
        encoder_hidden: initial hidden state for the GRU.

        Returns (log-probabilities of shape (batch, MAX_LEN, output_size),
        final hidden state, None).
        """
        batch_size = encoder_outputs.size(0)
        # Create the start tokens on the same device as the inputs instead of a
        # module-level `device`, so the decoder works wherever its inputs live.
        decoder_input = torch.empty(
            batch_size, 1, dtype=torch.long, device=encoder_outputs.device).fill_(SOS)
        decoder_hidden = encoder_hidden
        decoder_outputs = []

        for _ in range(MAX_LEN):
            decoder_output, decoder_hidden = self.forward_step(decoder_input, decoder_hidden)
            decoder_outputs.append(decoder_output)

            # Greedy choice; detached so no gradient flows through the fed-back ids.
            _, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze(-1).detach()

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        return decoder_outputs, decoder_hidden, None

    def forward_step(self, input, hidden):
        """Run one decoding step: embed -> ReLU -> GRU -> linear; returns (logits, hidden)."""
        output = self.embedding(input)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.out(output)
        return output, hidden

In [None]:
def train_epoch(dataloader, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion):
    """Run one optimisation pass over `dataloader` and return the mean batch loss.

    Each batch is an `(input_tensor, target_tensor)` pair. Target rows are
    expected to start with the '<sos>' id, as produced by tokens_to_num. Because
    the decoder is itself fed '<sos>' as its first input, its first output has
    to be compared with the first real token, so the leading column of the
    target is dropped before the loss is computed.

    Returns 0.0 when the dataloader yields no batches.
    """
    total_loss = 0
    n_batches = 0
    for input_tensor, target_tensor in dataloader:
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden)

        # Align predictions with targets: output step t <-> target token t + 1.
        expected = target_tensor[:, 1:]
        steps = min(decoder_outputs.size(1), expected.size(1))
        predicted = decoder_outputs[:, :steps]
        expected = expected[:, :steps]

        # reshape (not view): the slices above are not contiguous.
        loss = criterion(
            predicted.reshape(-1, predicted.size(-1)),
            expected.reshape(-1)
        )
        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

        total_loss += loss.item()
        n_batches += 1

    return total_loss / n_batches if n_batches else 0.0


In [None]:
def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001):
    """Train `encoder` and `decoder` for `n_epochs` epochs.

    Each network gets its own Adam optimizer with `learning_rate`; the loss is
    NLLLoss with positions whose target is PAD ignored. The mean loss of every
    epoch is printed as it finishes.

    Returns the list of per-epoch mean losses, so callers can plot or inspect them.
    """
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss(ignore_index=PAD)

    epoch_losses = []
    for _ in tqdm(range(1, n_epochs + 1)):
        loss = train_epoch(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        epoch_losses.append(loss)
        print(loss)

    return epoch_losses

In [None]:
# Width used for the embeddings and the GRU hidden state in both networks.
hidden_size = 64

# The encoder is sized by the Russian vocabulary, the decoder by the English one.
encoder = Encoder(VOCAB_RU_LEN, hidden_size).to(device)
decoder = Decoder(hidden_size, VOCAB_EN_LEN).to(device)

# Train for 30 epochs with the default learning rate.
train(train_dl, encoder, decoder, 30)


  0%|          | 0/30 [00:00<?, ?it/s]


OutOfMemoryError: ignored

In [None]:
def sentence_to_tensor(sentence, vocab, max_len):
  """Encode one tokenized sentence as a (1, max_len) torch.long tensor on `device`.

  The layout matches what tokens_to_num produces for training data:
  '<sos>', the token ids, '<eos>', then '<pad>' up to `max_len`.

  sentence: iterable of token strings. A raw str is iterated character by
      character, so tokenize text before calling this.
  vocab: mapping from token to id that contains '<pad>', '<sos>' and '<eos>'.
  max_len: total length of the row; should be at least 2. Sentences that are
      too long are cut at the end so that '<sos>' and '<eos>' are both kept.
      With None the row is neither truncated nor padded.
  """
  words = list(sentence)
  if max_len is not None:
    words = words[:max(max_len - 2, 0)]

  ids = [vocab['<sos>']] + [vocab[word] for word in words] + [vocab['<eos>']]
  if max_len is not None:
    ids += [vocab['<pad>']] * (max_len - len(ids))

  return torch.tensor([ids], dtype=torch.long, device=device)


In [None]:
def evaluate(encoder, decoder, sentence):
    """Translate one Russian sentence with greedy decoding.

    sentence: raw text (str) or an already tokenized list of words. Raw text is
        tokenized with tokenize_sentence_ru first; passing the string straight
        on would have it iterated character by character.

    Returns (decoded_words, decoder_attn): the English tokens up to and
    including the first '<eos>', and the decoder's third return value.
    """
    if isinstance(sentence, str):
        sentence = tokenize_sentence_ru(sentence)

    eos_id = vocab_en['<eos>']
    with torch.no_grad():
        input_tensor = sentence_to_tensor(sentence, vocab_ru, max_len=MAX_LEN)

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, _, decoder_attn = decoder(encoder_outputs, encoder_hidden)

        # Most likely token id at every step of the single sentence in the batch.
        _, topi = decoder_outputs.topk(1)
        decoded_ids = topi.squeeze()

        decoded_words = []
        for idx in decoded_ids:
            token_id = idx.item()
            if token_id == eos_id:
                decoded_words.append('<eos>')
                break
            decoded_words.append(i2w_en[token_id])
    return decoded_words, decoder_attn


In [None]:
# Translate the Russian side of a few corpus pairs as a quick sanity check.
for pair_index in (3, 101, 200, 300):
  print(evaluate(encoder, decoder, pairs[pair_index][0]))


(['<sos>', '<sos>', '.', '<eos>'], None)
(['<sos>', '<sos>', '.', '<eos>'], None)
(['<sos>', '<sos>', '.', '<eos>'], None)
(['<sos>', '<sos>', '.', '<eos>'], None)
