In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

import torchtext
from torchtext.legacy.data import Field, BucketIterator

import spacy
from tqdm.notebook import tqdm
import tqdm
import random
import math
import time
import numpy as np

import matplotlib
matplotlib.rcParams.update({'figure.figsize': (16, 12), 'font.size': 14})
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import clear_output

from nltk.tokenize import WordPunctTokenizer

In [None]:
# Single seed shared by every random number generator used in this notebook.
SEED = 666

# Seed PyTorch (CPU, then CUDA), NumPy and the standard-library generator.
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Request deterministic cuDNN kernels.
torch.backends.cudnn.deterministic = True

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cpu')

In [None]:
# Colab only: mount Google Drive at /content/drive so that the data files and
# model checkpoints stored there can be reached through the local filesystem.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# List the contents of the Drive folder in which this notebook stores its checkpoints.
!ls '/content/drive/MyDrive/Colab Notebooks/NLP'

best_model.pt  europarl-v7.de-en.de  GRU_model2.pt	 new_data.txt
de-en.tgz      europarl-v7.de-en.en  GRU_model.pt	 rus-eng.zip
de-en.zip      fra-eng.zip	     new_data_small.txt


In [None]:
tokenizer_W = WordPunctTokenizer()

def tokenize_ru(x, tokenizer=WordPunctTokenizer()):
    """Lower-case the string ``x`` and split it into tokens with ``tokenizer``."""
    lowered = x.lower()
    return tokenizer.tokenize(lowered)

def tokenize_en(x, tokenizer=WordPunctTokenizer()):
    """Lower-case the string ``x`` and split it into tokens with ``tokenizer``."""
    lowered = x.lower()
    return tokenizer.tokenize(lowered)

In [None]:
# Source-side field: lower-cased tokens wrapped in <sos> ... <eos>.
SRC = Field(
    lower=True,
    eos_token='<eos>',
    init_token='<sos>',
    tokenize=tokenize_en,
)

# Target-side field, configured the same way but with its own tokenizer.
TRG = Field(
    lower=True,
    eos_token='<eos>',
    init_token='<sos>',
    tokenize=tokenize_ru,
)

# Tab-separated file with two columns, mapped to the fields in the order listed.
dataset = torchtext.legacy.data.TabularDataset(
    fields=[('src', SRC), ('trg', TRG)],
    format='tsv',
    path='data.txt',
)

In [None]:
# Dataset size followed by the tokenised source and target of the first pair.
print(
    len(dataset.examples),
    dataset.examples[0].src,
    dataset.examples[0].trg,
    sep='\n',
)

291329
['wiederaufnahme', 'der', 'sitzungsperiode']
['resumption', 'of', 'the', 'session']


In [None]:
# NOTE: the printed counts show how the three ratios are applied: 0.8 -> train,
# 0.15 -> test and 0.05 -> validation (the list is read as [train, test, valid],
# while the returned tuple is ordered train, valid, test).
train_data, valid_data, test_data = dataset.split(split_ratio=[0.8, 0.15, 0.05])

print(
    f"Number of training examples: {len(train_data.examples)}",
    f"Number of validation examples: {len(valid_data.examples)}",
    f"Number of testing examples: {len(test_data.examples)}",
    sep="\n",
)

Number of training examples: 233063
Number of validation examples: 14567
Number of testing examples: 43699


In [None]:
# Build both vocabularies from the training split only, so that tokens that
# occur solely in the validation/test sentences do not leak into the model.
# min_freq=2: tokens seen fewer than two times are not given their own index.
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)

In [None]:
# Vocabulary sizes (special tokens included) for the source and target fields.
print(
    f"Unique tokens in source (en) vocabulary: {len(SRC.vocab)}",
    f"Unique tokens in target (ru) vocabulary: {len(TRG.vocab)}",
    sep="\n",
)

Unique tokens in source (de) vocabulary: 22712
Unique tokens in target (en) vocabulary: 13824


In [None]:
# Show the raw attribute dictionary (tokenised src/trg) of one training example.
print(train_data.examples[9].__dict__)

{'trg': ['in', 'particular', ',', 'if', 'flax', 'and', 'hemp', 'are', 'now', 'to', 'be', 'integrated', 'into', 'the', 'arable', 'support', 'system', ',', 'iacs', 'will', 'then', 'be', 'fully', 'applied', '.'], 'src': ['wenn', 'es', 'jetzt', 'vor', 'allem', 'bei', 'flachs', 'und', 'hanf', 'zu', 'einer', 'integration', 'in', 'das', 'ackerflächensystem', 'kommt', ',', 'wird', 'in', 'diesem', 'fall', 'das', 'invekos', '-', 'system', 'in', 'vollem', 'umfang', 'angewendet', '.']}


In [None]:
# Same device selection as at the top of the notebook: GPU if present, else CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cpu')

In [None]:
def _len_sort_key(x):
    """Return the number of source tokens in example ``x`` (the bucketing key)."""
    source_tokens = x.src
    return len(source_tokens)

BATCH_SIZE = 64

# One iterator per split; all three share the batch size, the target device
# and the length-based sort key.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    sort_key=_len_sort_key,
    device=device,
    batch_size=BATCH_SIZE,
)

In [None]:
class Encoder_simple(nn.Module):
    """LSTM encoder: embeds source token ids and returns the final LSTM states.

    Args:
        input_dim: size of the source vocabulary (number of embedding rows).
        emb_dim: embedding dimension.
        hid_dim: hidden size of the LSTM, per direction.
        n_layers: number of stacked LSTM layers.
        dropout_prob: dropout applied to the embeddings and between LSTM layers.
        bidirectional: when True, the forward and backward states of each layer
            are concatenated, so the returned states have size ``2 * hid_dim``.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout_prob, bidirectional=False):
        super().__init__()

        # Keep the hyper-parameters on the instance.
        self.input_dim, self.emb_dim, self.hid_dim = input_dim, emb_dim, hid_dim
        self.n_layers, self.dropout_prob = n_layers, dropout_prob
        self.bidirectional = bidirectional

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(
            emb_dim,
            hid_dim,
            num_layers=n_layers,
            dropout=dropout_prob,
            bidirectional=bidirectional,
        )
        self.dropout = nn.Dropout(p=dropout_prob)

    def _merge_directions(self, state):
        """Concatenate the two directions of every layer along the feature axis."""
        # state = [n layers * 2, batch size, hid dim]
        per_layer = state.reshape(self.n_layers, 2, -1, self.hid_dim)
        # [n layers, 2, batch, hid] -> [n layers, batch, 2, hid] -> [n layers, batch, 2 * hid]
        return per_layer.transpose(1, 2).reshape(self.n_layers, -1, 2 * self.hid_dim)

    def forward(self, src):
        """Encode ``src`` ([src len, batch size]) and return ``(hidden, cell)``."""
        # embedded = [src len, batch size, emb dim]
        embedded = self.dropout(self.embedding(src))

        # hidden, cell = [n layers * num directions, batch size, hid dim]
        _, (hidden, cell) = self.rnn(embedded)

        if not self.bidirectional:
            return hidden, cell

        return self._merge_directions(hidden), self._merge_directions(cell)

In [None]:
class Decoder_simple(nn.Module):
    """Single-step LSTM decoder: one token per sequence in, vocabulary scores out.

    Args:
        output_dim: size of the target vocabulary.
        emb_dim: embedding dimension.
        hid_dim: hidden size of the LSTM.
        n_layers: number of stacked LSTM layers.
        dropout_prob: dropout applied to the embeddings and between LSTM layers.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout_prob):
        super().__init__()

        # Keep the hyper-parameters on the instance (output_dim is read by the
        # translator to size its output buffer).
        self.emb_dim, self.hid_dim, self.output_dim = emb_dim, hid_dim, output_dim
        self.n_layers, self.dropout_prob = n_layers, dropout_prob

        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM(
            emb_dim,
            hid_dim,
            num_layers=n_layers,
            dropout=dropout_prob,
        )
        self.out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(p=dropout_prob)

    def forward(self, input, hidden, cell):
        """Advance the decoder by one time step.

        ``input`` holds one token id per sequence ([batch size]); ``hidden`` and
        ``cell`` are the LSTM states ([n layers, batch size, hid dim]).
        Returns ``(prediction, hidden, cell)`` where ``prediction`` is
        [batch size, output dim].
        """
        # Add a length-1 time axis: [batch size] -> [1, batch size]
        step_tokens = input[None]

        # embedded = [1, batch size, emb dim]
        embedded = self.dropout(self.embedding(step_tokens))

        # output = [1, batch size, hid dim]
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))

        # Drop the time axis again before projecting onto the vocabulary.
        prediction = self.out(output.squeeze(0))

        return prediction, hidden, cell

In [None]:
class Translator_simple(nn.Module):
    """Encoder-decoder wrapper that decodes a target sequence step by step."""

    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing):
        """Return per-step vocabulary scores of shape [trg len, batch size, vocab].

        ``src`` is [src len, batch size] and ``trg`` is [trg len, batch size].
        ``teacher_forcing`` is the probability, per step, of feeding the
        ground-truth token instead of the model's own prediction.
        Row 0 of the result is never written and stays zero.
        """
        trg_len, batch_size = trg.shape[0], src.shape[1]
        vocab_size = self.decoder.output_dim

        outputs = torch.zeros(trg_len, batch_size, vocab_size).to(self.device)

        hidden, cell = self.encoder(src)

        # Decoding starts from the first target row.
        step_input = trg[0, :]

        for step in range(1, trg_len):
            scores, hidden, cell = self.decoder(step_input, hidden, cell)
            outputs[step] = scores

            # Exactly one random draw per step decides the next input.
            if random.random() < teacher_forcing:
                step_input = trg[step]
            else:
                step_input = scores.argmax(-1)

        return outputs

In [None]:
# Vocabulary sizes determine the embedding tables and the output projection.
input_dim = len(SRC.vocab)
output_dim = len(TRG.vocab)
encoder_embedding_dim = 256
decoder_embedding_dim = 256
hidden_layers = 512  # hidden state size of the decoder LSTM (a width, not a layer count)
layers = 2
encoder_dropout_prob = 0.5
decoder_dropout_prob = 0.5
bidirectional = True

# The encoder's final states initialise the decoder, so their size must equal
# the decoder hidden size. A bidirectional encoder concatenates two directions,
# hence each direction gets half; a unidirectional one needs the full size.
encoder_hidden_dim = hidden_layers // 2 if bidirectional else hidden_layers

encoder = Encoder_simple(input_dim, encoder_embedding_dim, encoder_hidden_dim, layers, encoder_dropout_prob, bidirectional=bidirectional)
decoder = Decoder_simple(output_dim, decoder_embedding_dim, hidden_layers, layers, decoder_dropout_prob)

model = Translator_simple(encoder, decoder, device).to(device)

In [None]:
# Source and target vocabulary sizes, one per line.
print(input_dim, output_dim, sep="\n")

22712
13824


In [None]:
def init_weights(m):
    """Re-draw every parameter of module ``m`` from U(-0.08, 0.08), in place."""
    # This initialisation is expected to give a better result.
    for param in m.parameters():
        nn.init.uniform_(param, -0.08, 0.08)
        
# Module.apply runs init_weights on every submodule and on the model itself;
# being the last expression of the cell, the returned model is echoed below.
model.apply(init_weights)

Translator_simple(
  (encoder): Encoder_simple(
    (embedding): Embedding(22712, 256)
    (rnn): LSTM(256, 256, num_layers=2, dropout=0.5, bidirectional=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder_simple(
    (embedding): Embedding(13824, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (out): Linear(in_features=512, out_features=13824, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [None]:
def count_parameters(model):
    """Return the total number of scalar values in the trainable parameters of ``model``."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable-parameter count with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 22,752,768 trainable parameters


In [None]:
def delete_eos(tokens_iter):
    """Yield tokens from ``tokens_iter`` up to, but not including, the first '<eos>'."""
    for tok in tokens_iter:
        if tok != '<eos>':
            yield tok
        else:
            return

def remove_tech_tokens(tokens_iter, tokens_to_remove=['<sos>', '<unk>', '<pad>']):
    """Return a list of the tokens in ``tokens_iter`` that are not in ``tokens_to_remove``."""
    kept = []
    for token in tokens_iter:
        if token in tokens_to_remove:
            continue
        kept.append(token)
    return kept

def generate_translation(src, trg, model, TRG_vocab):
    """Print the reference and the model translation for the first sequence of a batch.

    Args:
        src: source token ids, [src len, batch size].
        trg: target token ids, [trg len, batch size]; its length also bounds the
            number of decoding steps.
        model: translator called as ``model(src, trg, teacher_forcing)``.
        TRG_vocab: target vocabulary exposing ``itos`` (index -> token).
    """
    model.eval()
    # Inference only: no_grad avoids building an autograd graph that is never used.
    with torch.no_grad():
        # Run without teacher forcing.
        output = model(src, trg, 0)
    # Drop the first position and pick the best-scoring token at every step.
    output = output[1:].argmax(-1)

    # Only column 0 (the first sequence of the batch) is shown.
    reference_ids = trg[:, 0].cpu().tolist()
    generated_ids = output[:, 0].cpu().tolist()
    original = remove_tech_tokens(delete_eos([TRG_vocab.itos[x] for x in reference_ids]))
    generated = remove_tech_tokens(delete_eos([TRG_vocab.itos[x] for x in generated_ids]))

    print('Правильный перевод: {}'.format(' '.join(original)))
    print('Перевод модели: {}'.format(' '.join(generated)))

def get_text(x, TRG_vocab):
    """Map token ids ``x`` to tokens, cut at the first '<eos>' and drop technical tokens."""
    words = [TRG_vocab.itos[idx] for idx in list(x)]
    up_to_eos = delete_eos(words)
    return remove_tech_tokens(up_to_eos)

from nltk.translate.bleu_score import corpus_bleu

def get_bleu(iterator):
    """Return the corpus BLEU (scaled to 0-100) of the global ``model`` over ``iterator``.

    Decoding runs without teacher forcing; references and hypotheses are turned
    into token lists with ``get_text`` and the global ``TRG`` vocabulary.
    """
    references = []
    hypotheses = []
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            src, trg = batch.src, batch.trg
            # Run without teacher forcing.
            scores = model(src, trg, 0)
            # Drop the first position and pick the best-scoring token.
            predicted = scores[1:].argmax(-1)
            # Collect the data for the BLEU computation, one row per sequence.
            # corpus_bleu expects a list of references per hypothesis, hence [ ... ].
            for reference_ids in trg.cpu().numpy().T:
                references.append([get_text(reference_ids, TRG.vocab)])
            for predicted_ids in predicted.detach().cpu().numpy().T:
                hypotheses.append(get_text(predicted_ids, TRG.vocab))
    return corpus_bleu(references, hypotheses) * 100

In [None]:
# Padding positions in the target are excluded from the loss.
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)

# Adam with its default hyper-parameters over all model parameters.
optimizer = optim.Adam(model.parameters())

In [None]:
def train(model, iterator, optimizer, criterion, clip, epoch):
    """Run one training epoch and return the mean loss per batch.

    Args:
        model: called as ``model(src, trg, teacher_forcing=...)``; returns
            scores of shape [trg len, batch size, output dim].
        iterator: sized iterable of batches exposing ``.src`` and ``.trg``.
        optimizer: optimizer over the model parameters.
        criterion: loss taking (scores [N, output dim], targets [N]).
        clip: maximum gradient norm.
        epoch: zero-based epoch index, used for the teacher-forcing schedule.
    """
    model.train()

    # The teacher-forcing ratio drops by 0.25 per epoch but never below 0.6.
    teacher_forcing = max(0.6, 1 - epoch * 0.25)

    running_loss = 0

    for batch in iterator:
        src, trg = batch.src, batch.trg

        optimizer.zero_grad()

        # scores = [trg len, batch size, output dim]
        scores = model(src, trg, teacher_forcing = teacher_forcing)
        n_classes = scores.shape[-1]

        # Skip position 0 and flatten time and batch into one axis.
        flat_scores = scores[1:].view(-1, n_classes)
        flat_targets = trg[1:].view(-1)

        loss = criterion(flat_scores, flat_targets)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += loss.item()

    return running_loss / len(iterator)

def evaluate(model, iterator, criterion):
    """Return the mean loss per batch over ``iterator`` without updating the model.

    The model is switched to eval mode and called with teacher forcing turned
    off (ratio 0); gradients are not tracked.
    """
    model.eval()

    running_loss = 0

    with torch.no_grad():
        for batch in iterator:
            targets = batch.trg

            # scores = [trg len, batch size, output dim]
            scores = model(batch.src, targets, 0)  # teacher forcing off
            n_classes = scores.shape[-1]

            # Skip position 0 and flatten time and batch into one axis.
            loss = criterion(scores[1:].view(-1, n_classes), targets[1:].view(-1))

            running_loss += loss.item()

    return running_loss / len(iterator)

def epoch_time(start_time, end_time):
    """Split ``end_time - start_time`` (seconds) into whole minutes and whole seconds."""
    elapsed = end_time - start_time
    minutes = int(elapsed / 60)
    seconds = int(elapsed - (minutes * 60))
    return minutes, seconds

In [None]:
epochs = 10
clip = 1  # maximum gradient norm passed to train()

# Two checkpoints are kept: the one with the lowest validation loss and the one
# with the highest validation BLEU.
best_valid_loss = float('inf')
best_valid_bleu = 0
for epoch in tqdm.notebook.tqdm(range(epochs)):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, clip, epoch)
    valid_loss = evaluate(model, valid_iterator, criterion)
    # Checkpoint selection must rely on the validation split only; scoring the
    # test split here would leak it into model selection.
    valid_bleu = get_bleu(valid_iterator)
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), '/content/drive/MyDrive/Colab Notebooks/NLP/simple_LSTM_en_ru.pt')
    if valid_bleu > best_valid_bleu:
        best_valid_bleu = valid_bleu
        torch.save(model.state_dict(), '/content/drive/MyDrive/Colab Notebooks/NLP/simple_LSTM_bleu_en_ru.pt')
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')
    print(f'\t Val. BLEU: {valid_bleu:.3f}')

In [None]:
# Save the weights as they are after the last epoch, independent of the
# validation metrics.
# NOTE(review): the file name starts with "simle" whereas the other checkpoints
# use "simple" — probably a typo; confirm nothing depends on it before renaming.
torch.save(model.state_dict(), '/content/drive/MyDrive/Colab Notebooks/NLP/simle_LSTM_last_en_ru.pt')

In [None]:
# Restore the checkpoint that achieved the best BLEU during training.
model_name = 'simple_LSTM_bleu_en_ru.pt'
# map_location remaps the stored tensors onto the current `device`, so a
# checkpoint written on a GPU runtime can also be loaded on a CPU-only one.
model.load_state_dict(torch.load('/content/drive/MyDrive/Colab Notebooks/NLP/' + model_name, map_location=device))

In [None]:
def translate_batch(iterator, n_examples=10):
    """Print reference and model translations for the first sequences of one batch.

    Args:
        iterator: batch iterator; only its first batch is used.
        n_examples: maximum number of sequences to show (default 10). Fewer are
            shown when the batch holds fewer sequences.
    """
    batch = next(iter(iterator))
    # batch.src is [src len, batch size]; never index past the batch.
    available = batch.src.shape[1]
    for idx in range(min(n_examples, available)):
        # idx:idx+1 keeps the batch axis, giving [len, 1] tensors.
        src = batch.src[:, idx:idx+1]
        trg = batch.trg[:, idx:idx+1]
        generate_translation(src, trg, model, TRG.vocab)

In [None]:
# Print sample translations from the first batch of the test iterator.
translate_batch(test_iterator)

In [None]:
# BLEU of the loaded model on each split, printed as soon as it is computed.
for _label, _split_iterator in (
    ("Train BLEU = ", train_iterator),
    ("Valid BLEU = ", valid_iterator),
    ("Test BLEU = ", test_iterator),
):
    print(_label, get_bleu(_split_iterator))

In [None]:
def translate(data):
    """Translate one sentence with the global ``model`` and return a display string.

    The sentence is written to 'example.txt' as a one-row, two-column CSV,
    loaded back through the ``SRC``/``TRG`` fields, decoded without teacher
    forcing and formatted as 'Перевод модели: <sentence>'.

    Args:
        data: the sentence to translate, as a string.
    """
    import csv  # local import: only this helper needs it

    # csv.writer quotes the fields when needed, so commas or quotes inside
    # `data` cannot split the row into extra columns; the explicit UTF-8
    # encoding keeps non-ASCII text intact regardless of the platform default.
    # NOTE(review): the source column is `data` repeated twice, exactly as in
    # the previous string-concatenation version — the reason is not visible
    # here; confirm it is intended. The target column only bounds the number of
    # decoding steps.
    with open('example.txt', 'w', encoding='utf-8', newline='') as file:
        csv.writer(file).writerow([data * 2, data])

    test_dataset = torchtext.legacy.data.TabularDataset(
        path='example.txt',
        format='csv',
        fields=[('src', SRC), ('trg', TRG)]
    )
    iterator = BucketIterator(
        test_dataset, 
        batch_size = 1, 
        device = device,
        sort_key=_len_sort_key
    )

    generated_tokens = []
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            # Run without teacher forcing.
            output = model(batch.src, batch.trg, 0)
            # Drop the first position and pick the best-scoring token.
            output = output[1:].argmax(-1)
            generated_tokens.extend(get_text(x, TRG.vocab) for x in output.cpu().numpy().T)

    # Post-process once, after decoding: doing this inside the loop replaced the
    # list with a string and would break on a second batch.
    if generated_tokens:
        # Drop the last two characters (presumably the tokenised " ." ending —
        # TODO confirm), re-attach a full stop and capitalise the sentence.
        sentence = (' '.join(generated_tokens[0])[:-2] + '.').capitalize()
    else:
        sentence = ''
    translation = 'Перевод модели: {}'.format(sentence)
    return translation

In [None]:
# Example: translate a single sentence with the loaded model.
data='На всей территории гостевого дома Jam работает бесплатный Wi-Fi.'
translate(data)

'Перевод модели: Features free wifi throughout the property.'