In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

import torchtext
from torchtext.legacy.data import Field, BucketIterator

import spacy
from tqdm.notebook import tqdm
import tqdm
import random
import math
import time
import numpy as np

import matplotlib
matplotlib.rcParams.update({'figure.figsize': (16, 12), 'font.size': 14})
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import clear_output

from nltk.tokenize import WordPunctTokenizer

In [2]:
SEED = 666

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [4]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [5]:
!ls '/content/drive/MyDrive/Colab Notebooks/NLP'

Translation_Project_simple_LSTM.ipynb


In [6]:
!wget https://drive.google.com/uc?id=1NWYqJgeG_4883LINdEjKUr6nLQPY6Yb_ -O data.txt

--2021-07-15 12:28:40--  https://drive.google.com/uc?id=1NWYqJgeG_4883LINdEjKUr6nLQPY6Yb_
Resolving drive.google.com (drive.google.com)... 172.217.164.142, 2607:f8b0:4004:836::200e
Connecting to drive.google.com (drive.google.com)|172.217.164.142|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://doc-14-00-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/6on1odaa1tmu6l506o2sfaavjgafmb1v/1626352050000/16549096980415837553/*/1NWYqJgeG_4883LINdEjKUr6nLQPY6Yb_ [following]
--2021-07-15 12:28:40--  https://doc-14-00-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/6on1odaa1tmu6l506o2sfaavjgafmb1v/1626352050000/16549096980415837553/*/1NWYqJgeG_4883LINdEjKUr6nLQPY6Yb_
Resolving doc-14-00-docs.googleusercontent.com (doc-14-00-docs.googleusercontent.com)... 142.251.33.193, 2607:f8b0:4004:837::2001
Connecting to doc-14-00-docs.googleusercontent.com (doc-14-00-docs.googleusercontent.com)|142.251.33.19

In [7]:
tokenizer_W = WordPunctTokenizer()

def tokenize_ru(x, tokenizer=WordPunctTokenizer()):
    return tokenizer.tokenize(x.lower())

def tokenize_en(x, tokenizer=WordPunctTokenizer()):
    return tokenizer.tokenize(x.lower())

In [8]:
SRC = Field(tokenize=tokenize_ru,
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)

TRG = Field(tokenize=tokenize_en,
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)


dataset = torchtext.legacy.data.TabularDataset(
    path='data.txt',
    format='tsv',
    fields=[('trg', TRG), ('src', SRC)]
)

In [9]:
print(len(dataset.examples))
print(dataset.examples[0].src)
print(dataset.examples[0].trg)

50000
['отель', 'cordelia', 'расположен', 'в', 'тбилиси', ',', 'в', '3', 'минутах', 'ходьбы', 'от', 'свято', '-', 'троицкого', 'собора', '.']
['cordelia', 'hotel', 'is', 'situated', 'in', 'tbilisi', ',', 'a', '3', '-', 'minute', 'walk', 'away', 'from', 'saint', 'trinity', 'church', '.']


In [10]:
train_data, valid_data, test_data = dataset.split(split_ratio=[0.8, 0.15, 0.05])

print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

Number of training examples: 40000
Number of validation examples: 2500
Number of testing examples: 7500


In [11]:
SRC.build_vocab(dataset, min_freq=2)
TRG.build_vocab(dataset, min_freq=2)

In [12]:
print(f"Unique tokens in source (ru) vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}")

Unique tokens in source (ru) vocabulary: 16483
Unique tokens in target (en) vocabulary: 11778


In [13]:
print(vars(train_data.examples[9]))

{'trg': ['there', 'is', 'a', 'concierge', 'service', 'and', '24', '-', 'hour', 'front', 'desk', '.'], 'src': ['гостям', 'предоставляются', 'услуги', 'консьержа', 'и', 'круглосуточной', 'стойки', 'регистрации', '.']}


In [14]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [15]:
def _len_sort_key(x):
    return len(x.src)

BATCH_SIZE = 64

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, 
    device = device,
    sort_key=_len_sort_key
)

In [16]:
class Encoder_simple(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout_prob, bidirectional=False):
        super().__init__()

        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.dropout_prob = dropout_prob
        self.bidirectional = bidirectional

        self.embedding = nn.Embedding(input_dim, emb_dim)

        self.rnn = nn.LSTM(emb_dim, hid_dim, num_layers=n_layers, 
                           dropout=dropout_prob, bidirectional=bidirectional)

        self.dropout = nn.Dropout(p=dropout_prob)
        
        
    def forward(self, src):

        #src = [src len, batch size]
        embedded = self.dropout(self.embedding(src))
        #embedded = [src len, batch size, emb dim]
        
        output, (hidden, cell) = self.rnn(embedded)
        #output = [src len, batch size, hid dim * num directions]
        #hidden = [n layers * num directions, batch size, hid dim]
        #cell = [n layers, batch size, hid dim]

        if self.bidirectional:

            hidden = hidden.reshape(self.n_layers, 2, -1, self.hid_dim)
            hidden = hidden.transpose(1, 2).reshape(self.n_layers, -1, 2 * self.hid_dim)

            cell = cell.reshape(self.n_layers, 2, -1, self.hid_dim)
            cell = cell.transpose(1, 2).reshape(self.n_layers, -1, 2 * self.hid_dim)

        return hidden, cell

In [17]:
class Decoder_simple(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout_prob):
        super().__init__()

        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.output_dim = output_dim
        self.n_layers = n_layers
        self.dropout_prob = dropout_prob

        self.embedding = nn.Embedding(output_dim, emb_dim)

        self.rnn = nn.LSTM(emb_dim, hid_dim, num_layers=n_layers, 
                           dropout=dropout_prob)
        
        self.out = nn.Linear(hid_dim, output_dim)

        self.dropout = nn.Dropout(p=dropout_prob)

    def forward(self, input, hidden, cell):

        #input = [batch size]
        #hidden = [batch size, dec hid dim]
        #encoder_outputs = [src len, batch size, enc hid dim * 2]

        unsqueeze_input = input.unsqueeze(0)
        #input = [1, batch size]
        
        embedded = self.dropout(self.embedding(unsqueeze_input))
        #embedded = [1, batch size, emb dim]

        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        #output = [1, batch size, hid dim * num directions]
        #hidden = [n layers * num directions, batch size, hid dim]
        #cell = [n layers, batch size, hid dim]

        prediction = self.out(output.squeeze(0))     
         
        return prediction, hidden, cell

In [18]:
class Translator_simple(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing):

        #src = [src len, batch size]
        #trg = [trg len, batch size]
        
        batch_size = src.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)

        hidden, cell = self.encoder(src)

        input = trg[0,:]

        for token in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[token] = output
            teacher_force = random.random() < teacher_forcing
            top = output.argmax(-1) 
            input = trg[token] if teacher_force else top
        return outputs

In [19]:
input_dim = len(SRC.vocab)
output_dim = len(TRG.vocab)
encoder_embedding_dim = 256
decoder_embedding_dim = 256
hidden_layers = 512
layers = 2
encoder_dropout_prob = 0.5
decoder_dropout_prob = 0.5
bidirectional = True

encoder = Encoder_simple(input_dim, encoder_embedding_dim, hidden_layers//2, layers, encoder_dropout_prob, bidirectional=bidirectional)
decoder = Decoder_simple(output_dim, decoder_embedding_dim, hidden_layers, layers, decoder_dropout_prob)

model = Translator_simple(encoder, decoder, device).to(device)

In [20]:
print(input_dim)
print(output_dim)

16483
11778


In [21]:
def init_weights(m):
    # такая инициализация должна давать лучший результат 
    for name, param in m.named_parameters():
        nn.init.uniform_(param, -0.08, 0.08)
        
model.apply(init_weights)

Translator_simple(
  (encoder): Encoder_simple(
    (embedding): Embedding(16483, 256)
    (rnn): LSTM(256, 256, num_layers=2, dropout=0.5, bidirectional=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder_simple(
    (embedding): Embedding(11778, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (out): Linear(in_features=512, out_features=11778, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [22]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 19,584,770 trainable parameters


In [23]:
def delete_eos(tokens_iter):
    for token in tokens_iter:
        if token == '<eos>':
            break
        yield token

def remove_tech_tokens(tokens_iter, tokens_to_remove=['<sos>', '<unk>', '<pad>']):
    return [x for x in tokens_iter if x not in tokens_to_remove]

def generate_translation(src, trg, model, TRG_vocab):
    model.eval()
    # запускаем без teacher_forcing
    output = model(src, trg, 0)
    # удаляем первый токен и выбираем лучшее слово
    output = output[1:].argmax(-1)
    #print(output)
    original = remove_tech_tokens(delete_eos([TRG_vocab.itos[x] for x in list(trg[:,0].cpu().numpy())]))
    generated = remove_tech_tokens(delete_eos([TRG_vocab.itos[x] for x in list(output[:, 0].cpu().numpy())]))
    
    print('Правильный перевод: {}'.format(' '.join(original)))
    print('Перевод модели: {}'.format(' '.join(generated)))

def get_text(x, TRG_vocab):
     generated = remove_tech_tokens(delete_eos([TRG_vocab.itos[elem] for elem in list(x)]))
     return generated

from nltk.translate.bleu_score import corpus_bleu

def get_bleu(iterator):
    original_text = []
    generated_text = []
    model.eval()
    with torch.no_grad():
        for i, batch in enumerate(iterator):
            src = batch.src
            trg = batch.trg
            # запускаем без teacher_forcing
            output = model(src, trg, 0)
            # удаляем первый токен и выбираем лучшее слово
            output = output[1:].argmax(-1)
            # собираем данные для подсчета BLEU
            original_text.extend([get_text(x, TRG.vocab) for x in trg.cpu().numpy().T])
            generated_text.extend([get_text(x, TRG.vocab) for x in output.detach().cpu().numpy().T])
    bleu = corpus_bleu([[text] for text in original_text], generated_text) * 100
    return bleu

In [24]:
optimizer = optim.Adam(model.parameters())
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [25]:
def train(model, iterator, optimizer, criterion, clip, epoch):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        src = batch.src
        trg = batch.trg
        
        optimizer.zero_grad()
        teacher_forcing = 1 - epoch * 0.25
        if teacher_forcing < 0.6:
            teacher_forcing = 0.6
        output = model(src, trg, teacher_forcing = teacher_forcing)
        
        #trg = [trg len, batch size]
        #output = [trg len, batch size, output dim]
        
        output_dim = output.shape[-1]
        
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        
        #trg = [(trg len - 1) * batch size]
        #output = [(trg len - 1) * batch size, output dim]
        
        loss = criterion(output, trg)
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src = batch.src
            trg = batch.trg

            output = model(src, trg, 0) #turn off teacher forcing

            #trg = [trg len, batch size]
            #output = [trg len, batch size, output dim]

            output_dim = output.shape[-1]
            
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)

            #trg = [(trg len - 1) * batch size]
            #output = [(trg len - 1) * batch size, output dim]

            loss = criterion(output, trg)

            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [26]:
epochs = 10
clip = 1

best_valid_loss = float('inf')
best_valid_bleu = 0
for epoch in tqdm.notebook.tqdm(range(epochs)):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, clip, epoch)
    valid_loss = evaluate(model, valid_iterator, criterion)
    valid_bleu = get_bleu(test_iterator)
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), '/content/drive/MyDrive/Colab Notebooks/NLP/simple_LSTM.pt')
    if valid_bleu > best_valid_bleu:
        best_valid_bleu= valid_bleu
        torch.save(model.state_dict(), '/content/drive/MyDrive/Colab Notebooks/NLP/simple_LSTM_bleu.pt')
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')
    print(f'\t Val. BLEU: {valid_bleu:.3f}')

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

Epoch: 01 | Time: 2m 40s
	Train Loss: 4.304 | Train PPL:  74.022
	 Val. Loss: 7.695 |  Val. PPL: 2196.461
	 Val. BLEU: 6.495
Epoch: 02 | Time: 2m 41s
	Train Loss: 3.511 | Train PPL:  33.479
	 Val. Loss: 5.728 |  Val. PPL: 307.439
	 Val. BLEU: 10.616
Epoch: 03 | Time: 2m 40s
	Train Loss: 3.412 | Train PPL:  30.324
	 Val. Loss: 5.152 |  Val. PPL: 172.744
	 Val. BLEU: 13.285
Epoch: 04 | Time: 2m 41s
	Train Loss: 3.154 | Train PPL:  23.428
	 Val. Loss: 5.193 |  Val. PPL: 180.047
	 Val. BLEU: 15.778
Epoch: 05 | Time: 2m 41s
	Train Loss: 3.006 | Train PPL:  20.206
	 Val. Loss: 5.023 |  Val. PPL: 151.881
	 Val. BLEU: 16.915
Epoch: 06 | Time: 2m 41s
	Train Loss: 2.882 | Train PPL:  17.854
	 Val. Loss: 4.918 |  Val. PPL: 136.683
	 Val. BLEU: 17.643
Epoch: 07 | Time: 2m 41s
	Train Loss: 2.752 | Train PPL:  15.668
	 Val. Loss: 4.912 |  Val. PPL: 135.844
	 Val. BLEU: 18.877
Epoch: 08 | Time: 2m 41s
	Train Loss: 2.645 | Train PPL:  14.089
	 Val. Loss: 4.916 |  Val. PPL: 136.445
	 Val. BLEU: 19.489


In [27]:
torch.save(model.state_dict(), '/content/drive/MyDrive/Colab Notebooks/NLP/simle_LSTM_last.pt')

In [28]:
model_name = 'simple_LSTM_bleu.pt'
model.load_state_dict(torch.load('/content/drive/MyDrive/Colab Notebooks/NLP/' + model_name))

<All keys matched successfully>

In [29]:
def translate_batch(iterator):
    batch = next(iter(iterator))
    for idx in range(10):
        src = batch.src[:, idx:idx+1]
        trg = batch.trg[:, idx:idx+1]
        generate_translation(src, trg, model, TRG.vocab)

In [30]:
translate_batch(test_iterator)

Правильный перевод: laundry facilities are on site .
Перевод модели: laundry facilities are available .
Правильный перевод: guests can enjoy the on - site restaurant .
Перевод модели: guests can enjoy the on - site restaurant .
Правильный перевод: there is equipped fully equipped kitchenette with fridge and the bathroom comes with a shower .
Перевод модели: the bathroom comes with a shower .
Правильный перевод: free wifi access is available .
Перевод модели: free wi - fi access is available .
Правильный перевод: there is a sandbox in the yard .
Перевод модели: other facilities are available .
Правильный перевод: free private parking is available on site .
Перевод модели: free private parking is available on site .
Правильный перевод: some rooms have a balcony .
Перевод модели: some rooms have a balcony .
Правильный перевод: private parking is available on site free of charge .
Перевод модели: free private parking is available on site .
Правильный перевод: free parking is available on s

In [31]:
print("Train BLEU = ", get_bleu(train_iterator))
print("Valid BLEU = ", get_bleu(valid_iterator))
print("Test BLEU = ", get_bleu(test_iterator))

Train BLEU =  23.059531598687418
Valid BLEU =  20.397783249434582
Test BLEU =  20.812546458722938


In [32]:
def translate(data):
    with open('example.txt', 'w') as file:
        file.write(str(data*2+','+data))
    test_dataset = torchtext.legacy.data.TabularDataset(
        path='example.txt',
        format='csv',
        fields=[('trg', TRG), ('src', SRC)]
    )
    iterator = BucketIterator(
        test_dataset, 
        batch_size = 1, 
        device = device,
        sort_key=_len_sort_key
    )
    generated_text = []
    model.eval()
    with torch.no_grad():
        for i, batch in enumerate(iterator):
            src = batch.src
            trg = batch.trg
            # запускаем без teacher_forcing
            output = model(src, trg, 0)
            # удаляем первый токен и выбираем лучшее слово
            output = output[1:].argmax(-1)
            # собираем данные для подсчета BLEU
            generated_text.extend([get_text(x, TRG.vocab) for x in output.detach().cpu().numpy().T])
            generated_text=(' '.join(generated_text[0])[:-2]+'.').capitalize()
    translation = 'Перевод модели: {}'.format(generated_text)
    return translation

In [33]:
data='На всей территории гостевого дома Jam работает бесплатный Wi-Fi.'
translate(data)

'Перевод модели: Features free wifi throughout the property.'