<a href="https://colab.research.google.com/github/WhiteAndBlackFox/nlp/blob/seq2seq_att/seq2seq_with_attation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Машинный перевод. Модель seq2seq и механизм внимания

## Доставляем и импортируем библиотеки

In [1]:
import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import pandas as pd

import unicodedata
import re
import numpy as np
import os
import io
import time

## Дополнительные функции

In [2]:
def preprocess_sentence(w):
    w = w.lower().strip()
    
    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ."
    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    w = re.sub(r"([?.!,])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)
    
    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    w = re.sub(r"[^a-zA-Zа-яА-Я?.!,']+", " ", w)
    w = w.strip()
    
    # adding a start and an end token to the sentence
    # so that the model know when to start and stop predicting.
    w = '<start> ' + w + ' <end>'
    
    return w

# 1. Remove the accents
# 2. Clean the sentences
# 3. Return word pairs in the format: [ENG, RUS]
def create_dataset(path, num_examples):
    lines = io.open(path, encoding='UTF-8').read().strip().split('\n')
    word_pairs = [[preprocess_sentence(w) for w in l.split('\t')[:2]]  for l in lines[:num_examples]]
    
    return zip(*word_pairs)

def tokenize(lang):
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(
        filters='')
    lang_tokenizer.fit_on_texts(lang)
    
    tensor = lang_tokenizer.texts_to_sequences(lang)
    
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor,
                                                           padding='post')
    
    return tensor, lang_tokenizer

def load_dataset(path, num_examples=None):
    # creating cleaned input, output pairs
    targ_lang, inp_lang = create_dataset(path, num_examples)
    
    input_tensor, inp_lang_tokenizer = tokenize(inp_lang)
    target_tensor, targ_lang_tokenizer = tokenize(targ_lang)
    
    return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer


## Скачивание и подготовка датасета

In [3]:
!wget http://www.manythings.org/anki/rus-eng.zip
!mkdir rus-eng
!unzip rus-eng.zip -d rus-eng
path_to_file = 'rus-eng/rus.txt'

--2022-07-10 13:07:37--  http://www.manythings.org/anki/rus-eng.zip
Resolving www.manythings.org (www.manythings.org)... 173.254.30.110
Connecting to www.manythings.org (www.manythings.org)|173.254.30.110|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14819554 (14M) [application/zip]
Saving to: ‘rus-eng.zip.3’


2022-07-10 13:07:38 (20.7 MB/s) - ‘rus-eng.zip.3’ saved [14819554/14819554]

mkdir: cannot create directory ‘rus-eng’: File exists
Archive:  rus-eng.zip
replace rus-eng/rus.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: rus-eng/rus.txt         
replace rus-eng/_about.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: rus-eng/_about.txt      


In [4]:
en, ru = create_dataset(path_to_file, None)
print(en[0])
print(ru[0])

<start> go . <end>
<start> марш ! <end>


In [5]:
# Try experimenting with the size of that dataset
num_examples = 100000
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(path_to_file, num_examples)

# Calculate max_length of the target tensors
max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]

In [6]:
# Creating training and validation sets using an 80-20 split
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor,
                                                                                                target_tensor, test_size=0.2)

# Show length
print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

80000 80000 20000 20000


In [7]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedding_dim = 300
units = 1024
vocab_inp_size = len(inp_lang.word_index) + 1
vocab_tar_size = len(targ_lang.word_index) + 1

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [8]:
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape


(TensorShape([64, 15]), TensorShape([64, 11]))

## Модели


### Энкодер

In [60]:
class Encoder(tf.keras.Model):
    def __init__(self, type_model, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.type_model = type_model

        if self.type_model == 'nonattation':
          return_sequences = False
        else: 
          return_sequences = True
        
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                       return_sequences=return_sequences,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
    
    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = hidden)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))

Декодеры

In [63]:
class Decoder(tf.keras.Model):
  """
    Декодер без внимания
  """
  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.dec_units,
                                    return_sequences=True,
                                    return_state=True,
                                    recurrent_initializer='glorot_uniform')
    self.fc = tf.keras.layers.Dense(vocab_size)

  def call(self, x, hidden, enc_output):
      # enc_output shape == (batch_size, max_length, hidden_size)
      
      # x shape after passing through embedding == (batch_size, 1, embedding_dim)
      x = self.embedding(x)
      
      # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
      
      # passing the concatenated vector to the GRU
      output, state = self.gru(x, initial_state=hidden)

      # output shape == (batch_size * 1, hidden_size)
      output = tf.reshape(output, (-1, output.shape[2]))

      # output shape == (batch_size, vocab)
      x = self.fc(output)
      
      return x, state, None

In [11]:
class BahdanauAttention(tf.keras.layers.Layer):
    """
      Декодер с вниманием на основе алгоритма Bahdanau
    """
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)
        
    def call(self, query, values):
        # query hidden state shape == (batch_size, hidden size)
        # query_with_time_axis shape == (batch_size, 1, hidden size)
        # values shape == (batch_size, max_len, hidden size)
        # we are doing this to broadcast addition along the time axis to calculate the score
        query_with_time_axis = tf.expand_dims(query, 1)
        
        # score shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying score to self.V
        # the shape of the tensor before applying self.V is (batch_size, max_length, units)
        score = self.V(tf.nn.tanh(
            self.W1(query_with_time_axis) + self.W2(values)))
        
        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)
        
        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)
        
        return context_vector, attention_weights

In [62]:
class DecoderAttention(tf.keras.Model):
    """
      стандартный декодер с вниманием
    """
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(DecoderAttention, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)
        
        # used for attention
        self.attention = BahdanauAttention(self.dec_units)
        
    def call(self, x, hidden, enc_output):
        # enc_output shape == (batch_size, max_length, hidden_size)
        context_vector, attention_weights = self.attention(hidden, enc_output)
        
        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)
        
        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        
        # passing the concatenated vector to the GRU
        output, state = self.gru(x)
        
        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))
        
        # output shape == (batch_size, vocab)
        x = self.fc(output)
        
        return x, state, attention_weights

### Объеденим в модель декодеры и энкодеры

In [72]:
class Translator():
  def __init__(self, type_model, **kwargs):
    """
      Создаем класс для перевода
    """    

    # У нас всего 2 модели! 
    assert(type_model not in ["nonattation", "attention"], "Неверно указан тип модели!")

    # Все что пришло для модели определяем в self, чтобы видно было в классе
    self.__dict__.update(kwargs)
    
    # Опрделяем основные параметры для классов
    self.type_model = type_model
    self.checkpoint_prefix = os.path.join(self.checkpoint_dir, "ckpt")
    self.optimizer = tf.keras.optimizers.Adam()
    self.loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

    self.encoder = Encoder(self.type_model, self.vocab_inp_size, self.embedding_dim, self.units, self.batch_size)
    
    if self.type_model == 'nonattation':  
      self.decoder = Decoder(self.vocab_tar_size, self.embedding_dim, self.units, self.batch_size)
    else:
      self.decoder = DecoderAttention(self.vocab_tar_size, self.embedding_dim, self.units, self.batch_size)
    
    self.checkpoint = tf.train.Checkpoint(optimizer=self.optimizer,
                                              encoder=self.encoder,
                                              decoder=self.decoder)
  
  def loss_function(self, real, pred):
    """
      Функция потери
    """
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = self.loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

  def train_step(self, inp, targ, enc_hidden):
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_hidden = self.encoder(inp, enc_hidden)
    
        dec_hidden = enc_hidden
    
        dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * self.batch_size, 1)
    
        # Teacher forcing - feeding the target as the next input
        for t in range(1, targ.shape[1]):
            # passing enc_output to the decoder
            predictions, dec_hidden, _ = self.decoder(dec_input, dec_hidden, enc_output)
        
            loss += self.loss_function(targ[:, t], predictions)

            # using teacher forcing
            dec_input = tf.expand_dims(targ[:, t], 1)

    batch_loss = (loss / int(targ.shape[1]))

    variables = self.encoder.trainable_variables + self.decoder.trainable_variables

    gradients = tape.gradient(loss, variables)

    self.optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

  def train_model(self, epochs):
    for epoch in range(epochs):
        start = time.time()

        enc_hidden = self.encoder.initialize_hidden_state()
        total_loss = 0

        for (batch, (inp, targ)) in enumerate(self.dataset.take(self.steps_per_epoch)):
            batch_loss = self.train_step(inp, targ, enc_hidden)
            total_loss += batch_loss
    
            if batch % 100 == 0:
                print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                              batch,
                                                              batch_loss.numpy()))
        
        # saving (checkpoint) the model every 2 epochs
        if (epoch + 1) % 2 == 0:
            self.checkpoint.save(file_prefix = self.checkpoint_prefix)
    
        print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                        total_loss / self.steps_per_epoch))
        print('Time taken for {} epoch {} sec\n'.format(epoch + 1, time.time() - start))

  def evaluate(self, sentence):
    attention_plot = np.zeros((max_length_targ, max_length_inp))

    sentence = preprocess_sentence(sentence)

    inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                           maxlen=max_length_inp,
                                                           padding='post')
    inputs = tf.convert_to_tensor(inputs)

    result = ''

    hidden = [tf.zeros((1, self.units))]
    enc_output, enc_hidden = self.encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = self.decoder(dec_input, dec_hidden, enc_output)
        
        if self.type_model == 'attention':
          # storing the attention weights to plot later on
          attention_weights = tf.reshape(attention_weights, (-1, ))
          attention_plot[t] = attention_weights.numpy()

        predicted_id = tf.argmax(predictions[0]).numpy()

        result += targ_lang.index_word[predicted_id] + ' '

        if targ_lang.index_word[predicted_id] == '<end>':
          return result, sentence, attention_plot

        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)


    return result, sentence, attention_plot
  
  # function for plotting the attention weights
  def plot_attention(self, attention, sentence, predicted_sentence):
      fig = plt.figure(figsize=(10,10))
      ax = fig.add_subplot(1, 1, 1)
      ax.matshow(attention, cmap='viridis')
  
      fontdict = {'fontsize': 14}
  
      ax.set_xticklabels([''] + sentence, fontdict=fontdict, rotation=90)
      ax.set_yticklabels([''] + predicted_sentence, fontdict=fontdict)
  
      ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
      ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
  
      plt.show()
  
  def resore_last_checkpoint(self):
    self.checkpoint.restore(tf.train.latest_checkpoint(self.checkpoint_dir))
  
  def translate(self, sentence, with_plot=False):

    result, sentence, attention_plot = self.evaluate(sentence)
    
    print('Input: %s' % (sentence))
    print('Predicted translation: {}'.format(result))

    if with_plot and self.type_model == 'attention':
      attention_plot = attention_plot[:len(result.split(' ')), :len(sentence.split(' '))]
      self.plot_attention(attention_plot, sentence.split(' '), result.split(' '))


  assert(type_model not in ["nonattation", "attention"], "Неверно указан тип модели!")


In [74]:
transtalors = {}
for type_model in ["nonattation", "attention"]:
  checkpoint_dir = f'./training_nmt_{type_model}_checkpoints'
  
  translator = Translator(type_model=type_model,
                        dataset=dataset,
                        vocab_inp_size=vocab_inp_size, 
                        vocab_tar_size=vocab_tar_size, 
                        embedding_dim=embedding_dim, 
                        units=units, 
                        batch_size=BATCH_SIZE, 
                        steps_per_epoch=steps_per_epoch,
                        checkpoint_dir=checkpoint_dir)
  
  translator.train_model(3)
  
  transtalors[type_model] = translator

  translator.translate('Сегодня хороший день.')

Epoch 1 Batch 0 Loss 4.7152
Epoch 1 Batch 100 Loss 2.0506
Epoch 1 Batch 200 Loss 1.9572
Epoch 1 Batch 300 Loss 1.6218
Epoch 1 Batch 400 Loss 1.4508
Epoch 1 Batch 500 Loss 1.5353
Epoch 1 Batch 600 Loss 1.2884
Epoch 1 Batch 700 Loss 1.2378
Epoch 1 Batch 800 Loss 1.2887
Epoch 1 Batch 900 Loss 1.2584
Epoch 1 Batch 1000 Loss 1.1949
Epoch 1 Batch 1100 Loss 1.1785
Epoch 1 Batch 1200 Loss 1.1477
Epoch 1 Loss 1.4832
Time taken for 1 epoch 168.88233852386475 sec

Epoch 2 Batch 0 Loss 0.9866
Epoch 2 Batch 100 Loss 0.9460
Epoch 2 Batch 200 Loss 0.9499
Epoch 2 Batch 300 Loss 0.8868
Epoch 2 Batch 400 Loss 0.9269
Epoch 2 Batch 500 Loss 0.8351
Epoch 2 Batch 600 Loss 0.7819
Epoch 2 Batch 700 Loss 0.7550
Epoch 2 Batch 800 Loss 0.8419
Epoch 2 Batch 900 Loss 0.6820
Epoch 2 Batch 1000 Loss 0.7712
Epoch 2 Batch 1100 Loss 0.7107
Epoch 2 Batch 1200 Loss 0.7345
Epoch 2 Loss 0.8024
Time taken for 1 epoch 155.77618312835693 sec

Epoch 3 Batch 0 Loss 0.4743
Epoch 3 Batch 100 Loss 0.5170
Epoch 3 Batch 200 Loss 0.4

## Оценка качества перевода моделей

In [75]:
df = pd.DataFrame([len(str_val[str_val != 0]) for str_val in input_tensor_val], columns=['num_tokens'])
print(df['num_tokens'].unique())
df['index'] = df.index
df.head()

[ 5  7  6  8  9  4 10 11 14 12 13]


Unnamed: 0,num_tokens,index
0,5,0
1,7,1
2,7,2
3,7,3
4,7,4


In [77]:
for i in range(4, 14):
    indexes = df.loc[df['num_tokens']==i].sample(
        2, replace=True)['index'].values if len(df.loc[df['num_tokens']==i]) > 0 else []
    print(f'Tokens count: {i-2}')
    print(f'-----------------------------------------------------')
    for index in indexes:
        text_in = " ".join([inp_lang.index_word[t] if t > 2
                            else '' for t in input_tensor_val[index][input_tensor_val[index] != 0]])
        text_targ = " ".join([targ_lang.index_word[t] if t > 2
                              else '' for t in target_tensor_val[index][target_tensor_val[index] != 0]])
        print(f'Input: {text_in}')
        print(f'Target: {text_targ}')
        print(f'---------------------------------')
        transtalors["nonattation"].translate(text_in)
        transtalors["attention"].translate(text_in)
        print()
    print(f'-----------------------------------------------------')
    print()

Tokens count: 2
-----------------------------------------------------
Input:  отдай . 
Target:  hand it over . 
---------------------------------
Input: <start> отдай . <end>
Predicted translation: give me a kiss . <end> 
Input: <start> отдай . <end>
Predicted translation: give me . <end> 

Input:  прохладно . 
Target:  it is a little cold . 
---------------------------------
Input: <start> прохладно . <end>
Predicted translation: it's getting cloudy . <end> 
Input: <start> прохладно . <end>
Predicted translation: it's a morning . <end> 

-----------------------------------------------------

Tokens count: 3
-----------------------------------------------------
Input:  надень шлем . 
Target:  put your helmet on . 
---------------------------------
Input: <start> надень шлем . <end>
Predicted translation: put your pajamas on . <end> 
Input: <start> надень шлем . <end>
Predicted translation: put your helmet on . <end> 

Input:  мы дипломатичны . 
Target:  we're diplomatic . 
------------

## Вывод

Модель без внимания немного уступает модели с вниманием, но не скащал бы что сильно.