In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F

from gensim.models import word2vec
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from sklearn import model_selection

import re
import os
import sys
import pickle
import collections
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from IPython import display
%matplotlib inline

Using TensorFlow backend.


In [2]:
use_cuda = torch.cuda.is_available()

## Prepare data

In [4]:
# http://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html#loading-data-files
lines = open('data/tatoeba/eng-fra.txt').read().strip().split('\n')
pairs = [l.split('\t') for l in lines]
pairs = [[text_to_word_sequence(re.sub(r"\u202f|\u2009", r"", s)) for s in p] for p in pairs]

In [5]:
X = [p[0] for p in pairs]
y = [p[1] for p in pairs]

In [6]:
# For ease of training, only keep sentences shorter than 20 words
X_len = [len(sentence) for sentence in X]
y_len = [len(sentence) for sentence in y]

min_len = 2
max_len = 20

X_to_keep_ix = np.where((np.array(X_len) >= min_len) & (np.array(X_len) <= max_len))
y_to_keep_ix = np.where((np.array(y_len) >= min_len) & (np.array(y_len) <= max_len))

to_keep_ix = list(set(np.intersect1d(X_to_keep_ix, y_to_keep_ix)))
len(to_keep_ix) / len(X)

0.9962824457826004

In [7]:
X_small = np.array(X)[to_keep_ix]
y_small = np.array(y)[to_keep_ix]

len(X_small), len(y_small)

(135337, 135337)

In [8]:
# Sanity check
X_len = [len(sentence) for sentence in X_small]
y_len = [len(sentence) for sentence in y_small]

min(X_len), max(X_len), min(y_len), max(y_len)

(2, 20, 2, 20)

In [9]:
X_small[:5], y_small[:5]

(array([list(['i', 'see']), list(['i', 'won']), list(['i', 'won']),
        list(['oh', 'no']), list(['get', 'up'])], dtype=object),
 array([list(['je', 'comprends']), list(["j'ai", 'gagné']),
        list(['je', "l'ai", 'emporté']), list(['oh', 'non']),
        list(['lève', 'toi'])], dtype=object))

### Create word-to-index mapping

In [10]:
def create_word_to_id_mapping(data, max_vocab_size = 20000):
    counter = collections.Counter(np.hstack(data))
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))

    # Pick the most common ones
    count_pairs = count_pairs[:max_vocab_size]

    # Add 'ZERO', 'GO', and 'UNK'
    # It is important to add 'ZERO' in the beginning
    # to make sure zero padding does not interfere with existing words
    count_pairs.insert(0, ('GO', 0))
    count_pairs.insert(0, ('ZERO', 0))
    count_pairs.append(('UNK', 0))

    # Create mapping for both directions
    words, _ = list(zip(*count_pairs))
    word_to_id = dict(zip(words, range(len(words))))
    id_to_word = dict(zip(range(len(words)), words))
    
    # Map words to indexes
    data_id = [[word_to_id[word] if word in word_to_id else word_to_id['UNK'] for word in sentence] for sentence in data]
    
    return word_to_id, id_to_word, data_id

In [11]:
X_word_to_id, X_id_to_word, X_id = create_word_to_id_mapping(X_small)
y_word_to_id, y_id_to_word, y_id = create_word_to_id_mapping(y_small)

In [12]:
print(len(X_id), len(y_id), len(X_word_to_id), len(y_word_to_id))

135337 135337 13512 20003


In [13]:
X_vocab_size, y_vocab_size = len(X_word_to_id), len(y_word_to_id)

In [14]:
print(' '.join([X_id_to_word[i] for i in X_id[0]]))

i see


In [15]:
print(' '.join([y_id_to_word[i] for i in y_id[0]]))

je comprends


### Pad zeros to make sentences equal length

In [16]:
X_id_padded = pad_sequences(X_id, maxlen=max_len, padding='post')
y_id_padded = pad_sequences(y_id, maxlen=max_len, padding='post')

In [9]:
# Sanity check
X_len = [len(sentence) for sentence in X_id_padded]
y_len = [len(sentence) for sentence in y_id_padded]

min(X_len), max(X_len), min(y_len), max(y_len)

(20, 20, 20, 20)

### Leverage pre-trained word vectors

English word vectors downloaded from https://nlp.stanford.edu/projects/glove/

In [18]:
# Code stolen from https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
embeddings_index_en = {}
f = open('data/glove.6B/glove.6B.200d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index_en[word] = coefs
f.close()

French word vectors downloaded from http://fauconnier.github.io/index.html

In [19]:
embeddings_index_fr = word2vec.KeyedVectors.load_word2vec_format(
    'data/frWac_non_lem_no_postag_no_phrase_200_skip_cut100.bin', binary=True)

In [20]:
# Map words to pre-trained embeddings
def map_word_to_pretrained_embedding(embeddings_index, embedding_size, word_to_id):
    vocab_size = len(word_to_id)
    embedding_matrix = np.zeros((vocab_size, embedding_size))
    
    # Keep a running count of matched words
    found = 0
    
    for word, i in word_to_id.items():
        if word in embeddings_index:
            embedding_vector = embeddings_index[word]
            embedding_matrix[i] = embedding_vector
            found += 1
        else:
            # Words not found in embedding index will be randomly initialized
            embedding_matrix[i] = np.random.normal(size=(embedding_size, ))

    return embedding_matrix, found

In [21]:
X_embeddings, X_found = map_word_to_pretrained_embedding(embeddings_index_en, 200, X_word_to_id)
X_embeddings.shape, X_found / X_embeddings.shape[0]

((13512, 200), 0.9655121373593842)

In [22]:
y_embeddings, y_found = map_word_to_pretrained_embedding(embeddings_index_fr, 200, y_word_to_id)
y_embeddings.shape, y_found / y_embeddings.shape[0]

((20003, 200), 0.83257511373294)

### Save processed data

In [23]:
pickle_file = 'data/tatoeba/training_data.pickle'

f = open(pickle_file, 'wb')
save = {
    'X_small': X_small,
    'y_small': y_small,
    'X_word_to_id': X_word_to_id,
    'X_id_to_word': X_id_to_word,
    'y_word_to_id': y_word_to_id,
    'y_id_to_word': y_id_to_word,
    'X_id_padded': X_id_padded,
    'y_id_padded': y_id_padded,
    'X_embeddings': X_embeddings,
    'y_embeddings': y_embeddings
}

pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
f.close()

In [8]:
# Reload data that was processed last time
pickle_file = 'data/tatoeba/training_data.pickle'

with open(pickle_file, 'rb') as f:
    save = pickle.load(f)
    
    X_small = save['X_small']
    y_small = save['y_small']
    X_word_to_id = save['X_word_to_id']
    X_id_to_word = save['X_id_to_word']
    y_word_to_id = save['y_word_to_id']
    y_id_to_word = save['y_id_to_word']
    X_id_padded = save['X_id_padded']
    y_id_padded = save['y_id_padded']
    X_embeddings = save['X_embeddings']
    y_embeddings = save['y_embeddings']
    
    del save

In [10]:
# For testing purposes, only include sentences that start with "I", "you", "he", "she", and "we"
starts_with_subject_pronouns = np.array([
    s[0] in [
        X_word_to_id["i"], X_word_to_id["you"], X_word_to_id["he"],
        X_word_to_id["she"], X_word_to_id["we"]
    ] for s in X_id_padded
])
starts_with_subject_pronouns = np.where(starts_with_subject_pronouns)[0]

X_id_padded, y_id_padded = X_id_padded[starts_with_subject_pronouns], y_id_padded[starts_with_subject_pronouns]
len(X_id_padded), len(y_id_padded)

(51054, 51054)

### Split data into training and test sets

In [11]:
X_id_padded_train, X_id_padded_test, y_id_padded_train, y_id_padded_test = model_selection.train_test_split(
    X_id_padded, y_id_padded, test_size=0.1, random_state=123456)

In [12]:
[e.shape for e in (X_id_padded_train, X_id_padded_test, y_id_padded_train, y_id_padded_test)]

[(45948, 20), (5106, 20), (45948, 20), (5106, 20)]

In [13]:
print(' '.join([X_id_to_word[i] for i in X_id_padded_train[0]]))

we want peace in the world ZERO ZERO ZERO ZERO ZERO ZERO ZERO ZERO ZERO ZERO ZERO ZERO ZERO ZERO


In [14]:
print(' '.join([y_id_to_word[i] for i in y_id_padded_train[0]]))

nous désirons la paix dans le monde ZERO ZERO ZERO ZERO ZERO ZERO ZERO ZERO ZERO ZERO ZERO ZERO ZERO


## Create model

### Encoder

In [15]:
X_embeddings.shape

(13512, 200)

In [16]:
# Convert pre-trained embeddings to a tensor
X_embeddings = torch.FloatTensor(X_embeddings)
y_embeddings = torch.FloatTensor(y_embeddings)

if use_cuda:
    X_embeddings = X_embeddings.cuda()
    y_embeddings = y_embeddings.cuda()

In [17]:
# Create a embedding layer initialized with pre-trained embedding matrix
def create_embedding(init_embeddings, trainable=True):
    vocab_size, embedding_size = init_embeddings.size()
    embedding = nn.Embedding(vocab_size, embedding_size)
    embedding.load_state_dict({'weight': init_embeddings})
    
    if use_cuda:
        embedding = embedding.cuda()
    
    if not trainable:
        for param in embeddings.parameters(): 
            param.requires_grad = False
    
    return embedding, vocab_size, embedding_size

In [18]:
# Check dimensions
create_embedding(X_embeddings)

(Embedding(13512, 200), 13512, 200)

In [19]:
# Create encoder RNN using LSTM
class EncoderRNN(nn.Module):
    def __init__(self, init_embeddings, hidden_size, n_layers=2):
        super(EncoderRNN, self).__init__()
        
        self.embedding, vocab_size, embedding_size = create_embedding(init_embeddings)
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.lstm = nn.LSTM(embedding_size, hidden_size, n_layers, batch_first=True)
        if use_cuda:
            self.lstm = self.lstm.cuda()
    
    def forward(self, input, states):
        output, states = self.lstm(self.embedding(input), states)
        return output, states
    
    def initHidden(self, batch_size):
        init_hidden_state = Variable(torch.zeros(self.n_layers, batch_size, self.hidden_size))
        init_cell_state = Variable(torch.zeros(self.n_layers, batch_size, self.hidden_size))
        
        if use_cuda:
            return (init_hidden_state.cuda(), init_cell_state.cuda())
        else:
            return (init_hidden_state, init_cell_state)

### Decoder

In [20]:
# Randomly initialized weight matrices
def arr(*size):
    return torch.randn(size) / math.sqrt(size[0])

def param(*size):
    if use_cuda:
        return nn.Parameter(arr(*size)).cuda()
    else:
        return nn.Parameter(arr(*size))

In [21]:
# Numpy style dot operation to multiply a 3D matrix with a 2D one
# Based on https://discuss.pytorch.org/t/how-can-i-compute-3d-tensor-2d-tensor-multiplication/639/9
def dot(X, Y):
    return torch.bmm(X, Y.unsqueeze(0).expand(X.size(0), *Y.size()))

$$u^t_i = v^T tanh(W_1′ h_i + W_2′ d_t)$$
$$a^t_i = softmax(u^t_i)$$
$$d_t' = \sum_i^{T_A} a^t_i h_i$$

In [22]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, init_embeddings, hidden_size, n_layers=2):
        super(AttnDecoderRNN, self).__init__()
        
        self.embedding, vocab_size, embedding_size = create_embedding(init_embeddings)
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        
        # Define weights and intercepts used in paper 1412.7449
        # to construct the allignment matrix: u^t_i = v^T tanh(W_1′ h_i + W_2′ d_t)
        self.W1 = param(hidden_size, hidden_size)
        self.W2 = param(hidden_size, hidden_size)
        self.b = param(hidden_size)
        self.v = param(hidden_size)
        
        # Linear layer to reshape hidden state, concatenated with either the previous true label or prediction,
        # back to the shape of hidden state
        # As the new input to LSTM
        self.new_input = nn.Linear(hidden_size + embedding_size, hidden_size)
        
        # LSTM layers using the new concatenated hidden state as the input
        self.lstm = nn.LSTM(hidden_size, hidden_size, n_layers)
        
        # Linear layer to reshape data to the shape of output vocabulary
        self.out = nn.Linear(hidden_size, vocab_size)
        
        if use_cuda:
            self.new_input = self.new_input.cuda()
            self.lstm = self.lstm.cuda()
            self.out = self.out.cuda()
    
    def forward(self, input, states, encoder_outputs):
        # u^t_i = v^T tanh(W_1′ h_i + W_2′ d_t)
        W1h = dot(encoder_outputs, self.W1)            # (batch_size, seq_length, hidden_size)
        hidden_state = states[0]                       # (n_layers, batch_size, hidden_size)
        W2d = hidden_state[-1].mm(self.W2)             # (batch_size, hidden_size)
        W1h_W2d = W1h + W2d.unsqueeze(1) + self.b      # (batch_size, seq_length, hidden_size)
        tahn_W1h_W2d = F.tanh(W1h_W2d)                 # (batch_size, seq_length, hidden_size)
        u = (tahn_W1h_W2d * self.v).sum(2)             # (batch_size, seq_length)
        
        # a^t_i = softmax(u^t_i)
        a = F.softmax(u)                               # (batch_size, seq_length)
        
        # d_t' = \sum_i^{T_A} a^t_i h_i
        encoder_outputs_weighted_sum = (a.unsqueeze(2) * encoder_outputs).sum(1)
                                                       # (batch_size, hidden_size)
        
        # Concatenate with decoder input,
        # which is either the previous true label or prediction
        concat_input = torch.cat((encoder_outputs_weighted_sum, self.embedding(input)), 1)
                                                       # (batch_size, hidden_size + embedding_size)
        
        # Reshape the concatenated input back to the shape of hidden state
        reshaped_input = self.new_input(concat_input)  # (batch_size, hidden_size)
        
        # Feed the new input into the LSTM layer
        output, states = self.lstm(reshaped_input.unsqueeze(0), states)
        output = output.squeeze(0)                     # (batch_size, hidden_size)
        
        # Finally, feed to the output layer
        output = self.out(output)                      # (batch_size, vocab_size)
        output = F.log_softmax(output)                 # (batch_size, vocab_size)
        
        return output, states, a

### Test encoder and decoder

In [23]:
def get_batch(X, y, i, batch_size):
    start = i * batch_size
    end = (i + 1) * batch_size
    return X[start:end], y[start:end]

In [39]:
batch_size = 5
i = 0

X_input, y_input = get_batch(X_id_padded_train, y_id_padded_train, i, batch_size)
X_input, y_input = Variable(torch.from_numpy(X_input).long()), Variable(torch.from_numpy(y_input).long())
X_seq_length, y_seq_length = X_input.size()[1], y_input.size()[1]

In [40]:
hidden_size = 10
encoder = EncoderRNN(X_embeddings, hidden_size)

encoder_states = encoder.initHidden(batch_size)
encoder_states[0].size(), encoder_states[1].size()

(torch.Size([2, 5, 10]), torch.Size([2, 5, 10]))

In [41]:
encoder_outputs, encoder_states = encoder(X_input, encoder_states)
encoder_outputs.size(), encoder_states[0].size(), encoder_states[1].size()

(torch.Size([5, 20, 10]), torch.Size([2, 5, 10]), torch.Size([2, 5, 10]))

In [42]:
decoder = AttnDecoderRNN(y_embeddings, hidden_size)

decoder_states = encoder_states
decoder_input = Variable(torch.LongTensor([X_word_to_id['GO']] * batch_size))

In [43]:
decoder_attentions = np.zeros((batch_size, y_seq_length, y_seq_length))

for i in range(y_seq_length):
    decoder_output, decoder_states, decoder_attention = decoder(decoder_input, decoder_states, encoder_outputs)
    decoder_input = y_input[:, i]
    decoder_attentions[:, i, :] = decoder_attention.data.cpu().numpy()

In [44]:
decoder_output.size(), decoder_states[0].size(), decoder_states[1].size(), decoder_attention.size(), decoder_attentions.shape

(torch.Size([5, 20003]),
 torch.Size([2, 5, 10]),
 torch.Size([2, 5, 10]),
 torch.Size([5, 20]),
 (5, 20, 20))

### Training function

In [24]:
def train(X_input, y_input, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion, teacher_forcing_prob=0.5):
    # Initialize variables
    batch_size, X_seq_length = X_input.size()
    y_seq_length = y_input.size()[1]
    
    encoder_states = encoder.initHidden(batch_size)
    decoder_input = Variable(torch.LongTensor([X_word_to_id['GO']] * batch_size))
    if use_cuda:
        decoder_input = decoder_input.cuda()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    loss = 0

    # Encode
    encoder_outputs, encoder_states = encoder(X_input, encoder_states)
    decoder_states = encoder_states

    # Decode
    if np.random.random() <= teacher_forcing_prob:
        # Teacher forcing: use the true label as the next decoder input
        for i in range(y_seq_length):
            decoder_output, decoder_states, decoder_attention = decoder(decoder_input, decoder_states, encoder_outputs)
            loss += criterion(decoder_output, y_input[:, i])
            decoder_input = y_input[:, i]
    else:
        # Otherwise, use the previous prediction
        for i in range(y_seq_length):
            decoder_output, decoder_states, decoder_attention = decoder(decoder_input, decoder_states, encoder_outputs)
            loss += criterion(decoder_output, y_input[:, i])
            
            # Generate prediction
            top_value, top_index = decoder_output.data.topk(1)
            decoder_input = Variable(top_index.squeeze(1))
            if use_cuda:
                decoder_input = decoder_input.cuda()
    
    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()
    
    return loss.data[0] / y_seq_length

### Function to train an epoch

In [25]:
def train_epoch(X, y, batch_size, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion, teacher_forcing_prob=0.5):
    total_loss = 0
    
    # loop over batches
    epoch_size = len(X) // batch_size
    
    for i in range(epoch_size):
        X_batch, y_batch = get_batch(X, y, i, batch_size)
        
        X_batch = Variable(torch.from_numpy(X_batch).long())
        y_batch = Variable(torch.from_numpy(y_batch).long())
        
        if use_cuda:
            X_batch, y_batch = X_batch.cuda(), y_batch.cuda()
        
        loss = train(X_batch, y_batch, encoder, decoder, encoder_optimizer,
                     decoder_optimizer, criterion, teacher_forcing_prob)
        
        total_loss += loss
        
    return total_loss / epoch_size

### Evaluate

In [26]:
def evaluate(X_input, encoder, decoder, max_len):
    # Initialize variables
    batch_size, X_seq_length = X_input.size()
    
    encoder_states = encoder.initHidden(batch_size)
    decoder_input = Variable(torch.LongTensor([X_word_to_id['GO']] * batch_size))
    if use_cuda:
        decoder_input = decoder_input.cuda()
    
    # Encode
    encoder_outputs, encoder_states = encoder(X_input, encoder_states)
    decoder_states = encoder_states

    # Decode
    decoded_words = np.zeros((batch_size, max_len))
    decoder_attentions = np.zeros((batch_size, max_len, max_len))
    
    for i in range(max_len):
        decoder_output, decoder_states, decoder_attention = decoder(decoder_input, decoder_states, encoder_outputs)
        
        # Generate prediction
        top_value, top_index = decoder_output.data.topk(1)
        decoded_words[:, i] = top_index.squeeze(1).cpu().numpy()
        decoder_attentions[:, i, :] = decoder_attention.data.cpu().numpy()
        
        # Use the prediction as the next decoder input
        decoder_input = Variable(top_index.squeeze(1))
        if use_cuda:
            decoder_input = decoder_input.cuda()
    
    return decoded_words, decoder_attentions

## Run epochs

In [27]:
epochs = 60
max_len = len(X_id_padded[0])
batch_size = 100
hidden_size = 200
learning_rate = 0.005
teacher_forcing_prob = 0.5

encoder = EncoderRNN(X_embeddings, hidden_size)
decoder = AttnDecoderRNN(y_embeddings, hidden_size)

encoder_optimizer = optim.RMSprop(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.RMSprop(decoder.parameters(), lr=learning_rate)

criterion = nn.NLLLoss()
if use_cuda:
    criterion = criterion.cuda()

In [28]:
encoder, decoder

(EncoderRNN (
   (embedding): Embedding(13512, 200)
   (lstm): LSTM(200, 200, num_layers=2, batch_first=True)
 ), AttnDecoderRNN (
   (embedding): Embedding(20003, 200)
   (new_input): Linear (400 -> 200)
   (lstm): LSTM(200, 200, num_layers=2)
   (out): Linear (200 -> 20003)
 ))

In [29]:
# Randomly pick sentences from test sets for testing
X_ix = [i for i, e in enumerate(X_id_padded_test) if X_word_to_id['UNK'] not in e]
y_ix = [i for i, e in enumerate(y_id_padded_test) if y_word_to_id['UNK'] not in e]
ix = list(set(X_ix).intersection(y_ix))

X_id_padded_test_clean, y_id_padded_test_clean = X_id_padded_test[ix], y_id_padded_test[ix]

np.random.seed(123456)
np.random.shuffle(ix)
ix = ix[:3]

X_test = X_id_padded_test_clean[ix]
y_test = y_id_padded_test_clean[ix]

In [30]:
# Translate test sentences and visualize attention
def translate_tests(X_test, y_test):
    X_test_var = Variable(torch.from_numpy(X_test).long())
    if use_cuda:
        X_test_var = X_test_var.cuda()
    translations, decoder_attentions = evaluate(X_test_var, encoder, decoder, max_len)
    
    input_sentences = []
    target_sentences = []
    output_sentences = []
    
    for t in range(X_test.shape[0]):
        input_sentence = ' '.join([X_id_to_word[ix] for ix in X_test[t] if ix > 0])
        target_sentence = ' '.join([y_id_to_word[ix] for ix in y_test[t] if ix > 0])
        
        # Cut off translations at the first 'ZERO' padding
        first_zero_ix = np.where(translations[t] == 0)[0]
        if len(first_zero_ix) > 0:
            output_sentence = ' '.join([y_id_to_word[ix] for ix in translations[t][:first_zero_ix[0]]])
        else:
            output_sentence = ' '.join([y_id_to_word[ix] for ix in translations[t]])
        
        input_sentences.append(input_sentence)
        target_sentences.append(target_sentence)
        output_sentences.append(output_sentence)
        
    return input_sentences, target_sentences, output_sentences, decoder_attentions

In [31]:
for i in range(epochs):
    print('Epoch:', i)
    
    # Shuffle the training data every epoch to avoid local minima
    np.random.seed(i)
    ix = np.arange(len(X_id_padded_train))
    np.random.shuffle(ix)
    
    X_id_padded_train, y_id_padded_train = X_id_padded_train[ix], y_id_padded_train[ix]
    
    # Print out the first sentence in X and y for sanity check
    print('First sentence in English:', ' '.join([X_id_to_word[ix] for ix in X_id_padded_train[0] if ix > 0]))
    print('First sentence in French:', ' '.join([y_id_to_word[ix] for ix in y_id_padded_train[0] if ix > 0]))    
    
    # Train an epoch
    train_loss = train_epoch(X_id_padded_train, y_id_padded_train, batch_size,
                             encoder, decoder, encoder_optimizer,
                             decoder_optimizer, criterion, teacher_forcing_prob)
    
    print('\nTraining loss:', train_loss)
    
    # Save checkpoint
    torch.save(encoder, 'output/encoder_' + str(i))
    torch.save(decoder, 'output/decoder_' + str(i))
    
    # Evaluate
    # Translate test sentences
    input_sentences, target_sentences, output_sentences, decoder_attentions = translate_tests(X_test, y_test)
    
    for j in range(len(input_sentences)):
        print('\nTranslation of', input_sentences[j], ':', output_sentences[j])
        print('Actual translation:', target_sentences[j])

Epoch: 0
First sentence in English: she forgot that she had promised to call him last night
First sentence in French: elle a oublié qu'elle avait promis de l'appeler la nuit passée

Training loss: 2.123671884058869

Translation of i don't know what to do : je ne pas pas que vous
Actual translation: je ne sais que faire

Translation of i never get to see you anymore : je ne que vous que que vous
Actual translation: je n'ai plus l'occasion de te voir

Translation of i never thought i would have to support such a large family : je pense que vous que que que que vous vous
Actual translation: je n'aurais jamais pensé que j'aurais à entretenir une famille aussi grande
Epoch: 1
First sentence in English: you can't do this to us
First sentence in French: tu ne peux pas nous faire ça


  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "



Training loss: 1.7422179851656636

Translation of i don't know what to do : je ne sais pas
Actual translation: je ne sais que faire

Translation of i never get to see you anymore : je ne veux pas de de
Actual translation: je n'ai plus l'occasion de te voir

Translation of i never thought i would have to support such a large family : je n'ai pas pensé de je ne la la la
Actual translation: je n'aurais jamais pensé que j'aurais à entretenir une famille aussi grande
Epoch: 2
First sentence in English: i can't imagine living like that
First sentence in French: je ne peux pas imaginer de vivre ainsi

Training loss: 1.5945148443084927

Translation of i don't know what to do : je ne sais pas
Actual translation: je ne sais que faire

Translation of i never get to see you anymore : je ne voulais pas que vous
Actual translation: je n'ai plus l'occasion de te voir

Translation of i never thought i would have to support such a large family : je n'ai jamais pensé que je de de de de
Actual translati


Training loss: 0.7978572899479746

Translation of i don't know what to do : je ne sais pas quoi
Actual translation: je ne sais que faire

Translation of i never get to see you anymore : je ne ai jamais te voir
Actual translation: je n'ai plus l'occasion de te voir

Translation of i never thought i would have to support such a large family : je n'ai jamais pensé que je serait acheter une nouvelle année
Actual translation: je n'aurais jamais pensé que j'aurais à entretenir une famille aussi grande
Epoch: 16
First sentence in English: she wiped away her tears
First sentence in French: elle essuya ses larmes

Training loss: 0.7677656242270868

Translation of i don't know what to do : je ne sais pas quoi
Actual translation: je ne sais que faire

Translation of i never get to see you anymore : je ne suis pas qu'à te
Actual translation: je n'ai plus l'occasion de te voir

Translation of i never thought i would have to support such a large family : je n'ai jamais pensé que je serait petite pe


Training loss: 0.5820087414681268

Translation of i don't know what to do : je ne sais quoi quoi
Actual translation: je ne sais que faire

Translation of i never get to see you anymore : je ne fais jamais jamais vous voir
Actual translation: je n'ai plus l'occasion de te voir

Translation of i never thought i would have to support such a large family : je n'ai jamais pensé que je serais emprunter emprunter une nouvelle année année
Actual translation: je n'aurais jamais pensé que j'aurais à entretenir une famille aussi grande
Epoch: 29
First sentence in English: i don't feel like singing
First sentence in French: je ne suis pas d'humeur à chanter

Training loss: 0.566269355869501

Translation of i don't know what to do : je ne sais pas quoi
Actual translation: je ne sais que faire

Translation of i never get to see you anymore : je ne jamais jamais jamais vous mettre
Actual translation: je n'ai plus l'occasion de te voir

Translation of i never thought i would have to support such a la


Training loss: 0.5017045326482235

Translation of i don't know what to do : je ne sais pas
Actual translation: je ne sais que faire

Translation of i never get to see you anymore : je ne fais jamais jamais pour voir
Actual translation: je n'ai plus l'occasion de te voir

Translation of i never thought i would have to support such a large family : je n'ai jamais pensé que je voudrais une d'une femme comme une fille
Actual translation: je n'aurais jamais pensé que j'aurais à entretenir une famille aussi grande
Epoch: 42
First sentence in English: i will go to new york next week
First sentence in French: j'irai à new york la semaine prochaine

Training loss: 0.4951638074222477

Translation of i don't know what to do : je ne sais pas quoi faire
Actual translation: je ne sais que faire

Translation of i never get to see you anymore : je ne choisit jamais jamais vous vous
Actual translation: je n'ai plus l'occasion de te voir

Translation of i never thought i would have to support such a la


Training loss: 0.4522622823195785

Translation of i don't know what to do : je ne sais pas quoi faire
Actual translation: je ne sais que faire

Translation of i never get to see you anymore : je ne ai jamais de te voir
Actual translation: je n'ai plus l'occasion de te voir

Translation of i never thought i would have to support such a large family : je n'ai jamais pensé que je me parler pour une fille aussi fille
Actual translation: je n'aurais jamais pensé que j'aurais à entretenir une famille aussi grande
Epoch: 55
First sentence in English: i have no idea what that guy is thinking
First sentence in French: je n'ai aucune idée de ce que pense ce type

Training loss: 0.454637467523546

Translation of i don't know what to do : je ne sais pas quoi faire
Actual translation: je ne sais que faire

Translation of i never get to see you anymore : je ne dis jamais de vous voir
Actual translation: je n'ai plus l'occasion de te voir

Translation of i never thought i would have to support such 

## Test model

In [32]:
# In the test data, for each input sentence length, randomly selects 5 sentences
X_id_padded_test_sent_len = np.count_nonzero(X_id_padded_test_clean, axis=1)

for i in range(min(X_id_padded_test_sent_len), max(X_id_padded_test_sent_len) + 1):
    ix = np.where(X_id_padded_test_sent_len == i)[0]
    
    np.random.seed(123456)
    np.random.shuffle(ix)
    ix = ix[:5]
    
    if i == min(X_id_padded_test_sent_len):
        X_test = X_id_padded_test_clean[ix]
        y_test = y_id_padded_test_clean[ix]
    else:
        X_test = np.append(X_test, X_id_padded_test_clean[ix], axis=0)
        y_test = np.append(y_test, y_id_padded_test_clean[ix], axis=0)

In [33]:
# Generate translations
input_sentences, target_sentences, output_sentences, decoder_attentions = translate_tests(X_test, y_test)

In [34]:
test_translations = pd.DataFrame({
    'input_sentence': input_sentences,
    'ground_truth_translation': target_sentences,
    'model_translation': output_sentences
})

test_translations = test_translations[['input_sentence', 'model_translation', 'ground_truth_translation']]
test_translations

Unnamed: 0,input_sentence,model_translation,ground_truth_translation
0,i disagree,je n'en suis pas opposé dessus,je ne suis pas d'accord
1,we try,nous sommes préoccupés,on essaye
2,i refuse,je refuse,je refuse
3,i promised,je refuse de,j'ai promis
4,you cheated,vous avez triché,tu as triché
5,we need water,nous avons besoin d'eau,il nous faut de l'eau
6,she is kind,elle est puissante,elle est gentille
7,we shared everything,nous avons tout perdu,nous avons tout partagé
8,i totally forgot,j'ai complètement oublié,j'ai complètement oublié
9,i felt awful,je me suis senti affreusement mal,je me suis sentie affreusement mal


In [36]:
test_translations.to_csv('output/test_translations.csv')