## Note

Done basic steps, but how to improve performance (not good as [tutorial](https://www.youtube.com/watch?v=EoGUlvhRYpk) - non Attention):
* Apply beam search [pcyin Github](https://github.com/pcyin/pytorch_basic_nmt)
* padding base on [likarajo Github](https://github.com/likarajo/language_translation)
* use pretrained word embedding [likarajo Github](https://github.com/likarajo/language_translation)
* Add Attention into model (next [tutorial](https://www.youtube.com/watch?v=sQUqQddQtB4))

## Setup

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# !pip install spacy -q

In [None]:
# !python -m spacy download fr_core_news_sm -q
# !python -m spacy download en_core_web_sm -q
# !python -m spacy download de_core_news_sm -q

2023-02-05 22:26:58.714744: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m40.6 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')
2023-02-05 22:27:15.574698: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m57.3 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [None]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np

import spacy
import nltk
from tqdm import tqdm

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
device

device(type='cuda')

## Data

### Load data

In [None]:
!gdown --id 19WMw9e1J7EELfTeGB0k8rIbksudEg6Kk

Downloading...
From: https://drive.google.com/uc?id=19WMw9e1J7EELfTeGB0k8rIbksudEg6Kk
To: /content/eng-fra.txt
100% 9.54M/9.54M [00:00<00:00, 49.9MB/s]


In [None]:
SOS_token = 0
EOS_token = 1
PAD_token = 2

class Lang:
    def __init__(self, name):
        self.name = name
        self.max_len = 0
        self.word2index = {}
        self.word2count = {}
        self.index2word = {2:"<PAD>", 0: "<SOS>", 1: "<EOS>"}
        self.n_words = 3  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index: # if not in dict:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else: # count++ if word already in dict
            self.word2count[word] += 1

In [None]:
# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

def readLangs(lang1='eng', lang2='fra', reverse=False):
    print("Reading lines...")

    # Read the file and split into lines
    # /content/gdrive/MyDrive/Colab Notebooks/eaai24/eng-fra.txt
    lines = open('./%s-%s.txt' % (lang1, lang2), encoding='utf-8').read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
    
    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [None]:
MAX_LENGTH = 128*100

def filterPair(p):
  # p: a pair of lang
    return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH

def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

In [None]:
def prepareData(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.max_len = max(input_lang.max_len, len(pair[0]))
        input_lang.addSentence(pair[0])
        output_lang.max_len = max(output_lang.max_len, len(pair[1]))
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs

In [None]:
MAX_LENGTH = 256*2

### Preprocess

In [None]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]

def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)

def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    input_tensor = input_tensor.permute(1, 0)
    pad = (0, MAX_LENGTH - input_tensor.shape[1])
    input_tensor = F.pad(input_tensor, pad, "constant", PAD_token)

    target_tensor = tensorFromSentence(output_lang, pair[1])
    target_tensor = target_tensor.permute(1, 0)
    pad = (0, MAX_LENGTH - target_tensor.shape[1])
    target_tensor = F.pad(target_tensor, pad, "constant", PAD_token)
    # output.shape = (512)
    return (input_tensor.permute(1, 0).squeeze(), target_tensor.permute(1, 0).squeeze())

In [None]:
# pair = random.choice(pairs)
# print(pair)
# print(len(tensorsFromPair(pair)))
# print(tensorsFromPair(pair)[0].shape, tensorsFromPair(pair)[1].shape)
# print(tensorsFromPair(pair)[0][:20])

['compare tes reponses avec celles du professeur .', 'compare your answers with the teacher s .']
2
torch.Size([512, 1]) torch.Size([512, 1])


In [None]:
class MyDataset(torch.utils.data.Dataset):
  def __init__(self, input_lang, output_lang, pairs):
    self.input_lang = input_lang
    self.output_lang = output_lang
    self.pairs = pairs
    self.MAX_LENGTH = 512
  
  def __len__(self):
    return len(self.pairs)
  
  def indexesFromSentence(self, lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]

  def tensorFromSentence(self, lang, sentence):
    indexes = self.indexesFromSentence(lang, sentence)
    indexes.insert(0, SOS_token)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)

  def tensorsFromPair(self, pair):
    input_tensor = self.tensorFromSentence(self.input_lang, pair[0])  # shape = (seq_len, 1)
    input_tensor = input_tensor.permute(1, 0) # shape = (1, seq_len)
    pad = (0, self.MAX_LENGTH - input_tensor.shape[1])
    input_tensor = F.pad(input_tensor, pad, "constant", PAD_token)  # shape = (MAX_LENGTH, 1)

    target_tensor = self.tensorFromSentence(self.output_lang, pair[1])
    target_tensor = target_tensor.permute(1, 0)
    pad = (0, self.MAX_LENGTH - target_tensor.shape[1])
    target_tensor = F.pad(target_tensor, pad, "constant", PAD_token)
    # out.shape = [512]
    return (input_tensor.permute(1, 0).squeeze(), target_tensor.permute(1, 0).squeeze())

  def __getitem__(self, index):
    pair = self.pairs[index]
    return (self.tensorsFromPair(pair), pair)

In [None]:
input_lang, output_lang, pairs = prepareData('eng', 'fra', True)
dataset = MyDataset(input_lang, output_lang, pairs)

Reading lines...
Read 135842 sentence pairs
Trimmed to 135842 sentence pairs
Counting words...
Counted words:
fra 21335
eng 13044


In [None]:
(en_vec, fr_vec), (en, fr) = dataset[15]
en, fr, en_vec.shape, fr_vec.shape

('je l ai emporte !', 'i won !', torch.Size([512]), torch.Size([512]))

In [None]:
dataloader = torch.utils.data.DataLoader(dataset, batch_size=64)

In [None]:
for (en_vec, fr_vec), (en, fr) in dataloader:
  print(en_vec.shape, fr_vec.shape)
  break

torch.Size([64, 512]) torch.Size([64, 512])


In [None]:
input_lang.n_words, output_lang.n_words

(21335, 13044)

In [None]:
embedding_size = 300
hidden_size = 256
num_layers = 2
p = 0.5
embedding = nn.Embedding(input_lang.n_words, embedding_size, padding_idx=PAD_token)
dropout = nn.Dropout(p)
lstm = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)

In [None]:
en_vec = en_vec.squeeze().permute(1, 0)
# en_vec.shape = (seq_len, batch_size)
en_vec.shape

torch.Size([512, 64])

In [None]:
with torch.no_grad():
  embed = embedding(en_vec)
  outputs, (hidden, cell) = lstm(dropout(embed))
  print(embed.shape, outputs.shape, hidden.shape, cell.shape)

torch.Size([512, 64, 300]) torch.Size([512, 64, 256]) torch.Size([2, 64, 256]) torch.Size([2, 64, 256])


## Model

### Encoder

In [None]:
class Encoder(nn.Module):
  def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
    '''
    Args:
      input_size: size of Vocabulary
      embedding_size: size of vec for word2vec
      hidden_size: 1024
      num_layers: 2
      p: dropout rate = 0.5
    '''
    super(Encoder, self).__init__()
    self.dropout = nn.Dropout(p)
    self.hidden_size = hidden_size
    self.num_layers = num_layers

    self.embedding = nn.Embedding(input_size, embedding_size) # output can be (batch, sent_len, embedding_size)
    self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)
  
  def forward(self, x):
    '''
    Args:
      x: has shape = (seq_len, batch_size)

    Return:
      hidden: shape = (D∗num_layers, batch_size, hidden_size if proj_size<=0 else proj_size)
      cell: shape = (D∗num_layers, bact_size, hidden_size)
    '''
    # print(f'Encoder\t x.shape = {x.shape} \t expect (512, batch_size)')
    embedding = self.dropout(self.embedding(x))
    # print(f'Encoder\t embedding.shape = {embedding.shape} \t expect (512, batch_size, 300)')

    # embedding shape = (seq_len, batch_size, embedding_size)
    # LSTM input: shape = (seq_len, batch_size, input_size)
    outputs, (hidden, cell) = self.rnn(embedding) # outputs shape: (seq_length, N, hidden_size)
    # print(f'Encoder\t hidden.shape = {hidden.shape} \t expect ({self.num_layers}, batch_size, {self.hidden_size})')
    # print(f'Encoder\t cell.shape = {cell.shape} \t expect ({self.num_layers}, batch_size, {self.hidden_size})')

    return hidden, cell # error in return shape (expect 2D)

### Decoder

In [None]:
class Decoder(nn.Module):
  def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, p):
    '''
    input_size: size of Vocabulary
    embedding_size: size of vec for word2vec
    hidden_size: same as in Encoder
    output_size: size of Eng vocab (in case of Ger -> Eng)
    num_layers:
    p: dropout rate
    '''
    super(Decoder, self).__init__()
    self.hidden_size = hidden_size
    self.num_layers = num_layers

    self.dropout = nn.Dropout(p)
    self.embedding = nn.Embedding(input_size, embedding_size)
    self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)
    self.fc = nn.Linear(hidden_size, output_size)

  def forward(self, x, hidden, cell):
    '''
    Args:
      x: shape = (batch_size) because we input 1 word each time
      hidden: shape = (D * num_layers, hidden_size)
      cell: current state (for next pred)
    
    Return:
      pred: shape = (batch_size, target_vocab_len)
      hidden, cell: state for next pred
    '''
    # print(f'Decoder\tx.shape = {x.shape} \t expect (batch_size)')
    x = x.unsqueeze(0)  # shape = (1, batch_size) = (seq_len, batch_size) since we use a single word and not a sentence
    # print(f'Decoder\tx.shape = {x.shape} \t expect (1, batch_size)')
    
    embedding = self.dropout(self.embedding(x)) # embedding shape = (1, batch_size, embedding_size)
    # print(f'Decoder\t embedding.shape = {embedding.shape} \t expect (1, batch_size, 300)')
    # print(f'Decoder\t hidden.shape = {hidden.shape} \t cell.shape = {cell.shape}')
    outputs, (hidden, cell) = self.rnn(embedding, (hidden, cell)) # outputs shape = (1, batch_size, hidden_size)
    # print(f'Decoder\t outputs.shape = {outputs.shape} \t expect (1, batch_size, {self.hidden_size})')

    predictions = self.fc(outputs)  # predictions.shape = (1, batch_size, vocab_len)
    predictions = predictions.squeeze(0)  # predictions.shape = (batch_size, target_vocab_len) to send to loss func
    # print(f'Decoder\t predictions.shape = {predictions.shape} \t expect (batch_size, target_vocab_len)')
    return predictions, hidden, cell

### Seq2Seq

In [None]:
class Seq2Seq(nn.Module):
  def __init__(self, encoder: torch.nn.Module, decoder: torch.nn.Module):
    super(Seq2Seq, self).__init__()
    self.encoder = encoder
    self.decoder = decoder

  def forward(self, source, target, teacher_force_ratio=0.5):
    '''
    source: shape = (src_len, batch_size)
    target: shape = (target_len, batch_size)
    teacher_force_ratio: ratio b/w choosing predicted and ground_truth word to use as input for next word prediction
    '''
    batch_size = source.shape[1]  # need modification
    target_len = target.shape[0]  # need modification
    target_vocab_size = output_lang.n_words  # need modification (len of target vocab)

    outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device) # use as output prediction, init w/ zeros

    hidden, cell = self.encoder(source)

    # Grab the first input to the Decoder which will be <SOS> token
    x = target[0]
    # print(f'Seq2Seq\t start x.shape = {x.shape} \t expect (batch_size)')
    for t in range(1, target_len):
      # Use previous hidden, cell as context from encoder at start
      output, hidden, cell = self.decoder(x, hidden, cell)
      # output.shape = (batch_size, target_vocab_len)
      
      # print(f'Seq2Seq\t output.shape = {output.shape} \t expect (batch_size, target_vocab_len)')

      # Store next output prediction
      outputs[t] = output

      # Get the best word the Decoder predicted (index in the vocabulary)
      best_guess = output.argmax(1) # best_guess.shape = (batch_size)
      # print(f'Seq2Seq\t best_guess.shape = {best_guess.shape} \t expect (batch_size)')

      # With probability of teacher_force_ratio we take the actual next word
      # otherwise we take the word that the Decoder predicted it to be.
      # Teacher Forcing is used so that the model gets used to seeing
      # similar inputs at training and testing time, if teacher forcing is 1
      # then inputs at test time might be completely different than what the
      # network is used to. This was a long comment.
      x = target[t] if random.random() < teacher_force_ratio else best_guess

    return outputs

## Training

In [None]:
input_lang, output_lang, pairs = prepareData('eng', 'fra', True)

Reading lines...
Read 135842 sentence pairs
Trimmed to 135842 sentence pairs
Counting words...
Counted words:
fra 21335
eng 13044


In [None]:
# Training hyperparameters
num_epochs = 50
learning_rate = 0.001
batch_size = 64

# Model hyperparameters
load_model = False
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_size_encoder = input_lang.n_words
input_size_decoder = output_lang.n_words
output_size = output_lang.n_words
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 256  # Needs to be the same for both RNN's
num_layers = 2
enc_dropout = 0.5
dec_dropout = 0.5

In [None]:
start_id = 500
data_len = int(6400/2)
pairs = pairs[start_id : start_id + data_len]
dataset = MyDataset(input_lang, output_lang, pairs)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=64)

In [None]:
for (en_vec, fr_vec), (en, fr) in dataloader:
  print(en_vec.shape, fr_vec.shape)
  break

torch.Size([64, 512]) torch.Size([64, 512])


In [None]:
criterion = nn.CrossEntropyLoss(ignore_index=PAD_token)

In [None]:
encoder_net = Encoder(input_size_encoder, encoder_embedding_size, hidden_size, num_layers, enc_dropout).to(device)

In [None]:
decoder_net = Decoder(input_size_decoder, decoder_embedding_size, hidden_size, output_size, num_layers, dec_dropout).to(device)

In [None]:
model = Seq2Seq(encoder_net, decoder_net).to(device)

In [None]:
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
for epoch in range(num_epochs):
  # print(f"[Epoch {epoch} / {num_epochs}]")
  model.train()
  total_loss = 0.0
  for batch_idx, ((en_vec, fr_vec), (en, fr)) in tqdm(enumerate(dataloader), total=len(dataloader)):
    en_vec, fr_vec = en_vec.permute(1, 0), fr_vec.permute(1, 0)
    en_vec = en_vec.to(device)
    fr_vec = fr_vec.to(device)

    # Forward prop
    output = model(en_vec, fr_vec)

    # Output is of shape (trg_len, batch_size, output_dim) but Cross Entropy Loss
    # doesn't take input in that form. For example if we have MNIST we want to have
    # output to be: (N, 10) and targets just (N). Here we can view it in a similar
    # way that we have output_words * batch_size that we want to send in into
    # our cost function, so we need to do some reshapin. While we're at it
    # Let's also remove the start token while we're at it
    output = output[1:].reshape(-1, output.shape[2])  # shape = (trg_len * batch_size, output_dim)
    target = fr_vec[1:].reshape(-1) # shape = (trg_len * batch_size)
    # output[1:]: ignore SOS_token

    optimizer.zero_grad()
    loss = criterion(output, target)
    loss.backward()

    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
    optimizer.step()
    
    total_loss += loss.item()
  print(f"EPOCH = {epoch} \t loss = {total_loss/len(dataloader)}")

In [None]:
# 100%|██████████| 50/50 [07:26<00:00,  8.94s/it]
# EPOCH = 0 	 loss = 5.383958940505981
# 100%|██████████| 50/50 [07:25<00:00,  8.90s/it]
# EPOCH = 1 	 loss = 3.7594539594650267
# 100%|██████████| 50/50 [07:25<00:00,  8.90s/it]
# EPOCH = 2 	 loss = 3.5973013925552366
# 100%|██████████| 50/50 [07:24<00:00,  8.90s/it]
# EPOCH = 3 	 loss = 3.478486728668213
# 100%|██████████| 50/50 [07:25<00:00,  8.90s/it]
# EPOCH = 4 	 loss = 3.4210835123062133
# 100%|██████████| 50/50 [07:24<00:00,  8.88s/it]
# EPOCH = 5 	 loss = 3.356914539337158
# 100%|██████████| 50/50 [07:24<00:00,  8.90s/it]
# EPOCH = 6 	 loss = 3.2783570289611816
# 100%|██████████| 50/50 [07:23<00:00,  8.87s/it]
# EPOCH = 7 	 loss = 3.2372921180725096
# 100%|██████████| 50/50 [07:24<00:00,  8.89s/it]
# EPOCH = 8 	 loss = 3.199459252357483
# 100%|██████████| 50/50 [07:24<00:00,  8.89s/it]
# EPOCH = 9 	 loss = 3.1788624906539917
# 100%|██████████| 50/50 [07:23<00:00,  8.87s/it]
# EPOCH = 10 	 loss = 3.1831029272079467
# 100%|██████████| 50/50 [07:24<00:00,  8.88s/it]
# EPOCH = 11 	 loss = 3.1025032949447633
# 100%|██████████| 50/50 [07:24<00:00,  8.88s/it]
# EPOCH = 12 	 loss = 3.1095044803619385
# 100%|██████████| 50/50 [07:23<00:00,  8.86s/it]
# EPOCH = 13 	 loss = 3.0869728183746337
# 100%|██████████| 50/50 [07:22<00:00,  8.86s/it]
# EPOCH = 14 	 loss = 3.08543803691864
# 100%|██████████| 50/50 [07:22<00:00,  8.84s/it]
# EPOCH = 15 	 loss = 3.0724154567718505
# 100%|██████████| 50/50 [07:21<00:00,  8.83s/it]
# EPOCH = 16 	 loss = 3.0756219959259035
# 100%|██████████| 50/50 [07:22<00:00,  8.85s/it]
# EPOCH = 17 	 loss = 3.017907304763794
# 100%|██████████| 50/50 [07:21<00:00,  8.84s/it]
# EPOCH = 18 	 loss = 3.0001839065551756
# 100%|██████████| 50/50 [07:22<00:00,  8.85s/it]
# EPOCH = 19 	 loss = 3.021025981903076
# 100%|██████████| 50/50 [07:22<00:00,  8.84s/it]
# EPOCH = 20 	 loss = 2.9646836137771606
# 100%|██████████| 50/50 [07:22<00:00,  8.84s/it]
# EPOCH = 21 	 loss = 2.98474328994751
#  28%|██▊       | 14/50 [02:12<05:40,  9.47s/it]


## Eval

In [None]:
def translate_sentence(model, en_vec, output_lang, device, max_length=50):
  model.eval()
  vec = en_vec[0]
  vec = vec.unsqueeze(0)
  
  # Build encoder hidden, cell state
  with torch.no_grad():
      hidden, cell = model.encoder(vec)

  outputs = [SOS_token]

  for _ in range(max_length):
      previous_word = torch.LongTensor([outputs[-1]]).to(device)

      with torch.no_grad():
          output, hidden, cell = model.decoder(previous_word, hidden, cell)
          best_guess = output.argmax(1).item()

      outputs.append(best_guess)

      # Model predicts it's the end of the sentence
      if output.argmax(1).item() == EOS_token:
          break
  
  print(outputs)
  translated_sentence = [output_lang.index2word[idx] for idx in outputs]

  # remove start token
  return translated_sentence

In [39]:
testset = MyDataset(input_lang, output_lang, pairs[-651:])
testloader = torch.utils.data.DataLoader(testset, batch_size=1)

In [53]:
def translate_sentence(model, en_vec, output_lang, device, max_length=50):
  model.eval()
  vec = en_vec[0]
  vec = vec.unsqueeze(0)
  vec = vec.permute(1, 0)
  # print(vec.shape)

  # Build encoder hidden, cell state
  with torch.no_grad():
      hidden, cell = model.encoder(vec)
      # print(hidden.shape, cell.shape)

  outputs = [SOS_token]

  for _ in range(max_length):
      previous_word = torch.LongTensor([outputs[-1]]).to(device)

      with torch.no_grad():
          output, hidden, cell = model.decoder(previous_word, hidden, cell)
          best_guess = output.argmax(1).item()

      outputs.append(best_guess)

      # Model predicts it's the end of the sentence
      if output.argmax(1).item() == EOS_token:
          break

  translated_sentence = [output_lang.index2word[idx] for idx in outputs]
  print(translated_sentence)
  return translated_sentence

In [56]:
for idx, ((en_vec, fr_vec), (en, fr)) in enumerate(testloader):
  # print(en_vec.shape, fr_vec.shape)
  # print(en[0], en_vec[0][:10])
  print(fr[0])
  translate_sentence(model, en_vec, output_lang, device, max_length=50)
  if idx==50: break

i must hurry .
['<SOS>', 'i', 'm', '.', '.', '<EOS>']
i must leave .
['<SOS>', 'i', 'm', '.', '.', '<EOS>']
i must leave .
['<SOS>', 'i', 'm', '.', '.', '<EOS>']
i must study .
['<SOS>', 'i', 'm', '.', '.', '<EOS>']
i need a car .
['<SOS>', 'i', 'm', '.', '.', '<EOS>']
i need a hug .
['<SOS>', 'i', 'm', '.', '.', '<EOS>']
i need a job .
['<SOS>', 'i', 'm', '.', '.', '<EOS>']
i need a job .
['<SOS>', 'i', 'm', '.', '.', '<EOS>']
i need a job .
['<SOS>', 'i', 'm', '.', '.', '<EOS>']
i need a map .
['<SOS>', 'i', 'm', '.', '.', '<EOS>']
i need a pen .
['<SOS>', 'i', 'm', '.', '.', '<EOS>']
i need money .
['<SOS>', 'i', 'm', '.', '.', '<EOS>']
i need paint .
['<SOS>', 'i', 'm', '.', '.', '<EOS>']
i need paint .
['<SOS>', 'i', 'm', '.', '.', '<EOS>']
i need proof .
['<SOS>', 'i', 'm', '.', '.', '<EOS>']
i need proof .
['<SOS>', 'i', 'm', '.', '.', '<EOS>']
i need sleep .
['<SOS>', 'i', 'm', '.', '.', '<EOS>']
i need space .
['<SOS>', 'i', 'm', '.', '.', '<EOS>']
i need sugar .
['<SOS>', 'i'