In [None]:
# Report the GPU attached to this runtime (model, driver / CUDA version, memory usage).
!nvidia-smi

Thu Nov 12 02:32:13 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.32.00    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    23W / 300W |      0MiB / 16130MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Mount Google Drive at /content/drive so the notebook can read and write files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


* @file NLP進階 / Seq2seq
  * @brief Seq2seq 模型實作 

  * 此份程式碼是以教學為目的，附有完整的架構解說。

  * @author 人工智慧科技基金會 AI 工程師 - 康文瑋
  * Email: run963741@aif.tw
  * Resume: https://www.cakeresume.com/run963741

  * 最後更新日期: 2020/11/13

# Sequence to sequence (Seq2seq)

[Sequence to sequence](https://papers.nips.cc/paper/5346-sequence-to-sequence-learning-with-neural-networks.pdf) (序列到序列) 模型是在 2014 年由 Google 團隊的 Sutskever 等人所提出，這個架構首次將類神經網路應用在機器翻譯任務上，造成了不小的轟動，此架構成為往後自然語言處理領域的各種模型的基礎架構，許多知名的模型都是由此架構下去作延伸修改。

Seq2seq 常見的應用場景例如：
* 機器翻譯 (Machine Translation)：語言之間的翻譯，兩種語言的序列長度通常會不一樣。
* 文本摘要 (Text Summarization)：從文本中萃取最重要的摘要，兩者的序列長度一定不一樣。
* 聊天機器人 (Chatbot): 聊天一來一往的句子長度通常會不一樣。

<figure>
<center>
<img src='https://drive.google.com/uc?export=view&id=19OpGD0XFy1W-IOmBaPOMfv195vU-rr4O' width="800"/>
<figcaption></figcaption></center>
</figure>

# Environment

#### - Tensorflow 2.3.0
#### - python3.7

# 載入套件

In [None]:
import tensorflow_datasets as tfds
import tensorflow_addons as tfa
import tensorflow as tf
import os
import tqdm
import unicodedata
import re
import io
from pprint import pprint
from sklearn.model_selection import train_test_split

import time
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

# Work from the shared course folder on the mounted Drive; the relative paths used
# later (output_dir, download_dir) are resolved against this directory.
# NOTE(review): this absolute path is specific to the author's Drive -- adjust it before running elsewhere.
os.chdir('/content/drive/Shared drives/類技術班教材/標準版/NLP進階/Seq2seq 系列模型/Sequence_to_sequence')
# Print the TensorFlow version in use.
print(tf.__version__)

2.3.0


# 建立資料夾路徑

`en_vocab_file`: 儲存英文字典 (vocabulary) 路徑

`sp_vocab_file`: 儲存西文字典 (vocabulary) 路徑

`checkpoint_path`: 儲存模型路徑

`download_dir`: 資料儲存路徑

In [None]:
# Output locations (relative to the current working directory):
#   output_dir     - root folder for artifacts produced by this notebook
#   en_vocab_file  - path used to save / load the English subword vocabulary
#   sp_vocab_file  - path used to save / load the Spanish subword vocabulary
#   download_dir   - cache folder for the downloaded dataset archive
output_dir = "nmt_seq2seq"
en_vocab_file = os.path.join(output_dir, "en_vocab")
sp_vocab_file = os.path.join(output_dir, "sp_vocab")
download_dir = "tensorflow-datasets/downloads"

# exist_ok=True keeps the cell idempotent and avoids the check-then-create race.
for _directory in (output_dir, download_dir):
    os.makedirs(_directory, exist_ok=True)

# 下載 `Tensorflow` 範例資料集

下載英文和西班牙文的範例資料集。

In [None]:
# Fetch the English-Spanish sentence-pair archive into `download_dir` and unpack it.
dataset_url = 'http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip'
path_to_zip = tf.keras.utils.get_file(
    'spa-eng.zip',
    origin=dataset_url,
    extract=True,
    cache_dir=download_dir,
)

# The sentence pairs are read from spa-eng/spa.txt next to the downloaded archive.
archive_dir = os.path.dirname(path_to_zip)
path_to_file = archive_dir + "/spa-eng/spa.txt"

print('--' * 20)
print('資料路徑: ', path_to_file)

----------------------------------------
資料路徑:  tensorflow-datasets/downloads/datasets/spa-eng/spa.txt


# 資料前處理

## 字串處理

在 Unicode 中，某些字符能夠用多種合法的底層編碼，例如在以下範例中，西班牙文字 $\tilde{n}$ 可以由兩種編碼來表示，這種情況會導致後續建立語言模型時產生問題，所以我們要使用 `unicodedata` 將文字做標準化。

In [None]:
# Precomposed spelling: a single code point (U+00F1) for the n-with-tilde.
s1 = 'Spicy Jalape\u00f1o'
# Decomposed spelling: a plain "n" followed by the combining tilde (U+0303).
s2 = 'Spicy Jalapen\u0303o'

# Both render the same on screen, yet compare unequal as Python strings.
for label, text in (('s1: ', s1), ('s2: ', s2)):
    print(label, text)
strings_match = (s1 == s2)
print('s1 和 s2 是否一樣: ', strings_match)

s1:  Spicy Jalapeño
s2:  Spicy Jalapeño
s1 和 s2 是否一樣:  False


### `unicodedata` 標準化範例

In [None]:
# https://python3-cookbook.readthedocs.io/zh_CN/latest/c02/p09_normalize_unicode_text_to_regexp.html
# NFD decomposes characters into base letter + combining marks, so both spellings
# end up as the same code-point sequence and compare equal.
s1_normalized, s2_normalized = (unicodedata.normalize('NFD', text) for text in (s1, s2))

for label, text in (('s1: ', s1_normalized), ('s2: ', s2_normalized)):
    print(label, text)
print('s1 和 s2 是否一樣: ', s1_normalized == s2_normalized)

s1:  Spicy Jalapeño
s2:  Spicy Jalapeño
s1 和 s2 是否一樣:  True


## 字串處理函數

In [None]:
def unicode_to_ascii(s):
  """Strip accents from `s`.

  The string is decomposed with NFD and every character whose Unicode
  category is 'Mn' (nonspacing mark) is dropped; all other characters are
  kept in their original order.
  """
  decomposed = unicodedata.normalize('NFD', s)
  kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
  return ''.join(kept)

def preprocess_sentence(w):
  """Lower-case, de-accent and clean a single sentence.

  After lower-casing, trimming and accent removal, three substitutions run
  in order and the result is trimmed again.
  """
  cleaned = unicode_to_ascii(w.lower().strip())

  # Regex references: https://www.runoob.com/regexp/regexp-syntax
  # http://ccckmit.wikidot.com/regularexpression
  substitutions = (
      # Replaces each listed punctuation mark with itself, i.e. leaves the text unchanged.
      (r"([?.!,¿])", r"\1"),
      # Collapse runs of spaces / double quotes into one space.
      (r'[" "]+', " "),
      # Replace every run of characters other than a-z, A-Z, ".", "?", "!", ",", "¿" with one space.
      (r"[^a-zA-Z?.!,¿]+", " "),
  )
  for pattern, replacement in substitutions:
    cleaned = re.sub(pattern, replacement, cleaned)

  return cleaned.strip()

In [None]:
def create_dataset(path, num_examples):
  """Read a tab-separated parallel corpus and return its cleaned columns.

  Args:
    path: UTF-8 text file with one example per line, fields separated by tabs.
    num_examples: number of leading lines to use; None uses every line.

  Returns:
    A zip iterator that yields one tuple per tab-separated column, each
    holding the preprocessed sentences of that column.
  """
  # Use a context manager so the file handle is closed as soon as the text is read.
  with io.open(path, encoding='UTF-8') as corpus_file:
    lines = corpus_file.read().strip().split('\n')

  # Split each line on the tab character (\t) to separate the languages.
  word_pairs = [[preprocess_sentence(w) for w in l.split('\t')] for l in lines[:num_examples]]

  return zip(*word_pairs)

## 讀取資料集

In [None]:
# Load every sentence pair (num_examples=None): `en` is the first tab-separated column, `sp` the second.
en, sp = create_dataset(path_to_file, None)

In [None]:
# Peek at one aligned pair (index 20) from each column.
print(en[20])
print(sp[20])

wait.
esperen.


## 切割訓練集 (Training) 和測試集 (Testing)

In [None]:
# Hold out 10% of the sentence pairs for testing.
# A fixed random_state makes the split reproducible across kernel restarts; this matters
# because the subword vocabularies are built from the training split and then cached on disk.
SPLIT_SEED = 42
en_train, en_test, sp_train, sp_test = train_test_split(
    en, sp, test_size=0.1, shuffle=True, random_state=SPLIT_SEED)

# Each dataset element is an (English sentence, Spanish sentence) pair of strings.
train_examples = tf.data.Dataset.from_tensor_slices((en_train, sp_train))
test_examples = tf.data.Dataset.from_tensor_slices((en_test, sp_test))

In [None]:
# Report how many sentence pairs ended up in each split.
print('Train size: ', len(en_train))
print('Test size: ', len(en_test))

Train size:  107067
Test size:  11897


In [None]:
# http://ez2learn.com/basic/unicode.html
# Pull the first (English, Spanish) element out of the training dataset and show both tensors.
tmp_en, tmp_sp = next(iter(train_examples))

print('Input english: ', tmp_en)
print('Output spanish: ', tmp_sp)

Input english:  tf.Tensor(b'he sells cars.', shape=(), dtype=string)
Output spanish:  tf.Tensor(b'el vende carros.', shape=(), dtype=string)


## 使用`tfds.deprecated.text.SubwordTextEncoder`載入與建立字典

* `.load_from_file`: 從路徑載入字典
* `.build_from_corpus`: 建立字典
* `.save_to_file`: 儲存字典

In [None]:
%%time
try:
    tokenizer_en = tfds.deprecated.text.SubwordTextEncoder.load_from_file(en_vocab_file) 
    print('Load English vocabulary: %s' % en_vocab_file)
except:
    print('Build English vocabulary: %s' % en_vocab_file)
    tokenizer_en = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus((en.numpy() for en, sp in train_examples),target_vocab_size = 2**13)
    tokenizer_en.save_to_file(en_vocab_file)

Load English vocabulary: nmt_seq2seq/en_vocab
CPU times: user 32.5 ms, sys: 6.06 ms, total: 38.5 ms
Wall time: 629 ms


In [None]:
%%time
try: 
    tokenizer_sp = tfds.deprecated.text.SubwordTextEncoder.load_from_file(sp_vocab_file) 
    print('Load Spanish vocfabulary: %s' % sp_vocab_file)
except: 
    print('Build Spanish vocabulary: %s' % sp_vocab_file)
    tokenizer_sp = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus((sp.numpy() for en, sp in train_examples), target_vocab_size = 2**13)
    tokenizer_sp.save_to_file(sp_vocab_file)

Load Spanish vocfabulary: nmt_seq2seq/sp_vocab
CPU times: user 32.1 ms, sys: 4.72 ms, total: 36.8 ms
Wall time: 613 ms


### 字典大小以及 `subword`

In [None]:
# Show the size of each subword vocabulary and the first ten subwords of each.
print('English vocabulary size: ', tokenizer_en.vocab_size)
print('Spanish vocabulary size: ', tokenizer_sp.vocab_size)
print('-' * 30)
print('English subwords: ', tokenizer_en.subwords[:10])
print('Spanish subwords: ', tokenizer_sp.subwords[:10])

English vocabulary size:  8260
Spanish vocabulary size:  8078
------------------------------
English subwords:  ['i_', 'the_', 'to_', 'you_', 'tom_', 'a_', 't_', 'is_', 'he_', 's_']
Spanish subwords:  ['que_', 'de_', 'el_', 'a_', 'no_', 'la_', 'tom_', '¿', 'en_', 'es_']


### Example

英文的斷詞方式是以 [`wordpiece`](https://arxiv.org/pdf/1609.08144.pdf) 進行斷詞。

In [None]:
# Round-trip demo: text -> subword ids -> individual subwords -> text.
sample_string = 'Transformer is awesome.'

tokenized_string_token = tokenizer_en.encode(sample_string)
print('Tokenized string token is \n{}'.format(tokenized_string_token))

print('-'*20)
# Decode each id on its own to see which piece of text it stands for.
tokenized_string = [tokenizer_en.decode([ts]) for ts in tokenized_string_token]
print('Tokenized string is \n{}'.format(tokenized_string))

print('-'*20)
original_string = tokenizer_en.decode(tokenized_string_token)
print('The original string: \n{}'.format(original_string))

# Encoding followed by decoding must reproduce the input exactly.
assert original_string == sample_string

Tokenized string token is 
[8088, 2692, 8119, 6761, 8, 4368, 8050]
--------------------
Tokenized srting is 
['T', 'ran', 's', 'former ', 'is ', 'awesome', '.']
--------------------
The original string: 
Transformer is awesome.


## 添加`<BOS>`,`<EOS>`在句子頭尾

<figure>
<center>
<img src='https://drive.google.com/uc?export=view&id=11_d2u6W8t3qi_T8x22jKQ-MYnBNM6-JW' width="500"/>
<figcaption>Auto Regressive</figcaption></center>
</figure>

Seq2seq 模型的訓練方式以及預測方式都是使用 Auto Regressive 的模式來進行，將當前時間點 $T_1$ 的預測值接在下一個時間點 $T_2$ 的後面，再輸入給模型，直到模型預測出 $<EOS>$ 為止。

* $<BOS>$: 全名為 Begin of sentence，因為第一個時間點 $T_1$ 翻譯時不可能馬上有正確答案，所以會統一輸入 $<BOS>$ ，例如上圖的要完整預測出 $文瑋助教真帥$，第一個時間點 $T_1$ 還沒有 $文$ 這個字，所以使用 $<BOS>$ 來作為輸入。

* $<EOS>$: 全名為 End of sentence，當模型預測出這個 token 時，就代表整個序列預測完畢，如果前處理沒有加上 $<EOS>$，模型就會永無止盡的預測下去。

In [None]:
def encode(en_t, sp_t):
    """Convert one (English, Spanish) string-tensor pair into two lists of token ids.

    Each sentence is wrapped with two extra ids that lie outside the
    tokenizer's own range: `vocab_size` stands for <BOS> and
    `vocab_size + 1` for <EOS>. Every sentence in the dataset goes through
    this step.
    """
    en_bos = tokenizer_en.vocab_size
    sp_bos = tokenizer_sp.vocab_size

    en_ids = [en_bos, *tokenizer_en.encode(en_t.numpy()), en_bos + 1]
    sp_ids = [sp_bos, *tokenizer_sp.encode(sp_t.numpy()), sp_bos + 1]

    return en_ids, sp_ids

In [None]:
# Encode the first training pair and compare it before and after encoding.
en_t, sp_t = next(iter(train_examples))
en_indics, sp_indics = encode(en_t, sp_t)

# The ids reserved for <BOS> / <EOS> sit right after each tokenizer's own vocabulary.
print('英文<BOS>: %d' % tokenizer_en.vocab_size)
print('英文<EOS>: %d' % (tokenizer_en.vocab_size + 1))
print('西文<BOS>: %d' % tokenizer_sp.vocab_size)
print('西文<EOS>: %d' % (tokenizer_sp.vocab_size + 1))

print('-' * 20)
print('Before encode: (two tensor):')
print(en_t)
print(sp_t)
print()
print('After encode: (two array): ')
print(en_indics)
print(sp_indics)

英文<BOS>: 8260
英文<EOS>: 8261
西文<BOS>: 8078
西文<EOS>: 8079
--------------------
Before encode: (two tensor):
tf.Tensor(b'he sells cars.', shape=(), dtype=string)
tf.Tensor(b'el vende carros.', shape=(), dtype=string)

After encode: (two array): 
[8260, 9, 2684, 1892, 8050, 8261]
[8078, 3, 3470, 1836, 7937, 7868, 8079]


### `tf.py_function`

在 Tensorflow 的訓練過程中，所有的計算過程都必須使用 `tf.` 來達成，當有某一些函數操作不涉及到 `tf.` 時，就必須使用 `tf.py_function` 來將函數納入 tensorflow 的計算流程裡面。

In [None]:
# import traceback

# try:
#     train_examples.map(encode)
# except AttributeError:
#     traceback.print_exc()

In [None]:
def tf_encode(en_t, sp_t):
    """Wrap `encode` with tf.py_function so it can be used inside `Dataset.map`.

    Both outputs are declared as int64 tensors.
    """
    return tf.py_function(func=encode, inp=[en_t, sp_t], Tout=[tf.int64, tf.int64])

# Apply tf_encode to the training pairs and show the first encoded element.
tmp_dataset = train_examples.map(tf_encode, num_parallel_calls=tf.data.experimental.AUTOTUNE)
en_indices, sp_indices = next(iter(tmp_dataset))

print('After tf_encode: (two tensor)')
print(en_indices)
print(sp_indices)

After tf_encode: (two tensor)
tf.Tensor([8260    2  693  245 1525    3    2 1820 8050 8261], shape=(10,), dtype=int64)
tf.Tensor([8078  230    6  425  383    2  456   24    6 2001 7868 8079], shape=(12,), dtype=int64)


## 限制句子長度

為了加快訓練速度，使用 `tf.logical_and` 判斷英文與西文句子的長度是否都不超過 `max_length`，並使用 `.filter` 過濾掉過長的句子。

In [None]:
max_length = 50
def filter_max_length(en_t, sp_t, max_length = max_length):
    """Return a boolean tensor that is True when both sequences have at most `max_length` elements."""
    en_fits = tf.size(en_t) <= max_length
    sp_fits = tf.size(sp_t) <= max_length
    return tf.logical_and(en_fits, sp_fits)

# Keep only the pairs that pass the length check.
tmp_dataset = tmp_dataset.filter(filter_max_length)

## Padding

指定 `batch_size` 以及 `padding`，`padding` 會先檢查一個 `batch` 裡面的句子長度，不足最大長度的句子會補 `0` (`padding index`)，因為預設是補 `0` 的關係，所以字典中的  `word index` 必須從 `1` 開始計算，不然會跟 `padding index` 混淆。

In [None]:
batch_size = 64
# padded_shapes=([-1], [-1]) pads each side of the pair to the longest sequence in its batch.
tmp_dataset = tmp_dataset.padded_batch(batch_size=batch_size, padded_shapes=([-1], [-1]))

# Inspect the first padded batch.
en_batch, sp_batch = next(iter(tmp_dataset))

print('英文 batch: ')
print(en_batch)
print('-' * 15)
print('西文 batch: ')
print(sp_batch)

英文 batch: 
tf.Tensor(
[[8260    2  693 ...    0    0    0]
 [8260   27    4 ...    0    0    0]
 [8260   20 6620 ...    0    0    0]
 ...
 [8260    9 2129 ...    0    0    0]
 [8260    1   35 ...    0    0    0]
 [8260   58    2 ...    0    0    0]], shape=(64, 20), dtype=int64)
---------------
西文 batch: 
tf.Tensor(
[[8078  230    6 ...    0    0    0]
 [8078    8   63 ...    0    0    0]
 [8078   14 5860 ...    0    0    0]
 ...
 [8078    3 5315 ...    0    0    0]
 [8078   45  460 ...    0    0    0]
 [8078   64   21 ...    0    0    0]], shape=(64, 20), dtype=int64)


# 將`train_examples`與`test_examples`做同樣處理

* `train`:

 - `map(tf_encode)`: 將字串轉成index
 - `filter(filter_max_length)`:過濾長度
 - `cache()`: cache the dataset to memory to get a speedup while reading from it.
 - `shuffle(buffer_size)`: 打亂buffer裡的資料，確保隨機
 - `padded_batch(batch_size, padded_shapes=([-1],[-1]))`: padding長度 

Tensor-core pipeline: https://www.tensorflow.org/guide/performance/datasets?hl=zh_cn

In [None]:
max_length = 50
batch_size = 64
buffer_size = 15000

def _encode_and_filter(examples):
    """Tokenize every pair with tf_encode, then keep only pairs accepted by filter_max_length."""
    encoded = examples.map(tf_encode, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    return encoded.filter(filter_max_length)

# Training pipeline: encode -> filter -> cache -> shuffle -> padded batches -> prefetch.
train_dataset = _encode_and_filter(train_examples)
train_dataset = train_dataset.cache().shuffle(buffer_size)
train_dataset = train_dataset.padded_batch(batch_size, padded_shapes=([-1],[-1]))
train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)

# Test pipeline: encode -> filter -> padded batches (no cache, shuffle or prefetch).
test_dataset = _encode_and_filter(test_examples).padded_batch(batch_size, padded_shapes=([-1], [-1]))

In [None]:
%%time
# Fetch one batch from the training pipeline (this also times the first read) and print both sides.
en_batch, sp_batch = next(iter(train_dataset))

print('英文 batch tensor: ')
print(en_batch)
print('-' * 20)
print('西文 batch tensor: ')
print(sp_batch)

英文 batch tensor: 
tf.Tensor(
[[8260    1   35 ...    0    0    0]
 [8260   49   14 ...    0    0    0]
 [8260   27    4 ...    0    0    0]
 ...
 [8260   37   58 ... 1060 8050 8261]
 [8260   21    8 ...    0    0    0]
 [8260    5  117 ... 8050 8261    0]], shape=(64, 16), dtype=int64)
--------------------
西文 batch tensor: 
tf.Tensor(
[[8078   45    1 ...    0    0    0]
 [8078  968   18 ...    0    0    0]
 [8078    8   99 ...    0    0    0]
 ...
 [8078    5   64 ... 1231 7868 8079]
 [8078   25   10 ...    0    0    0]
 [8078    7   13 ... 8079    0    0]], shape=(64, 17), dtype=int64)
CPU times: user 9.86 s, sys: 1.62 s, total: 11.5 s
Wall time: 6.94 s


# Seq2seq

架構如下圖，Seq2seq 由編碼器 (Encoder) 以及 解碼器 (Decoder) 所組成，編碼器和解碼器個別都是一個 LSTM，當然也能替換成其他序列模型，例如 RNN、GRU等等 ：

假設有一個序列：
$$
X=\{x_1,x_2,...,x_n\}
$$

* 編碼器 (Encoder): 負責接收輸入序列，序列通過 LSTM 之後，在每一步都會產生一個 hidden state $h_t$ 以及一個 cell memory $c_t$，以 seq2seq 來說，會使用最後一步的 hidden state $h_n$ 來作為解碼器 (Decoder) 的初始狀態，例如以我們的例子英文翻譯成西班牙文來說，輸入給編碼器的句子就是英文句子，當編碼器處理完英文句子之後，會將最後濃縮的資訊丟給解碼器。
$$
h_t, c_t=LSTM(x_t,h_{t-1},c_{t-1})
$$

* 解碼器 (Decoder): 負責接收編碼器最後一步的 hidden state 作為初始狀態，第一個時間點將 $<BOS>$ 輸入給 LSTM，之後的輸出方式就是 auto regressive，直到模型輸出 $<EOS>$ 為止，這個方式上面介紹過了。
$$
\hat{h}_t, \hat{c}_t=LSTM(y_t,\hat{h}_{t-1},\hat{c}_{t-1})
$$

<figure>
<center>
<img src='https://drive.google.com/uc?export=view&id=1AMh4ZXSgXlLrkLPtXOaJXHir4Qt5qUfs' width="700"/>
<figcaption>Sequence to sequence</figcaption></center>
</figure>



## 基本參數設置

In [None]:
# +2 makes room for the two extra ids that encode() adds: <BOS> (vocab_size) and <EOS> (vocab_size + 1).
vocab_inp_size = tokenizer_en.vocab_size + 2
vocab_tar_size = tokenizer_sp.vocab_size + 2

# Size of each token embedding vector.
embedding_dim = 512
# Number of LSTM units, passed to both the Encoder and the Decoder.
units = 1024
# Number of sentence pairs per batch.
batch_size = 64
# Number of passes over the training dataset.
epochs = 20
learning_rate = 0.001

以下程式碼分為兩個主要部分:

1. `class Encoder()`: 通常使用`RNN`系列的模型，例如`RNN`、`LSTM`、`GRU`等等，頂多再加上雙向的架構，所有的`RNN`模型在每個`timestep`都會產生一個`hidden state`，最後一個`timestep`的`hidden state`會當成`Decoder`的`initial state`，也可以稱為`Context vector`。

2. `class Decoder()`: 負責接收`Encoder`的`Context vector`，然後輸入給 `RNN`，然後輸出方式依照`Auto regressive`方式輸出。

## Encoder

 - `return_sequences`：是否返回所有的`timestep`的`hidden_state`。
 - `return_state`：是否額外返回最後一個`timestep`的狀態。`LSTM`會返回`hidden_state`與`cell_state`兩個狀態；注意只有`LSTM`有`cell_state`，其餘像`RNN,GRU`只會返回`hidden_state`。

In [None]:
class Encoder(tf.keras.Model):
    """Embedding layer followed by one LSTM that encodes a batch of source token ids."""

    def __init__(self, vocab_inp_size, embedding_dim, units, batch_size):
        super().__init__()
        self.batch_size = batch_size
        self.units = units
        self.embedding = tf.keras.layers.Embedding(input_dim=vocab_inp_size, output_dim=embedding_dim)
        self.lstm = tf.keras.layers.LSTM(
            units=units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )

    def __call__(self, x):
        """Encode a batch of token ids.

        The LSTM is called without an explicit initial state. It returns the
        per-timestep outputs, the final hidden state and the final cell
        memory; the cell memory is discarded here.

        Returns:
            (outputs for every timestep, final hidden state)
        """
        embedded = self.embedding(x)
        outputs, final_hidden, _final_cell = self.lstm(embedded)
        return outputs, final_hidden

    def initialize_hidden_state(self):
        """Return an all-zero tensor of shape (batch_size, units)."""
        return tf.zeros(shape=(self.batch_size, self.units))

In [None]:
# Build an encoder and run it once on the current English batch to check the output shapes.
encoder = Encoder(vocab_inp_size, embedding_dim, units, batch_size)

# NOTE: `initial_hidden` is created here but is not passed to the encoder call below.
initial_hidden = encoder.initialize_hidden_state()
states, last_state = encoder(en_batch)
print('Encoder hidden state for each timestep: ',states.shape) # (batch_size, sequence_length, units)
print('Encoder last hidden state: ',last_state.shape) # (batch_size, units)

Encoder hidden state for each timestep:  (64, 16, 1024)
Encoder last hidden state:  (64, 1024)


## Decoder

透過`Encoder`得到`context vector`之後，`Decoder`的作用就是決定如何串接`Encoder`的`hidden state`以及進行輸出預測，這邊直接把`Context vector`當成`initial state`。

In [None]:
class Decoder(tf.keras.Model):
  """Single-step LSTM decoder: previous token + LSTM state -> logits over the target vocabulary."""

  def __init__(self, vocab_tar_size, embedding_dim, dec_units):
    super().__init__()
    # Token id -> embedding vector.
    self.embedding = tf.keras.layers.Embedding(vocab_tar_size, embedding_dim)
    # Recurrent layer; also returns its final hidden state and cell memory.
    self.lstm = tf.keras.layers.LSTM(dec_units, return_sequences=True, return_state=True, recurrent_initializer='glorot_uniform')
    # Projects the LSTM output onto the target vocabulary.
    self.fc = tf.keras.layers.Dense(vocab_tar_size)

  def __call__(self, x, hidden):
    """Decode one time step.

    Args:
      x: token ids for the current step, shape (batch_size, 1).
      hidden: [hidden_state, cell_memory] used as the LSTM initial state,
        each of shape (batch_size, dec_units).

    Returns:
      (logits, new_hidden): logits has shape (batch_size, vocab_tar_size);
      new_hidden is the updated [hidden_state, cell_memory] pair, ready to
      be passed back in as `hidden` on the next step.
    """
    # x shape after passing through embedding == (batch_size, 1, embedding_dim)
    x = self.embedding(x)

    # output shape == (batch_size, 1, dec_units)
    # state shape == (batch_size, dec_units)
    # cell_memory shape == (batch_size, dec_units)
    output, state, cell_memory = self.lstm(x, initial_state = hidden)

    # Drop the length-1 time axis: (batch_size, dec_units)
    output = tf.squeeze(output, axis=1)

    # logits shape == (batch_size, vocab_tar_size)
    logits = self.fc(output)

    # Return the UPDATED LSTM state. Returning the incoming `hidden` would make every
    # decoding step restart from the same initial state, so the decoder would never
    # carry information about the tokens it has already produced.
    return logits, [state, cell_memory]

In [None]:
# Build a decoder and run a single step to check the output shapes.
# The step is fed the last column of the Spanish batch, and the encoder's last hidden
# state is used for both entries of the initial [hidden state, cell memory] pair.
decoder = Decoder(vocab_tar_size, embedding_dim, units)
logits, hidden = decoder(sp_batch[:,-1:], [last_state, last_state])

print('logits shape: ', logits.shape) # (batch_size, vocab_tar_size)
print('last_state shape: ', hidden[0].shape) # (batch_size, dec_units)
print('cell memory shape: ', hidden[1].shape) # (batch_size, dec_units)

logits shape:  (64, 8080)
last_state shape:  (64, 1024)
cell memory shape:  (64, 1024)


## Loss and metrics

這裏定義損失函數，當 `Decoder` 預測出每個位置的詞時需要計算損失，這邊使用分類任務的損失函數 `SparseCategoricalCrossentropy`（標籤為整數 index，不需要先轉成 one-hot）。

部分句子會因為 `padding` 而有許多的 `0`，但是那並不是真實的標籤，所以必須忽略 `padding` 位置的損失。


In [None]:
def loss_function(real, pred):
    """Sparse cross-entropy with padding positions masked out.

    Args:
        real: integer class labels, shape (batch_size,) or (batch_size, 1);
            label 0 is the padding id and contributes no loss.
        pred: unnormalized logits, shape (batch_size, vocab_tar_size).

    Returns:
        Scalar tensor: the mean of the per-example losses, where padded
        examples count as 0 (they still count in the denominator).
    """
    # True where the label is a real token, False where it is padding (id 0).
    mask = tf.math.logical_not(tf.math.equal(real, 0))

    # from_logits=True: `pred` holds raw logits, not probabilities.
    # reduction='none': keep one loss value per example so padding can be masked.
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

    # loss_ has one entry per example: shape (batch_size,)
    loss_ = loss_object(real, pred)

    # Give the mask exactly the shape of loss_. Without this, a (batch_size, 1) label
    # tensor would broadcast against the (batch_size,) loss into a (batch_size, batch_size)
    # matrix and the mean would be wrong.
    mask = tf.reshape(tf.cast(mask, dtype=loss_.dtype), tf.shape(loss_))
    loss_ *= mask  # zero out the loss at padding positions

    return tf.reduce_mean(loss_)

In [None]:
# Tiny sanity check: two examples over three classes.
# The first label is 0, which loss_function treats as the padding id.
real = tf.constant([[0.],[1.]], dtype=tf.float32)
pred = tf.constant([[0.3,0.2,0.5],[0,1,0]], dtype=tf.float32)

mean_loss = loss_function(real, pred)
print('mean loss for current batch: ', mean_loss.numpy())

mean loss for current batch:  0.42281893


## Optimizer

優化器通常會優先考慮 `Adam`。

近期亦有許多表現不錯的優化器被研究出來，例如 `RangerLars` 在圖像任務上表現上就比 `Adam` 還要好。

優化器表現可以參考: https://github.com/mgrankin/over9000

In [None]:
# optimizer = tf.keras.optimizers.Adam(learning_rate)

In [None]:
# Ranger = RectifiedAdam wrapped in Lookahead.
# Pass the configured `learning_rate` explicitly so the hyper-parameter cell actually
# controls training instead of silently relying on the optimizer's default.
radam = tfa.optimizers.RectifiedAdam(learning_rate=learning_rate)
optimizer = tfa.optimizers.Lookahead(radam, sync_period=6, slow_step_size=0.5)

## Checkpoints

這裏定義儲存模型的方式:

* `checkpoint_path`: 模型儲存路徑
* `ckpt`: 模型中的架構
* `ckpt_manager`: 模型儲存的策略，包含了架構 (`ckpt`), 路徑 (`checkpoint_path`), 儲存最近幾次 (`max_to_keep`)。

In [None]:
# Sub-folder of `output_dir` that holds the checkpoints.
model_name = 'checkpoints_seq2seq'

# Fresh model instances: these replace the encoder / decoder created in the demo cells above.
encoder = Encoder(vocab_inp_size, embedding_dim, units, batch_size)
decoder = Decoder(vocab_tar_size, embedding_dim, units)

checkpoint_path = os.path.join(output_dir, model_name)
# The checkpoint tracks both models and the optimizer; the manager keeps only the 3 most recent saves.
# NOTE(review): no restore call appears in the cells shown here -- confirm whether resuming is intended.
ckpt = tf.train.Checkpoint(encoder = encoder, decoder = decoder, optimizer = optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=3)

## Train step

這邊設定輸入一個英文句子進行訓練的方式，大致上的步驟為:

1. 輸入句子給 `encoder` 獲得最後一步的 `hidden state`，作為 `decoder` 的初始狀態（此模型沒有使用 `attention`）。
2. 定義初始輸入 `<BOS>`。

**以下進入迴圈**

3. 使用 `<BOS>` 輸入給 `decoder` 獲得一個預測詞。
4. 預測詞與真實詞計算損失。
5. 使用 `teacher forcing` 策略，將真實詞丟回給模型繼續預測，而不是把預測詞丟回給模型。

**以上為迴圈**

6. 拿出模型所有參數以及損失，使用 `optimizer` 進行梯度下降。


In [None]:
def train_step(inp, tar, decoder):
    """Run one teacher-forced optimisation step on a single batch.

    Uses the module-level `encoder`, `optimizer`, `tokenizer_sp` and
    `loss_function`.

    Args:
        inp: source token ids, shape (batch, source_length).
        tar: target token ids, shape (batch, target_length); column 0 is <BOS>.
        decoder: the Decoder to train.

    Returns:
        The per-step losses summed over the target sequence and divided by
        the target length.
    """
    loss = 0

    # Operations executed inside the tape are recorded so gradients can be computed afterwards.
    with tf.GradientTape() as tape:
        # Only the encoder's last hidden state is used; the per-timestep outputs are not needed.
        _, last_state = encoder(inp)

        # Seed both the decoder's hidden state and its cell memory with the encoder's last state.
        hidden_state = [last_state, last_state]

        # First decoder input: one <BOS> id (tokenizer_sp.vocab_size) per example.
        # Sized from the batch actually received rather than the global `batch_size`,
        # so a smaller final batch works too.
        dec_input = tf.expand_dims([tokenizer_sp.vocab_size] * int(tar.shape[0]), axis=1)

        # Predict the target one position at a time.
        for t in range(1, tar.shape[1]):
            predictions, hidden_state = decoder(dec_input, hidden_state)

            # Compare the prediction with the true token at position t.
            loss += loss_function(tar[:, t], predictions)

            # Teacher forcing: feed the TRUE token (not the prediction) as the next input,
            # so one early mistake does not derail the rest of the sequence.
            dec_input = tf.expand_dims(tar[:, t], axis=1)

    # Loss reported to the caller, normalised by the target length.
    batch_loss = (loss / int(tar.shape[1]))

    # All trainable parameters of both models.
    variables = encoder.trainable_variables + decoder.trainable_variables

    # Gradients of the summed loss with respect to those parameters.
    gradients = tape.gradient(loss, variables)

    # Clip the global gradient norm to 1.0 before applying the update.
    (gradients, _) = tf.clip_by_global_norm(gradients, clip_norm=1.0)

    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

## Evaluation step

定義評估方式，輸入為英文句子，輸出為預測西文。

基本上與 `train_step` 雷同，差別在於兩個：

1. 沒有進行梯度下降
2. 當模型預測出 `<EOS>` 時，即停止預測。

In [None]:
def evaluate(inp_sentence, decoder):
    """Greedily translate one English sentence.

    Uses the module-level `encoder`, `tokenizer_en`, `tokenizer_sp` and
    `max_length`.

    Returns:
        (predicted sentence as a string, preprocessed input sentence)
    """
    # Clean the raw text the same way the training data was cleaned.
    inp_sentence = preprocess_sentence(inp_sentence)

    # Token ids wrapped in <BOS> ... <EOS>, with a leading batch axis of size 1.
    en_bos = tokenizer_en.vocab_size
    inp_id = [en_bos] + tokenizer_en.encode(inp_sentence) + [en_bos + 1]
    inp_id = tf.expand_dims(inp_id, axis=0)

    # The first decoder input is <BOS>.
    dec_inp = tf.expand_dims([tokenizer_sp.vocab_size] * inp_id.shape[0], axis=0)

    # Encode the source; its last hidden state seeds both entries of the decoder state.
    states, last_state = encoder(inp_id)
    hidden_state = [last_state, last_state]

    sp_eos = tokenizer_sp.vocab_size + 1
    preds_ids = []

    # Decode until <EOS> is predicted or max_length steps have been taken.
    for _ in range(max_length):
        preds, hidden_state = decoder(dec_inp, hidden_state)

        # Greedy choice: the id with the highest score.
        preds_id = tf.argmax(preds[0], axis=0).numpy()
        preds_ids.append(preds_id)

        # Stop as soon as <EOS> has been predicted.
        if preds_id == sp_eos:
            break

        # Otherwise the prediction becomes the next input (with a batch axis).
        dec_inp = tf.expand_dims([preds_id], axis=0)

    # Drop <BOS> / <EOS>: keep only ids that belong to the tokenizer's own vocabulary.
    preds_ids = [token for token in preds_ids if token < tokenizer_sp.vocab_size]

    # Turn the remaining ids back into text.
    preds_sent = tokenizer_sp.decode(preds_ids)

    return preds_sent, inp_sentence

## Training

開始訓練過程

In [None]:
def _pad_to_batch_size(batch_tensor, target_size):
    """Repeat the last row of `batch_tensor` until it has `target_size` rows."""
    missing = target_size - batch_tensor.shape[0]
    filler = tf.convert_to_tensor(np.repeat(batch_tensor[-1:,:], repeats=missing, axis=0))
    return tf.concat([batch_tensor, filler], axis=0)


for epoch in tqdm.tqdm(tf.range(epochs)):
    start = time.time()
    total_loss = 0

    for (batch, (inp, tar)) in enumerate(train_dataset):
        # A short batch is filled up to `batch_size` by duplicating its last sentence pair.
        if inp.shape[0] != batch_size:
            inp = _pad_to_batch_size(inp, batch_size)
            tar = _pad_to_batch_size(tar, batch_size)

        batch_loss = train_step(inp, tar, decoder)
        total_loss += batch_loss

        # Progress report every 50 batches.
        if batch % 50 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, batch, batch_loss.numpy()))

    # Save a checkpoint every second epoch.
    if (epoch + 1) % 2 == 0:
        print('Save Model!')
        ckpt_manager.save()

    # Epoch summary followed by a sample translation as a quick qualitative check.
    print('Epoch {} Loss {:.4f}'.format(epoch + 1, total_loss / (batch+1)))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))
    result, sentence = evaluate('I am hungry.', decoder)
    print('Input: %s' % (sentence))
    print('Predicted translation: %s' % (result))
    print('-'*20)



  0%|          | 0/20 [00:00<?, ?it/s][A[A

Epoch 1 Batch 0 Loss 3.0197
Epoch 1 Batch 50 Loss 3.0482
Epoch 1 Batch 100 Loss 2.2492
Epoch 1 Batch 150 Loss 2.8754
Epoch 1 Batch 200 Loss 2.9355
Epoch 1 Batch 250 Loss 2.4273
Epoch 1 Batch 300 Loss 2.7904
Epoch 1 Batch 350 Loss 2.2738
Epoch 1 Batch 400 Loss 2.4360
Epoch 1 Batch 450 Loss 1.8827
Epoch 1 Batch 500 Loss 2.4871
Epoch 1 Batch 550 Loss 2.4378
Epoch 1 Batch 600 Loss 2.3619
Epoch 1 Batch 650 Loss 1.5520
Epoch 1 Batch 700 Loss 2.3545
Epoch 1 Batch 750 Loss 2.1930
Epoch 1 Batch 800 Loss 2.3394
Epoch 1 Batch 850 Loss 2.2697
Epoch 1 Batch 900 Loss 2.2092
Epoch 1 Batch 950 Loss 2.2289
Epoch 1 Batch 1000 Loss 1.9253
Epoch 1 Batch 1050 Loss 2.2208
Epoch 1 Batch 1100 Loss 1.8076
Epoch 1 Batch 1150 Loss 1.8301
Epoch 1 Batch 1200 Loss 2.3524
Epoch 1 Batch 1250 Loss 1.6508
Epoch 1 Batch 1300 Loss 2.7199
Epoch 1 Batch 1350 Loss 1.6651
Epoch 1 Batch 1400 Loss 2.2500
Epoch 1 Batch 1450 Loss 2.0988
Epoch 1 Batch 1500 Loss 2.0808
Epoch 1 Batch 1550 Loss 2.1355
Epoch 1 Batch 1600 Loss 1.8538




  5%|▌         | 1/20 [13:53<4:23:52, 833.30s/it][A[A

Input: i am hungry.
Predicted translation: el es el es el es el es el es el es el es el es el es el es el es el es el es el es el es el es el es el es el es el es el es el es el es el es el es 
--------------------
Epoch 2 Batch 0 Loss 2.1947
Epoch 2 Batch 50 Loss 1.8193
Epoch 2 Batch 100 Loss 1.4726
Epoch 2 Batch 150 Loss 1.8873
Epoch 2 Batch 200 Loss 2.1290
Epoch 2 Batch 250 Loss 1.6640
Epoch 2 Batch 300 Loss 1.5082
Epoch 2 Batch 350 Loss 1.9386
Epoch 2 Batch 400 Loss 1.6389
Epoch 2 Batch 450 Loss 1.7909
Epoch 2 Batch 500 Loss 1.8571
Epoch 2 Batch 550 Loss 1.9686
Epoch 2 Batch 600 Loss 1.3576
Epoch 2 Batch 650 Loss 1.7869
Epoch 2 Batch 700 Loss 1.9336
Epoch 2 Batch 750 Loss 1.4993
Epoch 2 Batch 800 Loss 1.4318
Epoch 2 Batch 850 Loss 1.4867
Epoch 2 Batch 900 Loss 1.4031
Epoch 2 Batch 950 Loss 2.0272
Epoch 2 Batch 1000 Loss 1.7339
Epoch 2 Batch 1050 Loss 1.5601
Epoch 2 Batch 1100 Loss 1.6037
Epoch 2 Batch 1150 Loss 1.8273
Epoch 2 Batch 1200 Loss 1.8662
Epoch 2 Batch 1250 Loss 1.2605
Ep



 10%|█         | 2/20 [27:47<4:10:02, 833.47s/it][A[A

Input: i am hungry.
Predicted translation: me siento de mi de mi de mi de mi de mi de mi de mi de mi de mi de mi de mi de mi de mi de mi de mi de mi de mi de mi de mi de mi de mi de mi de mi de mi 
--------------------
Epoch 3 Batch 0 Loss 1.3279
Epoch 3 Batch 50 Loss 1.0431
Epoch 3 Batch 100 Loss 1.2602
Epoch 3 Batch 150 Loss 1.2742
Epoch 3 Batch 200 Loss 1.3052
Epoch 3 Batch 250 Loss 1.8326
Epoch 3 Batch 300 Loss 1.5773
Epoch 3 Batch 350 Loss 1.3219
Epoch 3 Batch 400 Loss 1.3477
Epoch 3 Batch 450 Loss 1.3178
Epoch 3 Batch 500 Loss 1.5348
Epoch 3 Batch 550 Loss 1.6523
Epoch 3 Batch 600 Loss 1.2127
Epoch 3 Batch 650 Loss 1.2335
Epoch 3 Batch 700 Loss 0.8563
Epoch 3 Batch 750 Loss 1.4674
Epoch 3 Batch 800 Loss 1.6119
Epoch 3 Batch 850 Loss 1.5323
Epoch 3 Batch 900 Loss 1.1348
Epoch 3 Batch 950 Loss 0.9605
Epoch 3 Batch 1000 Loss 1.0583
Epoch 3 Batch 1050 Loss 1.3144
Epoch 3 Batch 1100 Loss 1.1279
Epoch 3 Batch 1150 Loss 1.4836
Epoch 3 Batch 1200 Loss 1.1074
Epoch 3 Batch 1250 Loss 1.422



 15%|█▌        | 3/20 [41:38<3:55:56, 832.71s/it][A[A

Epoch 3 Loss 1.3599
Time taken for 1 epoch 830.9114964008331 sec

Input: i am hungry.
Predicted translation: hoy estoy muy de hambre.
--------------------
Epoch 4 Batch 0 Loss 1.0405
Epoch 4 Batch 50 Loss 1.2972
Epoch 4 Batch 100 Loss 0.9874
Epoch 4 Batch 150 Loss 1.2532
Epoch 4 Batch 200 Loss 1.4359
Epoch 4 Batch 250 Loss 1.2003
Epoch 4 Batch 300 Loss 0.9684
Epoch 4 Batch 350 Loss 1.3970
Epoch 4 Batch 400 Loss 1.0255
Epoch 4 Batch 450 Loss 1.4267
Epoch 4 Batch 500 Loss 1.1360
Epoch 4 Batch 550 Loss 1.4456
Epoch 4 Batch 600 Loss 0.9346
Epoch 4 Batch 650 Loss 1.2365
Epoch 4 Batch 700 Loss 1.4381
Epoch 4 Batch 750 Loss 1.1315
Epoch 4 Batch 800 Loss 1.2751
Epoch 4 Batch 850 Loss 1.1129
Epoch 4 Batch 900 Loss 1.1902
Epoch 4 Batch 950 Loss 1.3255
Epoch 4 Batch 1000 Loss 1.0045
Epoch 4 Batch 1050 Loss 1.2285
Epoch 4 Batch 1100 Loss 0.8902
Epoch 4 Batch 1150 Loss 0.9575
Epoch 4 Batch 1200 Loss 1.0780
Epoch 4 Batch 1250 Loss 1.2316
Epoch 4 Batch 1300 Loss 0.8927
Epoch 4 Batch 1350 Loss 1.1212




 20%|██        | 4/20 [55:33<3:42:14, 833.40s/it][A[A

Epoch 4 Loss 1.1167
Time taken for 1 epoch 834.9470210075378 sec

Input: i am hungry.
Predicted translation: tengo un poco de hambre.
--------------------
Epoch 5 Batch 0 Loss 1.0960
Epoch 5 Batch 50 Loss 1.1548
Epoch 5 Batch 100 Loss 1.0415
Epoch 5 Batch 150 Loss 0.7405
Epoch 5 Batch 200 Loss 0.9987
Epoch 5 Batch 250 Loss 1.0153
Epoch 5 Batch 300 Loss 0.6127
Epoch 5 Batch 350 Loss 1.2661
Epoch 5 Batch 400 Loss 0.8691
Epoch 5 Batch 450 Loss 1.2279
Epoch 5 Batch 500 Loss 1.1368
Epoch 5 Batch 550 Loss 1.0363
Epoch 5 Batch 600 Loss 0.9660
Epoch 5 Batch 650 Loss 1.0456
Epoch 5 Batch 700 Loss 0.8545
Epoch 5 Batch 750 Loss 0.8876
Epoch 5 Batch 800 Loss 0.9563
Epoch 5 Batch 850 Loss 0.8867
Epoch 5 Batch 900 Loss 0.9110
Epoch 5 Batch 950 Loss 1.1923
Epoch 5 Batch 1000 Loss 0.7647
Epoch 5 Batch 1050 Loss 1.0653
Epoch 5 Batch 1100 Loss 0.8744
Epoch 5 Batch 1150 Loss 0.8691
Epoch 5 Batch 1200 Loss 0.6997
Epoch 5 Batch 1250 Loss 0.6503
Epoch 5 Batch 1300 Loss 0.8652
Epoch 5 Batch 1350 Loss 0.9643




 25%|██▌       | 5/20 [1:09:22<3:28:02, 832.14s/it][A[A

Epoch 5 Loss 0.9404
Time taken for 1 epoch 829.1662862300873 sec

Input: i am hungry.
Predicted translation: tengo hambre.
--------------------
Epoch 6 Batch 0 Loss 0.7488
Epoch 6 Batch 50 Loss 0.6772
Epoch 6 Batch 100 Loss 0.8455
Epoch 6 Batch 150 Loss 0.7431
Epoch 6 Batch 200 Loss 1.0757
Epoch 6 Batch 250 Loss 0.7117
Epoch 6 Batch 300 Loss 0.7121
Epoch 6 Batch 350 Loss 0.6591
Epoch 6 Batch 400 Loss 0.6449
Epoch 6 Batch 450 Loss 0.8083
Epoch 6 Batch 500 Loss 0.9395
Epoch 6 Batch 550 Loss 0.7555
Epoch 6 Batch 600 Loss 0.8676
Epoch 6 Batch 650 Loss 0.7994
Epoch 6 Batch 700 Loss 0.8884
Epoch 6 Batch 750 Loss 0.5577
Epoch 6 Batch 800 Loss 0.8838
Epoch 6 Batch 850 Loss 0.9310
Epoch 6 Batch 900 Loss 0.6778
Epoch 6 Batch 950 Loss 0.8795
Epoch 6 Batch 1000 Loss 0.9135
Epoch 6 Batch 1050 Loss 0.5769
Epoch 6 Batch 1100 Loss 0.8629
Epoch 6 Batch 1150 Loss 0.5128
Epoch 6 Batch 1200 Loss 0.6634
Epoch 6 Batch 1250 Loss 0.5670
Epoch 6 Batch 1300 Loss 0.6092
Epoch 6 Batch 1350 Loss 0.8546
Epoch 6 Bat



 30%|███       | 6/20 [1:23:12<3:14:01, 831.53s/it][A[A

Epoch 6 Loss 0.8000
Time taken for 1 epoch 830.0832040309906 sec

Input: i am hungry.
Predicted translation: tengo hambre.
--------------------
Epoch 7 Batch 0 Loss 0.7184
Epoch 7 Batch 50 Loss 0.7816
Epoch 7 Batch 100 Loss 0.7736
Epoch 7 Batch 150 Loss 0.6671
Epoch 7 Batch 200 Loss 0.6230
Epoch 7 Batch 250 Loss 0.7245
Epoch 7 Batch 300 Loss 0.8476
Epoch 7 Batch 350 Loss 0.7354
Epoch 7 Batch 400 Loss 0.4558
Epoch 7 Batch 450 Loss 0.8819
Epoch 7 Batch 500 Loss 0.6589
Epoch 7 Batch 550 Loss 0.5001
Epoch 7 Batch 600 Loss 0.8309
Epoch 7 Batch 650 Loss 0.5647
Epoch 7 Batch 700 Loss 0.6934
Epoch 7 Batch 750 Loss 0.7174
Epoch 7 Batch 800 Loss 0.5848
Epoch 7 Batch 850 Loss 0.4216
Epoch 7 Batch 900 Loss 0.6696
Epoch 7 Batch 950 Loss 0.6939
Epoch 7 Batch 1000 Loss 0.6788
Epoch 7 Batch 1050 Loss 0.6325
Epoch 7 Batch 1100 Loss 0.4505
Epoch 7 Batch 1150 Loss 0.5914
Epoch 7 Batch 1200 Loss 0.7009
Epoch 7 Batch 1250 Loss 0.6959
Epoch 7 Batch 1300 Loss 0.8022
Epoch 7 Batch 1350 Loss 0.6857
Epoch 7 Bat



 35%|███▌      | 7/20 [1:37:05<3:00:15, 831.95s/it][A[A

Epoch 7 Loss 0.6883
Time taken for 1 epoch 832.9204576015472 sec

Input: i am hungry.
Predicted translation: tengo sed.
--------------------
Epoch 8 Batch 0 Loss 0.5037
Epoch 8 Batch 50 Loss 0.5683
Epoch 8 Batch 100 Loss 0.6821
Epoch 8 Batch 150 Loss 0.3830
Epoch 8 Batch 200 Loss 0.6722
Epoch 8 Batch 250 Loss 0.5629
Epoch 8 Batch 300 Loss 0.6232
Epoch 8 Batch 350 Loss 0.7386
Epoch 8 Batch 400 Loss 0.4222
Epoch 8 Batch 450 Loss 0.6232
Epoch 8 Batch 500 Loss 0.7634
Epoch 8 Batch 550 Loss 0.6351
Epoch 8 Batch 600 Loss 0.6618
Epoch 8 Batch 650 Loss 0.5279
Epoch 8 Batch 700 Loss 0.5824
Epoch 8 Batch 750 Loss 0.5638
Epoch 8 Batch 800 Loss 0.6483
Epoch 8 Batch 850 Loss 0.7355
Epoch 8 Batch 900 Loss 0.7046
Epoch 8 Batch 950 Loss 0.5788
Epoch 8 Batch 1000 Loss 0.5152
Epoch 8 Batch 1050 Loss 0.5699
Epoch 8 Batch 1100 Loss 0.7179
Epoch 8 Batch 1150 Loss 0.6164
Epoch 8 Batch 1200 Loss 0.4782
Epoch 8 Batch 1250 Loss 0.5196
Epoch 8 Batch 1300 Loss 0.6344
Epoch 8 Batch 1350 Loss 0.7036
Epoch 8 Batch 



 40%|████      | 8/20 [1:50:59<2:46:29, 832.47s/it][A[A

Epoch 8 Loss 0.5905
Time taken for 1 epoch 833.6507318019867 sec

Input: i am hungry.
Predicted translation: . tengo mucho hambriento.
--------------------
Epoch 9 Batch 0 Loss 0.4199
Epoch 9 Batch 50 Loss 0.4991
Epoch 9 Batch 100 Loss 0.4199
Epoch 9 Batch 150 Loss 0.4834
Epoch 9 Batch 200 Loss 0.4941
Epoch 9 Batch 250 Loss 0.5025
Epoch 9 Batch 300 Loss 0.6776
Epoch 9 Batch 350 Loss 0.4432
Epoch 9 Batch 400 Loss 0.5432
Epoch 9 Batch 450 Loss 0.5459
Epoch 9 Batch 500 Loss 0.5496
Epoch 9 Batch 550 Loss 0.2950
Epoch 9 Batch 600 Loss 0.4084
Epoch 9 Batch 650 Loss 0.5291
Epoch 9 Batch 700 Loss 0.3878
Epoch 9 Batch 750 Loss 0.5790
Epoch 9 Batch 800 Loss 0.3846
Epoch 9 Batch 850 Loss 0.4079
Epoch 9 Batch 900 Loss 0.5414
Epoch 9 Batch 950 Loss 0.5831
Epoch 9 Batch 1000 Loss 0.6507
Epoch 9 Batch 1050 Loss 0.4886
Epoch 9 Batch 1100 Loss 0.6525
Epoch 9 Batch 1150 Loss 0.6392
Epoch 9 Batch 1200 Loss 0.5316
Epoch 9 Batch 1250 Loss 0.5712
Epoch 9 Batch 1300 Loss 0.4785
Epoch 9 Batch 1350 Loss 0.3386



 45%|████▌     | 9/20 [2:04:52<2:32:39, 832.68s/it][A[A

Epoch 9 Loss 0.5083
Time taken for 1 epoch 833.1216471195221 sec

Input: i am hungry.
Predicted translation: . tengo tres hambriento.
--------------------
Epoch 10 Batch 0 Loss 0.4062
Epoch 10 Batch 50 Loss 0.5377
Epoch 10 Batch 100 Loss 0.3661
Epoch 10 Batch 150 Loss 0.6067
Epoch 10 Batch 200 Loss 0.4001
Epoch 10 Batch 250 Loss 0.3902
Epoch 10 Batch 300 Loss 0.3806
Epoch 10 Batch 350 Loss 0.4064
Epoch 10 Batch 400 Loss 0.4916
Epoch 10 Batch 450 Loss 0.4030
Epoch 10 Batch 500 Loss 0.5440
Epoch 10 Batch 550 Loss 0.4939
Epoch 10 Batch 600 Loss 0.4461
Epoch 10 Batch 650 Loss 0.5344
Epoch 10 Batch 700 Loss 0.4663
Epoch 10 Batch 750 Loss 0.3803
Epoch 10 Batch 800 Loss 0.4210
Epoch 10 Batch 850 Loss 0.4480
Epoch 10 Batch 900 Loss 0.4590
Epoch 10 Batch 950 Loss 0.6042
Epoch 10 Batch 1000 Loss 0.5048
Epoch 10 Batch 1050 Loss 0.3846
Epoch 10 Batch 1100 Loss 0.4651
Epoch 10 Batch 1150 Loss 0.4087
Epoch 10 Batch 1200 Loss 0.4203
Epoch 10 Batch 1250 Loss 0.3156
Epoch 10 Batch 1300 Loss 0.3258
Epoc



 50%|█████     | 10/20 [2:18:48<2:18:58, 833.88s/it][A[A

Epoch 10 Loss 0.4348
Time taken for 1 epoch 836.6505455970764 sec

Input: i am hungry.
Predicted translation: . tengo hambriento.
--------------------
Epoch 11 Batch 0 Loss 0.4131
Epoch 11 Batch 50 Loss 0.4709
Epoch 11 Batch 100 Loss 0.4247
Epoch 11 Batch 150 Loss 0.4063
Epoch 11 Batch 200 Loss 0.3599
Epoch 11 Batch 250 Loss 0.3849
Epoch 11 Batch 300 Loss 0.4355
Epoch 11 Batch 350 Loss 0.3635
Epoch 11 Batch 400 Loss 0.3274
Epoch 11 Batch 450 Loss 0.4122
Epoch 11 Batch 500 Loss 0.3759
Epoch 11 Batch 550 Loss 0.3236
Epoch 11 Batch 600 Loss 0.3724
Epoch 11 Batch 650 Loss 0.3956
Epoch 11 Batch 700 Loss 0.4446
Epoch 11 Batch 750 Loss 0.4246
Epoch 11 Batch 800 Loss 0.3833
Epoch 11 Batch 850 Loss 0.3251
Epoch 11 Batch 900 Loss 0.3154
Epoch 11 Batch 950 Loss 0.4383
Epoch 11 Batch 1000 Loss 0.3120
Epoch 11 Batch 1050 Loss 0.3453
Epoch 11 Batch 1100 Loss 0.3273
Epoch 11 Batch 1150 Loss 0.3082
Epoch 11 Batch 1200 Loss 0.3809
Epoch 11 Batch 1250 Loss 0.3872
Epoch 11 Batch 1300 Loss 0.3723
Epoch 11



 55%|█████▌    | 11/20 [2:32:44<2:05:09, 834.36s/it][A[A

Input: i am hungry.
Predicted translation: . estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy 
--------------------
Epoch 12 Batch 0 Loss 0.4046
Epoch 12 Batch 50 Loss 0.3855
Epoch 12 Batch 100 Loss 0.3476
Epoch 12 Batch 150 Loss 0.4100
Epoch 12 Batch 200 Loss 0.3332
Epoch 12 Batch 250 Loss 0.3618
Epoch 12 Batch 300 Loss 0.2088
Epoch 12 Batch 350 Loss 0.3542
Epoch 12 Batch 400 Loss 0.3369
Epoch 12 Batch 450 Loss 0.2572
Epoch 12 Batch 500 Loss 0.2812
Epoch 12 Batch 550 Loss 0.3408
Epoch 12 Batch 600 Loss 0.3948
Epoch 12 Batch 650 Loss 0.3091
Epoch 12 Batch 700 Loss 0.3883
Epoch 12 Batch 750 Loss 0.3704
Epoch 12 Batch 800 Loss 0.3006
Epoch 12 Batch 850 Loss 0.4087
Epoch 12 Batch 900 L



 60%|██████    | 12/20 [2:46:37<1:51:12, 834.08s/it][A[A

Input: i am hungry.
Predicted translation: . estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy 
--------------------
Epoch 13 Batch 0 Loss 0.2851
Epoch 13 Batch 50 Loss 0.2624
Epoch 13 Batch 100 Loss 0.2336
Epoch 13 Batch 150 Loss 0.2717
Epoch 13 Batch 200 Loss 0.3255
Epoch 13 Batch 250 Loss 0.2429
Epoch 13 Batch 300 Loss 0.3076
Epoch 13 Batch 350 Loss 0.2719
Epoch 13 Batch 400 Loss 0.2542
Epoch 13 Batch 450 Loss 0.3580
Epoch 13 Batch 500 Loss 0.2686
Epoch 13 Batch 550 Loss 0.2205
Epoch 13 Batch 600 Loss 0.3744
Epoch 13 Batch 650 Loss 0.3558
Epoch 13 Batch 700 Loss 0.2648
Epoch 13 Batch 750 Loss 0.3005
Epoch 13 Batch 800 Loss 0.3259
Epoch 13 Batch 850 Loss 0.3181
Epoch 13 Batch 900 L



 65%|██████▌   | 13/20 [3:00:30<1:37:16, 833.79s/it][A[A

Input: i am hungry.
Predicted translation: . estoy hambrientosamente estoy hambrientosamente estoy hambrientosamente estoy hambrientosamente estoy hambrientosamente estoy hambrientosamente estoy hambrientosamente estoy hambrientosamente estoy hambrientosamente estoy hambrientosamente estoy hambrientosamente estoy hambrientosamente estoy hambrientosamente estoy hambrientosamente estoy hambrientosamente estoy hambrientosamente estoy 
--------------------
Epoch 14 Batch 0 Loss 0.3060
Epoch 14 Batch 50 Loss 0.2795
Epoch 14 Batch 100 Loss 0.2501
Epoch 14 Batch 150 Loss 0.2601
Epoch 14 Batch 200 Loss 0.2618
Epoch 14 Batch 250 Loss 0.3178
Epoch 14 Batch 300 Loss 0.2533
Epoch 14 Batch 350 Loss 0.2216
Epoch 14 Batch 400 Loss 0.2507
Epoch 14 Batch 450 Loss 0.2699
Epoch 14 Batch 500 Loss 0.2099
Epoch 14 Batch 550 Loss 0.2618
Epoch 14 Batch 600 Loss 0.2470
Epoch 14 Batch 650 Loss 0.2659
Epoch 14 Batch 700 Loss 0.1993
Epoch 14 Batch 750 Loss 0.2626
Epoch 14 Batch 800 Loss 0.1721
Epoch 14 Batch 850 



 70%|███████   | 14/20 [3:14:22<1:23:18, 833.01s/it][A[A

Input: i am hungry.
Predicted translation: . hambrientoqueria estoy hambrientoqueria estoy hambrientoqueria estoy hambrientoqueria estoy hambrientoqueria estoy hambrientoqueria estoy hambrientoqueria estoy hambrientoqueria estoy hambrientoqueria estoy hambrientoqueria estoy hambrientoqueria estoy hambrientoqueria estoy hambrientoqueria estoy hambrientoqueria estoy hambrientoqueria estoy hambrientoqueria estoy hambriento
--------------------
Epoch 15 Batch 0 Loss 0.2753
Epoch 15 Batch 50 Loss 0.2252
Epoch 15 Batch 100 Loss 0.1355
Epoch 15 Batch 150 Loss 0.2198
Epoch 15 Batch 200 Loss 0.2002
Epoch 15 Batch 250 Loss 0.2053
Epoch 15 Batch 300 Loss 0.2236
Epoch 15 Batch 350 Loss 0.2897
Epoch 15 Batch 400 Loss 0.2176
Epoch 15 Batch 450 Loss 0.1971
Epoch 15 Batch 500 Loss 0.1804
Epoch 15 Batch 550 Loss 0.2230
Epoch 15 Batch 600 Loss 0.2508
Epoch 15 Batch 650 Loss 0.2065
Epoch 15 Batch 700 Loss 0.1989
Epoch 15 Batch 750 Loss 0.1832
Epoch 15 Batch 800 Loss 0.2975
Epoch 15 Batch 850 Loss 0.2241




 75%|███████▌  | 15/20 [3:28:16<1:09:27, 833.42s/it][A[A

Input: i am hungry.
Predicted translation: . hambrientosamente bebio hambrientosamente bebio hambrientosamente bebio hambrientosamente bebio hambrientosamente bebio hambrientosamente bebio hambrientosamente bebio hambrientosamente bebio hambrientosamente bebio hambrientosamente bebio hambrientosamente bebio hambrientosamente bebio hambrientosamente bebio hambrientosamente bebio hambrientosamente bebio hambrientosamente bebio hambriento
--------------------
Epoch 16 Batch 0 Loss 0.1938
Epoch 16 Batch 50 Loss 0.2586
Epoch 16 Batch 100 Loss 0.1813
Epoch 16 Batch 150 Loss 0.2136
Epoch 16 Batch 200 Loss 0.1829
Epoch 16 Batch 250 Loss 0.1954
Epoch 16 Batch 300 Loss 0.2261
Epoch 16 Batch 350 Loss 0.1576
Epoch 16 Batch 400 Loss 0.1551
Epoch 16 Batch 450 Loss 0.1785
Epoch 16 Batch 500 Loss 0.2029
Epoch 16 Batch 550 Loss 0.1882
Epoch 16 Batch 600 Loss 0.2319
Epoch 16 Batch 650 Loss 0.1693
Epoch 16 Batch 700 Loss 0.1975
Epoch 16 Batch 750 Loss 0.1602
Epoch 16 Batch 800 Loss 0.1751
Epoch 16 Batch 



 80%|████████  | 16/20 [3:42:08<55:32, 833.06s/it]  [A[A

Input: i am hungry.
Predicted translation: . estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy 
--------------------
Epoch 17 Batch 0 Loss 0.1985
Epoch 17 Batch 50 Loss 0.1672
Epoch 17 Batch 100 Loss 0.0956
Epoch 17 Batch 150 Loss 0.1301
Epoch 17 Batch 200 Loss 0.1927
Epoch 17 Batch 250 Loss 0.1435
Epoch 17 Batch 300 Loss 0.1469
Epoch 17 Batch 350 Loss 0.1493
Epoch 17 Batch 400 Loss 0.1473
Epoch 17 Batch 450 Loss 0.1683
Epoch 17 Batch 500 Loss 0.1708
Epoch 17 Batch 550 Loss 0.1658
Epoch 17 Batch 600 Loss 0.1467
Epoch 17 Batch 650 Loss 0.1838
Epoch 17 Batch 700 Loss 0.1703
Epoch 17 Batch 750 Loss 0.1594
Epoch 17 Batch 800 Loss 0.1290
Epoch 17 Batch 850 Loss 0.1469
Epoch 17 Batch 900 L



 85%|████████▌ | 17/20 [3:55:59<41:37, 832.40s/it][A[A

Input: i am hungry.
Predicted translation: . bebio hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy hambrientomente estoy 
--------------------
Epoch 18 Batch 0 Loss 0.2008
Epoch 18 Batch 50 Loss 0.1390
Epoch 18 Batch 100 Loss 0.1647
Epoch 18 Batch 150 Loss 0.1180
Epoch 18 Batch 200 Loss 0.1653
Epoch 18 Batch 250 Loss 0.1471
Epoch 18 Batch 300 Loss 0.1050
Epoch 18 Batch 350 Loss 0.1471
Epoch 18 Batch 400 Loss 0.1341
Epoch 18 Batch 450 Loss 0.2030
Epoch 18 Batch 500 Loss 0.2001
Epoch 18 Batch 550 Loss 0.2050
Epoch 18 Batch 600 Loss 0.1548
Epoch 18 Batch 650 Loss 0.1616
Epoch 18 Batch 700 Loss 0.1304
Epoch 18 Batch 750 Loss 0.1390
Epoch 18 Batch 800 Loss 0.1637
Epoch 18 Batch 850 Loss 0.1945
Epoch 18 Batch 900 L



 90%|█████████ | 18/20 [4:09:51<27:44, 832.33s/it][A[A

Input: i am hungry.
Predicted translation: . castillo estoy vitaminataletaletaletaletaletaletaletaletaletaletaletaletaletaletaletaletaletaletaletaletaletaletaletaletaletaletaletaletaletaletaletaletaletaletaletaletaletaletaletaletaletaletaletaletaletale
--------------------
Epoch 19 Batch 0 Loss 0.1763
Epoch 19 Batch 50 Loss 0.2050
Epoch 19 Batch 100 Loss 0.1382
Epoch 19 Batch 150 Loss 0.1366
Epoch 19 Batch 200 Loss 0.1322
Epoch 19 Batch 250 Loss 0.1437
Epoch 19 Batch 300 Loss 0.1094
Epoch 19 Batch 350 Loss 0.1782
Epoch 19 Batch 400 Loss 0.1652
Epoch 19 Batch 450 Loss 0.1320
Epoch 19 Batch 500 Loss 0.1843
Epoch 19 Batch 550 Loss 0.1317
Epoch 19 Batch 600 Loss 0.1137
Epoch 19 Batch 650 Loss 0.0884
Epoch 19 Batch 700 Loss 0.1321
Epoch 19 Batch 750 Loss 0.1062
Epoch 19 Batch 800 Loss 0.1686
Epoch 19 Batch 850 Loss 0.1394
Epoch 19 Batch 900 Loss 0.1251
Epoch 19 Batch 950 Loss 0.1044
Epoch 19 Batch 1000 Loss 0.1064
Epoch 19 Batch 1050 Loss 0.1290
Epoch 19 Batch 1100 Loss 0.1385
Epoch 19 Batc



 95%|█████████▌| 19/20 [4:23:42<13:51, 831.98s/it][A[A

Input: i am hungry.
Predicted translation: . montontalequeria estoy vitaminaqueria estoy vitaminaqueria estoy vitaminaqueria estoy vitaminaqueria estoy vitaminaqueria estoy vitaminaqueria estoy vitaminaqueria estoy vitaminaqueria estoy vitaminaqueria estoy vitaminaqueria estoy vitaminaqueria estoy vitaminaqueria estoy vitaminaqueria estoy vitaminaqueria estoy vitaminaqueria estoy 
--------------------
Epoch 20 Batch 0 Loss 0.1351
Epoch 20 Batch 50 Loss 0.1250
Epoch 20 Batch 100 Loss 0.1061
Epoch 20 Batch 150 Loss 0.1506
Epoch 20 Batch 200 Loss 0.1133
Epoch 20 Batch 250 Loss 0.1530
Epoch 20 Batch 300 Loss 0.1119
Epoch 20 Batch 350 Loss 0.1573
Epoch 20 Batch 400 Loss 0.1265
Epoch 20 Batch 450 Loss 0.1701
Epoch 20 Batch 500 Loss 0.1537
Epoch 20 Batch 550 Loss 0.1152
Epoch 20 Batch 600 Loss 0.0949
Epoch 20 Batch 650 Loss 0.1418
Epoch 20 Batch 700 Loss 0.1120
Epoch 20 Batch 750 Loss 0.1351
Epoch 20 Batch 800 Loss 0.1055
Epoch 20 Batch 850 Loss 0.0866
Epoch 20 Batch 900 Loss 0.1398
Epoch 20 



100%|██████████| 20/20 [4:37:40<00:00, 833.02s/it]

Input: i am hungry.
Predicted translation: . castillo estoy vitaminasamente estoy vitaminasamente estoy vitaminasamente estoy vitaminasamente estoy vitaminasamente estoy vitaminasamente estoy vitaminasamente estoy vitaminasamente estoy vitaminasamente estoy vitaminasamente estoy vitaminasamente estoy vitaminasamente estoy vitaminasamente estoy vitaminasamente estoy vitaminasamente estoy vitaminasamente 
--------------------





## Evaluation

### 載入模型

這邊載入模型的方式是先建立模型，再把參數載入模型

In [None]:
# Checkpoints live in the `model_name` sub-directory of `output_dir`
model_name = 'checkpoints_seq2seq'
checkpoint_path = os.path.join(output_dir, model_name)

# Instantiate the encoder / decoder first; saved parameters are loaded into
# these objects by a later `ckpt.restore(...)` call
encoder = Encoder(vocab_inp_size, embedding_dim, units, batch_size)
decoder = Decoder(vocab_tar_size, embedding_dim, units)

# Group the trackable objects into one checkpoint, and let a manager handle
# the checkpoint files in `checkpoint_path` (at most 3 are kept)
ckpt = tf.train.Checkpoint(
    encoder=encoder,
    decoder=decoder,
    optimizer=optimizer,
)
ckpt_manager = tf.train.CheckpointManager(
    checkpoint=ckpt,
    directory=checkpoint_path,
    max_to_keep=3,
)

In [None]:
# List the checkpoint paths currently tracked by the manager, i.e. how many
# saved models are available in the checkpoint directory
ckpt_manager.checkpoints

['nmt_seq2seq/checkpoints_seq2seq/ckpt-8',
 'nmt_seq2seq/checkpoints_seq2seq/ckpt-9',
 'nmt_seq2seq/checkpoints_seq2seq/ckpt-10']

In [None]:
# Choose which saved model to load: index 0 picks the first path in
# `ckpt_manager.checkpoints` (ckpt-8 in the recorded output). Change the index
# to load a different checkpoint. `restore` returns a load-status object,
# which is what this cell displays.
ckpt.restore(ckpt_manager.checkpoints[0])

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7ff64d780630>

### 預測效果

西文翻譯網址: https://context.reverso.net/translation/spanish-english/estados+unidos+es+un+pais+mas+bonito

In [None]:
def translate(sentence, decoder):
  """Translate one sentence and print both the input and the prediction.

  Delegates to `evaluate(sentence, decoder)`, which returns a
  `(prediction, sentence)` pair; the sentence printed here is the one
  handed back by `evaluate`, not the raw argument.

  Args:
    sentence: source sentence to translate.
    decoder: decoder model forwarded unchanged to `evaluate`.

  Returns:
    None. The result is only written to stdout.
  """
  prediction, echoed_input = evaluate(sentence, decoder)

  # (template, value) pairs, printed in order: input first, then prediction
  report = (
      ('Input:\n %s', echoed_input),
      ('Predicted translation:\n %s', prediction),
  )
  for template, value in report:
    print(template % value)

In [None]:
# Sample English sentence to run through the restored model
inp_sentence = 'We must go.'
translate(sentence=inp_sentence, decoder=decoder)

Input:
 we must go.
Predicted translation:
 podemos ir.
