### STEP 1. 引入相關的函數庫

In [None]:
from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Activation, dot, concatenate, Dropout, BatchNormalization, TimeDistributed
import numpy as np
import os
from IPython.display import Image
%matplotlib inline

### STEP 2. 相關的參數

In [None]:
batch_size = 64 # number of samples per training batch
latent_dim = 256 # dimensionality of the latent space (used as the LSTM unit count)
num_samples = 15000 # upper bound on the number of corpus lines used for training

### STEP 3. 資料的前處理

In [None]:
# Mount Google Drive at /content/drive; the corpus file is read from there.
# This cell only works inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Vectorize the data: collect the sentence pairs and both character vocabularies.
input_texts = []
target_texts = []
input_characters = set() # source-side (English) character set
target_characters = set() # target-side (Chinese) character set

# Read the corpus inside a context manager so the file handle is closed as
# soon as its contents are in memory.
with open('/content/drive/MyDrive/02NLP/cmn-tw.txt', mode='r', encoding='utf-8') as corpus_file:
    lines = corpus_file.read().split('\n')

# Process the corpus line by line. `len(lines)-1` drops the last element of
# the split (presumably the empty string left by a trailing newline).
for line in lines[: min(num_samples, len(lines)-1)]:
    # Keep only the first two tab-separated columns (source, target) so that
    # lines carrying extra columns do not break the unpacking.
    input_text, target_text = line.split('\t')[:2]

    # IMPORTANT: "\t" is used as the start-of-sequence [SOS] character and
    # "\n" as the end-of-sequence [EOS] character of every target sentence.
    target_text = '\t' + target_text + '\n'

    input_texts.append(input_text)
    target_texts.append(target_text)

    # set.update adds every character of the string; duplicates are ignored.
    input_characters.update(input_text)
    target_characters.update(target_text)

input_characters = sorted(input_characters) # all source characters, sorted
target_characters = sorted(target_characters) # all target characters, sorted

num_encoder_tokens = len(input_characters) # number of distinct source characters
num_decoder_tokens = len(target_characters) # number of distinct target characters

max_encoder_seq_length = max(len(txt) for txt in input_texts) # longest source sentence
max_decoder_seq_length = max(len(txt) for txt in target_texts) # longest target sentence (incl. SOS/EOS)

print('Number of samples:', len(input_texts))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

# Source character -> integer index (indices start at 0).
input_token_index = {char: i for i, char in enumerate(input_characters)}

# Target character -> integer index (indices start at 0).
target_token_index = {char: i for i, char in enumerate(target_characters)}

print(input_token_index)
print(target_token_index)

Number of samples: 15000
Number of unique input tokens: 73
Number of unique output tokens: 2435
Max sequence length for inputs: 40
Max sequence length for outputs: 25
{' ': 0, '!': 1, '"': 2, '$': 3, '%': 4, "'": 5, ',': 6, '-': 7, '.': 8, '0': 9, '1': 10, '2': 11, '3': 12, '4': 13, '5': 14, '6': 15, '7': 16, '8': 17, '9': 18, ':': 19, '?': 20, 'A': 21, 'B': 22, 'C': 23, 'D': 24, 'E': 25, 'F': 26, 'G': 27, 'H': 28, 'I': 29, 'J': 30, 'K': 31, 'L': 32, 'M': 33, 'N': 34, 'O': 35, 'P': 36, 'Q': 37, 'R': 38, 'S': 39, 'T': 40, 'U': 41, 'V': 42, 'W': 43, 'Y': 44, 'Z': 45, 'a': 46, 'b': 47, 'c': 48, 'd': 49, 'e': 50, 'f': 51, 'g': 52, 'h': 53, 'i': 54, 'j': 55, 'k': 56, 'l': 57, 'm': 58, 'n': 59, 'o': 60, 'p': 61, 'q': 62, 'r': 63, 's': 64, 't': 65, 'u': 66, 'v': 67, 'w': 68, 'x': 69, 'y': 70, 'z': 71, '’': 72}
{'\t': 0, '\n': 1, ' ': 2, '!': 3, '"': 4, '%': 5, ',': 6, '.': 7, '0': 8, '1': 9, '2': 10, '3': 11, '4': 12, '5': 13, '6': 14, '7': 15, '8': 16, '9': 17, ':': 18, '?': 19, 'A': 20, 'B'

In [None]:
num_pairs = len(input_texts)

# Index-encoded source sentences, shape (num_pairs, max_encoder_seq_length).
encoder_input_data = np.zeros((num_pairs, max_encoder_seq_length), dtype='float32')

# Index-encoded target sentences, shape (num_pairs, max_decoder_seq_length).
decoder_input_data = np.zeros((num_pairs, max_decoder_seq_length), dtype='float32')

# One-hot targets, shifted one timestep to the left of decoder_input_data:
# decoder_target_data[:, t, :] is the one-hot form of decoder_input_data[:, t + 1],
# so the start character never appears as a target.
decoder_target_data = np.zeros(
    (num_pairs, max_decoder_seq_length, num_decoder_tokens), dtype='float32')

# Fill the tensors one sentence pair at a time (label encoding for the inputs,
# one-hot encoding for the targets). Unfilled positions keep the value 0.
# NOTE(review): 0 is also the index of the first character in each sorted
# vocabulary, so padding is indistinguishable from that character - confirm
# whether an index offset / masking was intended.
for row, (src_text, tgt_text) in enumerate(zip(input_texts, target_texts)):
    src_ids = [input_token_index[ch] for ch in src_text]
    tgt_ids = [target_token_index[ch] for ch in tgt_text]

    encoder_input_data[row, :len(src_ids)] = src_ids
    decoder_input_data[row, :len(tgt_ids)] = tgt_ids

    # Position t - 1 of the target holds character t of the decoder input.
    next_ids = tgt_ids[1:]
    decoder_target_data[row, np.arange(len(next_ids)), next_ids] = 1.

### STEP 4. 構建網絡架構



In [None]:
# ===== Encoder ====

# Define the input sequence.
# The sequence length (timesteps) is variable, hence shape=(None,).
encoder_inputs = Input(shape=(None,), name='encoder_input')
# NOTE(review): the embedding table has one more row than the vocabulary size;
# the index dictionaries built earlier start at 0, so the extra row looks unused - confirm.
enc_embed = Embedding(num_encoder_tokens+1, 128, input_length=None, name='enc_embedding')
encoder = LSTM(latent_dim, return_sequences=True, return_state=True, name='encoder_lstm')
# return_state=True exposes the LSTM's final internal states (h, c).
# return_sequences=True exposes the per-timestep outputs that attention needs.
encoder_outputs, state_h, state_c = encoder(enc_embed(encoder_inputs))
# The final states initialise the decoder; `encoder_outputs` is kept as well
# because the attention layers below consume it.
encoder_states = [state_h, state_c]
encoder_outputs_3 = [encoder_outputs, state_h, state_c]

# ==== Decoder ====
# Set up the decoder.
# The output sequence length (timesteps) is variable, hence shape=(None,).
decoder_inputs = Input(shape=(None,), name='decoder_input')
dec_embed = Embedding(num_decoder_tokens+1, 128, input_length=None, name='dec_embedding')
# The decoder returns the full output sequence and its internal states.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, name='decoder_lstm')

# The returned states are ignored during training but are needed at inference time.
# **The decoder's initial state is the encoder's final state.**
decoder_outputs, _, _ = decoder_lstm(dec_embed(decoder_inputs), initial_state=encoder_states) # `encoder_states` is the initial state
# Dot-product attention: contract the feature axis of the decoder and encoder
# outputs to get one score per (decoder step, encoder step) pair.
attention = dot([decoder_outputs, encoder_outputs], axes=[2, 2])
# Normalise the scores over the last axis (the encoder steps).
attention = Activation('softmax')(attention)
# Context vector: attention-weighted sum of the encoder outputs.
context = dot([attention, encoder_outputs], axes=[2,1])
# Join the context with the decoder output, then pass through two tanh layers.
decoder_outputs = concatenate([context, decoder_outputs])
concat_dense = Dense(num_decoder_tokens, activation='tanh', name='concat_output')
decoder_outputs = concat_dense(decoder_outputs)
second_dense = Dense(num_decoder_tokens, activation='tanh', name='second_output')
decoder_outputs = second_dense(decoder_outputs)

# Final dense layer with softmax: probability of each target character.
decoder_dense = Dense(num_decoder_tokens, activation='softmax', name='decoder_output')
decoder_outputs = decoder_dense(decoder_outputs)

# The model takes `encoder_input_data` and `decoder_input_data` as inputs and
# produces `decoder_target_data`.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Print the model structure.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_input (InputLayer)     [(None, None)]       0           []                               
                                                                                                  
 decoder_input (InputLayer)     [(None, None)]       0           []                               
                                                                                                  
 enc_embedding (Embedding)      (None, None, 128)    9472        ['encoder_input[0][0]']          
                                                                                                  
 dec_embedding (Embedding)      (None, None, 64)     155904      ['decoder_input[0][0]']          
                                                                                              

### STEP 5. 訓練模型

In [None]:
# Save the model to ./s2s.h5 each time val_loss reaches a new minimum.
checkpoint = keras.callbacks.ModelCheckpoint('./s2s.h5', monitor='val_loss', mode="min", save_best_only=True, verbose=1)
# Optional callbacks, currently disabled (training therefore always runs all epochs):
# earlystopping = keras.callbacks.EarlyStopping(monitor='val_loss', mode="min", patience=5, verbose=1)
# rlr = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, verbose=1, mode='auto', min_delta=0.0001)

# Configure the optimizer and loss; the targets are one-hot, hence categorical cross-entropy.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

# Start training.
# NOTE(review): per the Keras docs, validation_split takes the validation samples
# from the end of the arrays before shuffling; if the corpus is ordered (e.g. by
# sentence length) the held-out 20% may not be representative - confirm.
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs=250,
          validation_split=0.2,
          callbacks=[checkpoint])

Epoch 1/250
Epoch 1: val_loss improved from inf to 2.26408, saving model to ./s2s.h5
Epoch 2/250
Epoch 2: val_loss improved from 2.26408 to 2.05721, saving model to ./s2s.h5
Epoch 3/250
Epoch 3: val_loss improved from 2.05721 to 1.91355, saving model to ./s2s.h5
Epoch 4/250
Epoch 4: val_loss improved from 1.91355 to 1.84758, saving model to ./s2s.h5
Epoch 5/250
Epoch 5: val_loss improved from 1.84758 to 1.81951, saving model to ./s2s.h5
Epoch 6/250
Epoch 6: val_loss improved from 1.81951 to 1.73953, saving model to ./s2s.h5
Epoch 7/250
Epoch 7: val_loss improved from 1.73953 to 1.71456, saving model to ./s2s.h5
Epoch 8/250
Epoch 8: val_loss improved from 1.71456 to 1.67977, saving model to ./s2s.h5
Epoch 9/250
Epoch 9: val_loss improved from 1.67977 to 1.65874, saving model to ./s2s.h5
Epoch 10/250
Epoch 10: val_loss improved from 1.65874 to 1.61260, saving model to ./s2s.h5
Epoch 11/250
Epoch 11: val_loss improved from 1.61260 to 1.59058, saving model to ./s2s.h5
Epoch 12/250
Epoch 12

<keras.callbacks.History at 0x7f561e144e90>

### STEP 6. 模型預測

![seq2seq_predict](https://4.bp.blogspot.com/-6DALk3-hPtA/WO04i5GgXLI/AAAAAAAABtc/2t9mYz4nQDg9jLoHdTkywDUfxIOFJfC_gCLcB/s640/Seq2SeqDiagram.gif)

seq2seq + attention

In [None]:
# Define the models used for sampling (inference). They reuse the layers
# created for the training model, so they share its weights.

# Encoder model: returns the per-timestep outputs plus the final h and c states.
encoder_model = Model(encoder_inputs, encoder_outputs_3) # encoder_outputs is needed for attention

# Placeholders for the decoder LSTM's initial states and for the encoder outputs.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
encoder_outputs_holder = Input(shape=(None, latent_dim))

decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
encoder_outputs_3holder = [encoder_outputs_holder, decoder_state_input_h, decoder_state_input_c]

# Run the decoder from the supplied initial state.
# NOTE: this rebinds `decoder_outputs`, `state_h`, `state_c`, `attention` and
# `context`, which previously referred to tensors of the training graph.
decoder_outputs, state_h, state_c = decoder_lstm(
    dec_embed(decoder_inputs), initial_state=decoder_states_inputs) # `decoder_states_inputs` is the initial state
decoder_states = [state_h, state_c] # unused in training, needed at inference

# Same attention and dense stack as the training graph, fed from the placeholder.
attention = dot([decoder_outputs, encoder_outputs_holder], axes=[2, 2])
attention = Activation('softmax')(attention)
context = dot([attention, encoder_outputs_holder], axes=[2,1])
decoder_outputs = concatenate([context, decoder_outputs])
decoder_outputs = concat_dense(decoder_outputs)
decoder_outputs = second_dense(decoder_outputs)
decoder_outputs = decoder_dense(decoder_outputs)

# Decoder model: inputs are [target tokens, encoder outputs, h, c];
# outputs are [token probabilities, new h, new c].
decoder_model = Model(
    [decoder_inputs] + encoder_outputs_3holder,
    [decoder_outputs] +  decoder_states)


# Reverse lookup (index -> character) to turn decoded indices into readable text.
# reverse_input_char_index = dict(
#     (i, char) for char, i in input_token_index.items())

reverse_target_char_index = dict(
    (i, char) for char, i in target_token_index.items())

# Decode a sequence
def decode_sequence(input_seq):
    """Greedily translate one index-encoded source sentence into target text.

    Args:
        input_seq: A batch holding exactly one encoded source sentence, in the
            form accepted by ``encoder_model.predict`` (for example a
            one-row slice of ``encoder_input_data``).

    Returns:
        str: The sampled characters joined together. The end-of-sequence
        newline character is included when the decoder emits it; otherwise
        decoding stops as soon as the text is longer than
        ``max_decoder_seq_length``.
    """
    # Encode the input. Unpack explicitly so this works whether predict()
    # hands back a list or a tuple; verbose=0 avoids one progress bar per call.
    enc_seq_outputs, h, c = encoder_model.predict(input_seq, verbose=0)

    # Length-1 target sequence holding the start-of-sequence character "\t".
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_token_index['\t']

    decoded_sentence = ''
    while True:
        # One decoding step: the previous token, the (fixed) encoder outputs
        # for attention, and the current LSTM states go in; the token
        # distribution and the updated states come out.
        output_tokens, h, c = decoder_model.predict(
            [target_seq, enc_seq_outputs, h, c], verbose=0)

        # Greedy sampling: pick the most probable character.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Stop on the end-of-sequence character "\n" or once the maximum
        # length has been exceeded.
        if sampled_char == '\n' or len(decoded_sentence) > max_decoder_seq_length:
            break

        # Feed the sampled character back in as the next decoder input.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

    return decoded_sentence


# Decode the first 100 training samples and print each next to its source sentence.
for seq_index in range(100):
    # Take one sequence from the training set and try to decode it.
    # Slicing (rather than indexing) keeps the leading batch axis.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', input_texts[seq_index])
    print('Decoded sentence:', decoded_sentence)

-
Input sentence: Hi.
Decoded sentence: 你好。

-
Input sentence: Hi.
Decoded sentence: 你好。

-
Input sentence: Run.
Decoded sentence: 你用跑的。

-
Input sentence: Wait!
Decoded sentence: 等等！

-
Input sentence: Hello!
Decoded sentence: 你好。

-
Input sentence: I try.
Decoded sentence: 讓我來。

-
Input sentence: I won!
Decoded sentence: 我贏了。

-
Input sentence: Oh no!
Decoded sentence: 不會吧。

-
Input sentence: Cheers!
Decoded sentence: 乾杯!

-
Input sentence: He ran.
Decoded sentence: 他跑了。

-
Input sentence: Hop in.
Decoded sentence: 跳進來。

-
Input sentence: I lost.
Decoded sentence: 我迷失了。

-
Input sentence: I quit.
Decoded sentence: 我退出。

-
Input sentence: I'm OK.
Decoded sentence: 我沒事。

-
Input sentence: Listen.
Decoded sentence: 聽著。

-
Input sentence: No way!
Decoded sentence: 沒門！

-
Input sentence: No way!
Decoded sentence: 沒門！

-
Input sentence: Really?
Decoded sentence: 你確定？

-
Input sentence: Try it.
Decoded sentence: 試試吧。

-
Input sentence: We try.
Decoded sentence: 我們來試試。

-
Input sentence: Why