# **EX01. RNN 實作**

## Step01：載入套件

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

## Step02：測試嵌入層

In [None]:
# Build a model that consists of a single Embedding layer.
seq_model = tf.keras.Sequential()

# Vocabulary size is 2000 and every token id is mapped to a 64-dimensional
# vector. The sequence length is not fixed by the layer; it comes from the
# input fed to it (10 ids per sample below).
seq_model.add(layers.Embedding(input_dim=2000, output_dim=64))

# Random token ids in [0, 2000): 30 samples, 10 ids per sample.
input_seq_array = np.random.randint(2000, size=(30, 10))

# Specify the optimizer and loss function (the model is not trained in this cell).
seq_model.compile('rmsprop', 'mse')

# Run the ids through the embedding; the result has shape (30, 10, 64).
output_seq_array = seq_model.predict(input_seq_array)
print(output_seq_array.shape)

# Show the embedded first sample. The original referenced the undefined
# name `output_array`, which raises NameError on a fresh kernel.
output_seq_array[0]

(30, 10, 64)


array([[ 4.34097759e-02, -8.97800922e-03, -3.11054289e-04,
        -1.24567375e-02,  4.70356606e-02,  1.82210095e-02,
        -4.60188463e-03, -3.90012488e-02,  3.33725475e-02,
         3.94629277e-02,  3.09496857e-02, -1.34369358e-02,
         4.69737314e-02,  3.21288146e-02,  2.39550509e-02,
         3.37040685e-02,  9.77888703e-05, -1.40783899e-02,
         4.04952839e-03,  3.08198221e-02,  4.30888794e-02,
        -6.97095320e-03,  2.81312317e-03,  3.37722786e-02,
         4.99996208e-02,  3.06658037e-02,  1.75390579e-02,
         1.33836307e-02, -1.53669231e-02, -3.61397974e-02,
        -6.74518198e-03, -2.60698795e-02, -1.49730332e-02,
        -4.36028242e-02,  1.85554735e-02, -2.28436589e-02,
        -1.22305267e-02, -4.55967449e-02,  2.88151167e-02,
         2.79277824e-02, -3.38440910e-02,  4.78392579e-02,
         4.71436493e-02,  3.24982442e-02,  1.46416165e-02,
        -4.43773158e-02, -5.69744036e-03,  5.65637276e-03,
         2.86322348e-02,  3.80065925e-02, -2.26779338e-0

## Step03：轉換真實資料

In [None]:
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Ten short sample sentences used as toy input.
docs = [
    'Well done!', 'Great effort', 'Good work', 'Excellent!', 'Good',
    'Weak', 'not good', 'poor work', 'Poor effort!', 'Too bad',
]

# Encoding parameters: word ids are drawn from a space of 60 values and every
# sentence is brought to a length of exactly 4 ids.
vocab_size, maxlen = 60, 4

# Despite its name, one_hot() returns a list of integer word ids per sentence
# (not one-hot vectors).
encoded_docs = [one_hot(sentence, vocab_size) for sentence in docs]

# Bring every sentence to the same length; padding='post' appends the fill
# value at the end of sentences that are too short.
padded_docs = pad_sequences(encoded_docs, padding='post', maxlen=maxlen)

# Model made of nothing but an Embedding layer (64-dimensional vectors).
seq_model = tf.keras.Sequential([
    layers.Embedding(vocab_size, 64, input_length=maxlen),
])
seq_model.compile(optimizer='rmsprop', loss='mse')

# Embed the padded sentences and show the resulting shape.
output_seq_array = seq_model.predict(padded_docs)
output_seq_array.shape

(10, 4, 64)

In [None]:
# First line: the integer word ids one_hot() produced for the first sentence.
# Second line: the shape of the padded input matrix.
print(encoded_docs[0], padded_docs.shape, sep='\n')

[43, 11]
(10, 4)


## Step04：加上完全連接層

In [None]:
# Sentiment of the 10 sentences in `docs`: positive (1) or negative (0).
# The first five sentences ('Well done!' ... 'Good') are positive and the last
# five ('Weak' ... 'Too bad') are negative. The previous labels marked
# 'Great effort' / 'Excellent!' as negative and 'Weak' / 'Too bad' as positive,
# which contradicted the text the model is supposed to learn from.
labels = np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])

# Re-encode the sentences so this cell does not depend on Step03's results.
vocab_size = 60
maxlen = 4
encoded_docs = [one_hot(d, vocab_size) for d in docs]
padded_docs = pad_sequences(encoded_docs, maxlen=maxlen, padding='post')

# Embedding (8-dimensional vectors) -> Flatten -> Dense.
seq_model = tf.keras.Sequential()
seq_model.add(layers.Embedding(vocab_size, 8, input_length=maxlen))
seq_model.add(layers.Flatten())

# Fully connected output layer; sigmoid yields one value in (0, 1) per sentence.
seq_model.add(layers.Dense(1, activation='sigmoid'))

# Specify the optimizer, loss function and reported metric.
seq_model.compile(optimizer='adam', loss='binary_crossentropy',
                  metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
seq_model.summary()

# Train the model.
seq_model.fit(padded_docs, labels, epochs=50, verbose=0)

# Evaluate on the same 10 sentences that were used for training.
loss, accuracy = seq_model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Model: "sequential_22"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_22 (Embedding)     (None, 4, 8)              480       
_________________________________________________________________
flatten_12 (Flatten)         (None, 32)                0         
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 33        
Total params: 513
Trainable params: 513
Non-trainable params: 0
_________________________________________________________________
None
Accuracy: 89.999998


In [None]:
# Display the current seq_model's predictions for the padded sentences
# (one value per sentence).
seq_model.predict(padded_docs)

array([[0.5163412 ],
       [0.4567597 ],
       [0.50977   ],
       [0.5032323 ],
       [0.5239793 ],
       [0.5372017 ],
       [0.48231846],
       [0.4871411 ],
       [0.45932922],
       [0.5369796 ]], dtype=float32)

## Step05：加上 RNN 神經層

In [None]:
# Embedding (8-dimensional vectors) -> SimpleRNN -> Dense.
seq_model = tf.keras.Sequential()
seq_model.add(layers.Embedding(vocab_size, 8, input_length=maxlen))

# Recurrent layer with 64 units (the earlier comment said 128, which did not
# match the code).
seq_model.add(layers.SimpleRNN(64))

# Fully connected output layer; sigmoid yields one value in (0, 1) per sentence.
seq_model.add(layers.Dense(1, activation='sigmoid'))

# Specify the optimizer, loss function and reported metric.
seq_model.compile(optimizer='adam', loss='binary_crossentropy',
                  metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
seq_model.summary()

# Train the model.
seq_model.fit(padded_docs, labels, epochs=50, verbose=0)

# Evaluate on the same sentences that were used for training.
loss, accuracy = seq_model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Model: "sequential_23"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_23 (Embedding)     (None, 4, 8)              480       
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 64)                4672      
_________________________________________________________________
dense_13 (Dense)             (None, 1)                 65        
Total params: 5,217
Trainable params: 5,217
Non-trainable params: 0
_________________________________________________________________
None
Accuracy: 100.000000


In [None]:
# Display the current seq_model's predictions for the padded sentences
# (one value per sentence).
seq_model.predict(padded_docs)

array([[8.4102613e-01],
       [9.7427587e-04],
       [9.9647671e-01],
       [1.0424721e-02],
       [9.9402821e-01],
       [9.9698973e-01],
       [1.7173968e-01],
       [2.0079289e-03],
       [6.0889189e-04],
       [9.9699819e-01]], dtype=float32)

# **EX02. 影評資料集情緒分析**

In [None]:
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.layers import Embedding, Dense, LSTM
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
batch_size = 256  # number of samples per training batch
embedding_output_dims = 20  # size of each embedding vector
max_sequence_length = 400 # length every review is padded to (passed as maxlen to pad_sequences)
num_distinct_words = 6000 # vocabulary size (num_words for load_data, input size of the Embedding layer)
number_of_epochs = 10 # number of training epochs
validation_split = 0.30 # fraction of the training data held out for validation
verbosity_mode = 1  # verbosity of the training log

In [None]:
# Load the IMDB review dataset; the reviews arrive already encoded as
# sequences of word indices.
(x_train_imdb, y_train_imdb), (x_test_imdb, y_test_imdb) = imdb.load_data(
    num_words=num_distinct_words
)
print(x_train_imdb.shape, x_test_imdb.shape, sep='\n')

# Bring every review to max_sequence_length, filling with 0 where needed.
padded_imdb_inputs = pad_sequences(
    x_train_imdb, value=0.0, maxlen=max_sequence_length
)
padded_imdb_inputs_test = pad_sequences(
    x_test_imdb, value=0.0, maxlen=max_sequence_length
)

# Embedding -> LSTM(10) -> single sigmoid output.
imdb_model = Sequential([
    Embedding(
        num_distinct_words,
        embedding_output_dims,
        input_length=max_sequence_length,
    ),
    LSTM(10),
    Dense(1, activation='sigmoid'),
])

# Specify the optimizer, loss function and reported metric.
imdb_model.compile(
    optimizer=Adam(),
    loss="binary_crossentropy",
    metrics=['accuracy'],
)

# Print the layer / parameter overview.
imdb_model.summary()

(25000,)
(25000,)
Model: "sequential_25"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_25 (Embedding)     (None, 400, 20)           120000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 10)                1240      
_________________________________________________________________
dense_15 (Dense)             (None, 1)                 11        
Total params: 121,251
Trainable params: 121,251
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Display the raw (unpadded) test reviews: an array of variable-length lists
# of word indices.
x_test_imdb

array([list([1, 591, 202, 14, 31, 6, 717, 10, 10, 2, 2, 5, 4, 360, 7, 4, 177, 5760, 394, 354, 4, 123, 9, 1035, 1035, 1035, 10, 10, 13, 92, 124, 89, 488, 2, 100, 28, 1668, 14, 31, 23, 27, 2, 29, 220, 468, 8, 124, 14, 286, 170, 8, 157, 46, 5, 27, 239, 16, 179, 2, 38, 32, 25, 2, 451, 202, 14, 6, 717]),
       list([1, 14, 22, 3443, 6, 176, 7, 5063, 88, 12, 2679, 23, 1310, 5, 109, 943, 4, 114, 9, 55, 606, 5, 111, 7, 4, 139, 193, 273, 23, 4, 172, 270, 11, 2, 2, 4, 2, 2801, 109, 1603, 21, 4, 22, 3861, 8, 6, 1193, 1330, 10, 10, 4, 105, 987, 35, 841, 2, 19, 861, 1074, 5, 1987, 2, 45, 55, 221, 15, 670, 5304, 526, 14, 1069, 4, 405, 5, 2438, 7, 27, 85, 108, 131, 4, 5045, 5304, 3884, 405, 9, 3523, 133, 5, 50, 13, 104, 51, 66, 166, 14, 22, 157, 9, 4, 530, 239, 34, 2, 2801, 45, 407, 31, 7, 41, 3778, 105, 21, 59, 299, 12, 38, 950, 5, 4521, 15, 45, 629, 488, 2733, 127, 6, 52, 292, 17, 4, 2, 185, 132, 1988, 5304, 1799, 488, 2693, 47, 6, 392, 173, 4, 2, 4378, 270, 2352, 4, 1500, 7, 4, 65, 55, 73, 11, 34

In [None]:
# Display the test labels (0 / 1 per review).
y_test_imdb

array([0, 1, 1, ..., 0, 0, 0])

In [None]:
# Train the model; validation_split holds out part of the training data for
# validation during fitting.
history = imdb_model.fit(padded_imdb_inputs, y_train_imdb, batch_size=batch_size,
                         epochs=number_of_epochs, verbose=verbosity_mode,
                         validation_split=validation_split)

# Evaluate on the padded test set. The model was compiled with
# metrics=['accuracy'], so the result is [loss, accuracy].
imdb_test_results = imdb_model.evaluate(padded_imdb_inputs_test, y_test_imdb, verbose=False)

# The original f-string read the undefined name `test_results`, which raises
# NameError on a fresh kernel.
print(f'Loss: {imdb_test_results[0]}, Accuracy: {100*imdb_test_results[1]}%')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Loss: 0.3737878203392029, Accuracy: 85.11199951171875%


In [None]:
# Save the trained model to the file 'LSTM_IMDB.h5' (relative path).
imdb_model.save('LSTM_IMDB.h5')

In [None]:
# Fetch the dictionary that maps each word to its index.
imdb_dict = imdb.get_word_index()

# Peek at the first 20 words (iterating a dict yields its keys).
list(imdb_dict)[:20]

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


['fawn',
 'tsukino',
 'nunnery',
 'sonja',
 'vani',
 'woods',
 'spiders',
 'hanging',
 'woody',
 'trawling',
 "hold's",
 'comically',
 'localized',
 'disobeying',
 "'royale",
 "harpo's",
 'canet',
 'aileen',
 'acurately',
 "diplomat's"]

In [None]:
# Invert the word -> index dictionary into an index -> word lookup table.
imdb_dict_reversed = {index: token for token, index in imdb_dict.items()}

In [None]:
# Decode the first 8 padded test reviews back into text.
#
# With its default arguments imdb.load_data() reserves 0 for padding, 1 for
# the start-of-sequence marker and 2 for out-of-vocabulary words, and shifts
# every real word index up by 3. imdb_dict_reversed holds the unshifted
# indices from get_word_index(), so the offset has to be removed before the
# lookup; without it every token decodes to an unrelated word.
IMDB_INDEX_FROM = 3
IMDB_SPECIAL_TOKENS = {1: '[START]', 2: '[OOV]'}

imdb_text = []
for line in padded_imdb_inputs_test[:8]:
    words = []
    for word in line:
        word = int(word)
        if word == 0:
            # Padding carries no text.
            continue
        if word in IMDB_SPECIAL_TOKENS:
            words.append(IMDB_SPECIAL_TOKENS[word])
        else:
            # Fall back to a placeholder for ids with no dictionary entry.
            words.append(imdb_dict_reversed.get(word - IMDB_INDEX_FROM, '[UNK]'))
    # Same layout as before: every token is followed by a single space.
    imdb_text.append(''.join(token + ' ' for token in words))

print('\n\n\n'.join(imdb_text))

the wonder own as by is sequence i i and and to of hollywood br of down shouting getting boring of ever it sadly sadly sadly i i was then does don't close and after one carry as by are be and all family turn in does as three part in another some to be probably with world and her an have and beginning own as is sequence 


the as you world's is quite br mankind most that quest are chase to being quickly of little it time hell to plot br of something long put are of every place this and and of and storytelling being nasty not of you warren in is failed club i i of films pay so sequences and film okay uses to received and if time done for room sugar viewer as cartoon of gives to forgettable br be because many these of reflection sugar contained gives it wreck scene to more was two when had find as you another it of themselves probably who and storytelling if itself by br about 1950's films not would effects that her box to miike for if hero close seek end is very together movie of and got

In [None]:
# Look up the word stored under index 588 in the reversed (index -> word) table.
imdb_dict_reversed[588]

'please'

In [None]:
# Look up the index stored for the word 'please' in the word -> index table.
imdb_dict['please']

588

# **EX03. 情緒分析**

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Activation, Dense, Embedding, LSTM, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import load_model
from sklearn.model_selection import train_test_split