In [None]:
# Shell out to `nvidia-smi` to print the NVIDIA driver / GPU status table for this runtime.
!nvidia-smi

Wed Nov  4 12:04:19 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.32.00    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    24W / 300W |      0MiB / 16130MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Mount Google Drive at /content/drive so the course files can be read from it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


* @file NLP進階 / RNN_IMDB
  * @brief RNN_IMDB 模型實作 

  * 此份程式碼是以教學為目的，附有完整的架構解說。

  * @author 人工智慧科技基金會 AI 工程師 - 康文瑋
  * Email: run963741@aif.tw
  * Resume: https://www.cakeresume.com/run963741

  * 最後更新日期: 2020/11/13

# Recurrent Neural Network

遞歸神經網路擅長處理序列任務，接下來我們要實作的類型是 `many to one`，也就是輸入一串長度為 `n` 的序列給模型，預測一個數值。

* 產線數據: 輸入一筆時間長度為 `3` 的資料，預測該筆資料是正常還是異常。
* 文本分類: 輸入一段句子，預測該筆句子是正面還是負面。
* 腦波: 輸入一段腦波，判斷腦波是正面還是負面情緒。

<figure>
<center>
<img src='https://drive.google.com/uc?export=view&id=1vilEEALDFNf58wHBUwiE5NY1y32mVcqy' width="800"/>
<figcaption>Many to one</figcaption></center>
</figure>

# 載入套件

In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds
import pandas as pd
import numpy as np
import os
import re

from collections import Counter
from sklearn.metrics import classification_report, confusion_matrix

# Make the shared-drive course folder the working directory; the dataset, vocabulary
# and checkpoint paths used later in the notebook are all relative to it.
# NOTE(review): hard-coded absolute path -- only valid once Drive is mounted at /content/drive.
os.chdir('/content/drive/Shared drives/類技術班教材/標準版/NLP進階/RNN 遞歸神經網路')

# 載入資料集

In [None]:
# CSV file holding the reviews and their sentiment labels (relative to the working directory).
imdb_path = 'Data/IMDB Dataset.csv'
# File-name prefix handed to the subword tokenizer's load/save calls.
vocab_file = 'vocabulary'

In [None]:
# Load the raw reviews (columns: review, sentiment).
imdb = pd.read_csv(imdb_path)

# Shuffle the rows once; the train/test split further down is purely positional.
# A fixed seed keeps the shuffle (and therefore the split) identical after a kernel
# restart, so the vocabulary cached on disk always belongs to the same training rows
# and the reported metrics can be reproduced.
shuffle_seed = 42
imdb = imdb.sample(frac=1, random_state=shuffle_seed).reset_index(drop=True)

print(imdb.shape)
imdb.head()

(50000, 2)


Unnamed: 0,review,sentiment
0,"Director Douglas Sirk scores again with this, ...",positive
1,"Gods, I haven't watched a movie this awful in ...",negative
2,Just how exactly do gay Asians manage in a cul...,positive
3,I can't come up with appropriate enough words ...,negative
4,This poor remake of the 1963 classic starts re...,negative


# 資料前處理

In [None]:
# 將 html tag 拿掉
cleanr = re.compile('<.*?>')
imdb['review'] = imdb['review'].map(lambda x: re.sub(cleanr, ' ', x).lower())

In [None]:
# Preview the first rows to check the cleaned, lower-cased review text.
imdb.head()

Unnamed: 0,review,sentiment
0,"director douglas sirk scores again with this, ...",positive
1,"gods, i haven't watched a movie this awful in ...",negative
2,just how exactly do gay asians manage in a cul...,positive
3,i can't come up with appropriate enough words ...,negative
4,this poor remake of the 1963 classic starts re...,negative


## 標籤轉換

分類模型訓練時，標籤必須是 `0, 1, 2,...` 的整數。

In [None]:
# Text label -> integer class id.
label_dict = {
    'positive': 0,
    'negative': 1,
}

# Swap the text labels for their integer ids, then display how many rows each class has.
sentiment_ids = imdb['sentiment'].map(label_dict)
imdb['sentiment'] = sentiment_ids
Counter(imdb['sentiment'])

Counter({0: 25000, 1: 25000})

## 訓練集 (Train) 與測試集 (Test) 切割


In [None]:
# Train : test = 35000 : 15000 (positional split of the already-shuffled frame).
training_size = 35000

# The first `training_size` rows are for training, everything after them for testing.
train_dataset = imdb.head(training_size)
test_dataset = imdb.iloc[slice(training_size, None)]

In [None]:
# Print the class counts of each split (training first, then test).
for split_frame in (train_dataset, test_dataset):
    print(Counter(split_frame['sentiment']))

Counter({1: 17510, 0: 17490})
Counter({0: 7510, 1: 7490})


### 斷詞 (Tokenization) 以及儲存字典 (Vocabulary)

在近來許多自然語言處理模型中，英文的斷詞方式都會使用 `wordpiece (或稱 subword)`，也就是將一個詞分成幾個部分，例如:

* `rewarding` $\rightarrow$ `re`, `ward`, `ing`
* `comfortable` $\rightarrow$ `com`, `fort`, `able`

In [None]:
%%time
try: 
    tokenizer = tfds.deprecated.text.SubwordTextEncoder.load_from_file(vocab_file) 
    print('Load vocabulary: %s' % vocab_file)
except: 
    print('Build vocabulary: %s' % vocab_file)
    tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus((w for w in train_dataset['review']),target_vocab_size = 2**12)
    tokenizer.save_to_file(vocab_file)

Load vocabulary: vocabulary
CPU times: user 20.7 ms, sys: 1.68 ms, total: 22.3 ms
Wall time: 532 ms


In [None]:
# Report how many ids the loaded/built subword tokenizer exposes.
print('Vocabulary size: ', tokenizer.vocab_size)

Vocabulary size:  4066


### 範例

拿其中一個句子來當範例

In [None]:
# Take the row labelled 30 as a worked example: its review text and integer label.
tmp_index = 30
tmp_sent = imdb.loc[tmp_index, 'review']
tmp_label = imdb.loc[tmp_index, 'sentiment']

# Encode the sentence into subword ids, then decode each id on its own so the
# individual subword pieces can be inspected.
token_ids = tokenizer.encode(tmp_sent)
token_words = []
for token_id in token_ids:
    token_words.append(tokenizer.decode([token_id]))

In [None]:
# Show the example sentence, its ids and its subword pieces, each followed by a
# 16-dash divider, then the label.
example_sections = [
    ('Input sentence: \n', tmp_sent),
    ('Token ids: \n', token_ids),
    ('Token words: \n', token_words),
]
for heading, value in example_sections:
    print(heading, value)
    print('-'*16)
print('Label: ', tmp_label)

Input sentence: 
 the orders fatal flaw-besides an asinine plot-is that the character's simply don't resonate or even react.  two examples: a priest, walking through a graveyard late at night, is suddenly attacked by ghostly spirits. after fighting them off, he calmly resumes his walk when his buddy come up. "anything wrong?" his buddy asks, having seen the attack. "just some demonic spirits-nothing i couldn't handle." no reaction, no surprise, just like he'd changed a tire. his buddy is equally unconcerned... must be standard priest training... ["and then you put the wafer into their mouths. any questions? ok, moving on, demon spirit attacks..."]  example two: at one point the priests need an answer to a question, and only a demon (or something, who cared by now) could provide it. how? why, you have to ask a dying man! so the demon has some random person hung in front of the two priests so they can ask their question to the thrashing, gasping man. "hey, don't kill him!" or maybe "that

## Tensorflow data pipeline

`tf.data` 是 `tensorflow` 專用的訓練格式，能夠加速訓練過程。

In [None]:
def _frame_to_tfdata(frame):
    """Build a tf.data.Dataset of (review, sentiment) pairs from a DataFrame."""
    reviews = frame['review'].values
    labels = frame['sentiment'].values
    return tf.data.Dataset.from_tensor_slices((reviews, labels))


train_tfdata = _frame_to_tfdata(train_dataset)
test_tfdata = _frame_to_tfdata(test_dataset)

### 資料前處理

#### tf.py_function

若在 `pipeline` 中含有非 `tensorflow` 的操作 (例如下方的 `tokenizer.encode`)，就必須使用 `tf.py_function` 將該 Python 函數包裝成 `tensorflow` 的運算，並指定輸出的資料型別，才能放進 `tf.data` 的 `.map` 中使用。

In [None]:
def encode(sent, label):
  """Turn one (sentence, label) pair into (subword ids, int32 label).

  `sent` must support `.numpy()`; the value it returns is what gets passed to the
  module-level `tokenizer`. `label` is cast to `tf.int32`.
  """
  token_ids = tokenizer.encode(sent.numpy())
  int_label = tf.cast(label, dtype=tf.int32)
  return token_ids, int_label

def tf_encode(sent, label):
  """Run `encode` through `tf.py_function`, declaring both outputs as int32."""
  output_types = [tf.int32, tf.int32]
  return tf.py_function(func=encode, inp=[sent, label], Tout=output_types)

https://www.tensorflow.org/datasets/performances

* `.map`: 對每一筆資料套用指定的函數，常用來做資料前處理。
* `.cache`: 預先將資料放進記憶體加速
* `.shuffle`: 指定 `buffer_size`，先將 `buffer_size` 筆資料放進緩衝區，之後每次從緩衝區中隨機取出資料，藉此打亂資料的順序。
* `padded_batch`: 指定 `batch_size`，還能夠指定 `padded_shapes`，將同一個 batch 內的句子都補 0 至該 batch 中最長句子的長度。

In [None]:
buffer_size = 320  # size of the shuffle buffer
batch_size = 64    # number of examples per training batch

# Per-example shapes to pad to: token ids are 1-D of unknown length, the label is a scalar.
padded_shapes = (tf.TensorShape([None]), tf.TensorShape([]))

# Training stream: tokenize -> cache -> shuffle -> padded batches of `batch_size` -> repeat.
train_generator = (
    train_tfdata
    .map(tf_encode)
    .cache()
    .shuffle(buffer_size)
    .padded_batch(batch_size, padded_shapes=padded_shapes)
    .repeat()
)

# Evaluation stream: tokenize and emit one example per batch (no shuffling, no repeat).
test_generator = (
    test_tfdata
    .map(tf_encode)
    .padded_batch(1, padded_shapes=padded_shapes)
)

In [None]:
# Pull a single batch out of the training pipeline for inspection.
x = iter(train_generator)
tmp_inp = next(x)

In [None]:
# Display the sampled batch: a (token ids, labels) pair of tensors.
tmp_inp

(<tf.Tensor: shape=(64, 1234), dtype=int32, numpy=
 array([[ 877,   21,   62, ...,    0,    0,    0],
        [  10, 3859, 3866, ...,    0,    0,    0],
        [  12,    9,    1, ...,    0,    0,    0],
        ...,
        [ 301, 2552, 3150, ...,    0,    0,    0],
        [  11,  815,  227, ...,    0,    0,    0],
        [  12,    9, 2509, ...,    0,    0,    0]], dtype=int32)>,
 <tf.Tensor: shape=(64,), dtype=int32, numpy=
 array([1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1,
        1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0,
        1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1],
       dtype=int32)>)

## 建立模型

tensorflow 提供三種建立模型的方法：

1. Sequential API
2. Functional API
3. Model Subclassing

以下為 Sequential API 的寫法。

In [None]:
embedding_dim = 128  # size of each subword embedding vector

# Many-to-one classifier: subword ids -> embeddings -> BiLSTM summary -> dense head.
model = tf.keras.Sequential([
    # mask_zero=True: training batches come from `padded_batch`, which fills shorter
    # sequences with 0. Without a mask the LSTM keeps consuming those padding steps
    # during training, while evaluation (batch size 1) contains no padding at all.
    # Masking makes the recurrent layer skip the padded positions.
    # (assumes id 0 is reserved for padding by the tokenizer -- confirm)
    tf.keras.layers.Embedding(tokenizer.vocab_size, embedding_dim, mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256)),
    tf.keras.layers.Dropout(0.7),
    tf.keras.layers.Dense(64, activation='relu'),
    # Two output units with softmax: one probability per sentiment class.
    tf.keras.layers.Dense(2, activation='softmax')
])

# Labels are integer class ids (not one-hot), hence the sparse cross-entropy loss.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

## Callbacks

In [None]:
# Location the checkpointed model is written to.
model_path = './save_model/checkpoints_imdb_model.keras'

# Checkpoint callback: writes the model to `model_path` whenever the monitored
# validation loss improves.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=model_path,
    monitor='val_loss',    # metric that decides whether to save
    mode='min',            # a lower value of the monitored metric counts as better
    save_best_only=True,   # keep only the best model seen so far
    verbose=1,             # log a line each time the callback checks / saves
)

## 訓練模型

In [None]:
epochs = 10

# The training pipeline repeats indefinitely, so the length of an epoch is given
# explicitly: the number of full batches in the training split.
steps_per_epoch = training_size // batch_size

history = model.fit(
    train_generator,
    epochs=epochs,
    steps_per_epoch=steps_per_epoch,
    validation_data=test_generator,
    callbacks=[checkpoint],
)

Epoch 1/10
Epoch 00001: val_loss improved from inf to 0.40099, saving model to ./save_model/checkpoints_imdb_model.keras
Epoch 2/10
Epoch 00002: val_loss improved from 0.40099 to 0.33554, saving model to ./save_model/checkpoints_imdb_model.keras
Epoch 3/10
Epoch 00003: val_loss improved from 0.33554 to 0.29809, saving model to ./save_model/checkpoints_imdb_model.keras
Epoch 4/10
Epoch 00004: val_loss improved from 0.29809 to 0.29617, saving model to ./save_model/checkpoints_imdb_model.keras
Epoch 5/10
Epoch 00005: val_loss improved from 0.29617 to 0.29304, saving model to ./save_model/checkpoints_imdb_model.keras
Epoch 6/10
Epoch 00006: val_loss did not improve from 0.29304
Epoch 7/10
Epoch 00007: val_loss did not improve from 0.29304
Epoch 8/10
Epoch 00008: val_loss did not improve from 0.29304
Epoch 9/10
Epoch 00009: val_loss did not improve from 0.29304
Epoch 10/10
Epoch 00010: val_loss did not improve from 0.29304


## 評估模型

In [None]:
# Integer class id -> text label.
inv_label_dict = {
    0: 'positive',
    1: 'negative',
}

# Ground-truth class ids, in the same row order the test pipeline iterates over.
test_true_classes = test_dataset['sentiment'].values

# Per-class probabilities from the in-memory model; the predicted class is the
# index of the largest probability.
test_pred = model.predict(test_generator)
test_pred_classes = test_pred.argmax(axis=-1)

In [None]:
# Per-class precision / recall / F1 on the test split (rows are keyed by integer class id).
report = classification_report(
    test_true_classes,
    test_pred_classes,
)
print(report)

              precision    recall  f1-score   support

           0       0.87      0.89      0.88      7510
           1       0.89      0.87      0.88      7490

    accuracy                           0.88     15000
   macro avg       0.88      0.88      0.88     15000
weighted avg       0.88      0.88      0.88     15000



In [None]:
# Rows/columns of the confusion matrix follow the class ids in ascending order.
# The names are derived from `inv_label_dict` (0 -> positive, 1 -> negative) so the
# table is labelled consistently with the label encoding; the previous hard-coded
# names listed "negative" first and therefore swapped the two classes.
class_ids = sorted(inv_label_dict)
class_names = [inv_label_dict[class_id] for class_id in class_ids]

# Pin the label order explicitly so rows/columns always line up with `class_names`.
cnfm = confusion_matrix(y_true=test_true_classes, y_pred=test_pred_classes, labels=class_ids)
pd.DataFrame(cnfm,
             columns=['Pred_%s' % name for name in class_names],
             index=['Actual_%s' % name for name in class_names])

Unnamed: 0,Pred_negative,Pred_positive
Actual_negative,6709,801
Actual_positive,994,6496
