In [None]:
!nvidia-smi

Wed Nov  4 08:19:06 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.32.00    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Mount Google Drive at /content/drive (Colab only); a later cell changes into a folder under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


* @file NLP進階 / RNN_postagging
  * @brief RNN_postagging 模型實作 

  * 此份程式碼是以教學為目的，附有完整的架構解說。

  * @author 人工智慧科技基金會 AI 工程師 - 康文瑋
  * Email: run963741@aif.tw
  * Resume: https://www.cakeresume.com/run963741

  * 最後更新日期: 2020/11/13

# Recurrent Neural Network

遞歸神經網路擅長處理序列任務，接下來我們要實作的類型是 `many to many`，也就是輸入一串長度為 `n` 的序列給模型，模型會輸出一串長度同樣為 `n` 的預測序列。

* 看圖說故事 (Image captioning，屬於 `one to many`): 輸入一張圖片，輸出該張圖片的描述。
* 詞性標註 (Part-of-Speech tagging，屬於 `many to many`): 輸入一段句子，輸出每個詞的詞性。

<figure>
<center>
<img src='https://drive.google.com/uc?export=view&id=1WREpRnryegmSURXPoCeJo5-RIRqkiWlv' width="800"/>
<figcaption>Many\One to many</figcaption></center>
</figure>

# 載入函數

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import os

from tqdm import tqdm
from sklearn.model_selection import train_test_split
from nltk.corpus import treebank, brown, conll2000
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.metrics import classification_report, confusion_matrix

%matplotlib inline

# Work inside the course folder on the mounted Drive; the relative checkpoint path used later resolves against it.
os.chdir('/content/drive/Shared drives/類技術班教材/標準版/NLP進階/RNN 遞歸神經網路')

# 載入資料集

我們使用的是 `NLTK` 套件自帶的資料集，`NLTK` 是外國很知名的自然語言處理套件，支援諸多的自然語言處理流程、任務，例如斷詞 (Tokenization)、詞性標註 (Part of speech tagging)等等。

我們要使用的是 `NLTK` 中的詞性標註資料集，分別是 `treebank`, `brown`, `conll2000`。

In [None]:
# Download the three tagged corpora used below, plus two tagset resources
# (presumably needed for tagset='universal' and for nltk.help — confirm against the NLTK docs).
import nltk
nltk.download('treebank')
nltk.download('brown')
nltk.download('conll2000')
nltk.download('universal_tagset')
nltk.download('tagsets')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package tagsets to /root/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


True

In [None]:
# Load every corpus with tagset='universal' so all three share one tag inventory,
# then concatenate the three collections of tagged sentences into a single one.
tree_bank = treebank.tagged_sents(tagset='universal')
brown_corpus = brown.tagged_sents(tagset='universal')
conll_corpus = conll2000.tagged_sents(tagset='universal')
tagged_sents = tree_bank + brown_corpus + conll_corpus

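使用 `nltk.help.upenn_tagset()` 可以列出 Penn Treebank 標籤集中每個詞性標籤的解釋。請注意：上面載入資料時指定了 `tagset='universal'`，因此資料中實際使用的是 universal 標籤集 (例如 `NOUN`、`VERB`、`ADJ`)，與這裡列出的 Penn Treebank 標籤 (例如 `CC`、`CD`、`DT`) 並不相同。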

In [None]:
# NOTE: this prints the Penn Treebank tag descriptions (CC, CD, DT, ...), whereas the sentences
# were loaded with tagset='universal' (NOUN, VERB, ADJ, ...), so the two tag inventories differ.
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

## 資料格式

`NLTK` 的資料格式為 `(word, tagging)`。

In [None]:
tagged_sents[0]

[('Pierre', 'NOUN'),
 ('Vinken', 'NOUN'),
 (',', '.'),
 ('61', 'NUM'),
 ('years', 'NOUN'),
 ('old', 'ADJ'),
 (',', '.'),
 ('will', 'VERB'),
 ('join', 'VERB'),
 ('the', 'DET'),
 ('board', 'NOUN'),
 ('as', 'ADP'),
 ('a', 'DET'),
 ('nonexecutive', 'ADJ'),
 ('director', 'NOUN'),
 ('Nov.', 'NOUN'),
 ('29', 'NUM'),
 ('.', '.')]

# 資料前處理

這邊將每個句子中的 `word` 和 `tagging` 個別挑出來。

In [None]:
# Split every tagged sentence into two parallel lists: its tokens and its tags.
words, tags = [], []
for sentence in tqdm(tagged_sents):
  words.append([token for token, _pos in sentence])
  tags.append([pos for _token, pos in sentence])

100%|██████████| 72202/72202 [00:05<00:00, 13024.90it/s]


In [None]:
# Show one sentence: its tokens, a divider line, then the matching tags.
print_index = 0
divider = '-' * 16
print(words[print_index], divider, tags[print_index], sep='\n')

['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.']
----------------
['NOUN', 'NOUN', '.', 'NUM', 'NOUN', 'ADJ', '.', 'VERB', 'VERB', 'DET', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', 'NOUN', 'NUM', '.']


### 建立字典 (vocabulary)

所有 NLP 模型都需要建立字典，在字典中會紀錄所有不重複的詞以及詞的 `index`，例如：

$$
\begin{aligned}
自然 : 0 \\
語言 : 1 \\
處理 : 2 
\end{aligned}
$$

記錄這些詞的目的是為了跟 `word embedding` 做匹配，在訓練模型時，會先將句子中的每個詞透過字典轉換為 `index`，然後再轉換成 `word embedding`，接著輸入模型進行訓練。

In [None]:
def set_vocab(words):
  """Build the two lookup tables (item -> index, index -> item) for a corpus.

  Args:
    words: iterable of sequences, e.g. a list of tokenised sentences or a
      list of tag sequences.

  Returns:
    (word_to_index, index_to_word): two dicts that are inverses of each other.
    Indices are plain Python ints starting at 1, so that 0 stays free for padding.

  The distinct items are sorted before numbering. Iterating a bare ``set`` of
  strings yields a different order on every interpreter start, which would
  change every index between kernel restarts and make saved checkpoints
  unusable with a freshly built vocabulary.
  """
  vocabulary = sorted({item for sequence in words for item in sequence})
  word_to_index = {item: index for index, item in enumerate(vocabulary, start=1)}
  index_to_word = {index: item for item, index in word_to_index.items()}

  return word_to_index, index_to_word

In [None]:
# Build separate lookup tables for the tokens and for the tags (indices start at 1 in both).
word_to_index, index_to_word = set_vocab(words)
tag_to_index, index_to_tag = set_vocab(tags)

In [None]:
# Indices run from 1 to len(vocabulary) and 0 is left for padding, so the number of
# distinct ids is len(vocabulary) + 1. Counting the entries does not depend on the
# order in which the keys were inserted into the dict.
word_vocab_size = len(index_to_word) + 1
tag_vocab_size = len(index_to_tag) + 1

In [None]:
# Report both sizes; each one counts the extra index 0 on top of the real entries.
for caption, size in [('Word vocabulary size: ', word_vocab_size),
                      ('Tag vocabulary size: ', tag_vocab_size)]:
  print(caption, size)

Word vocabulary size:  67068
Tag vocabulary size:  13


在字典中，會以 `dictionary` 的資料格式來儲存每個詞。

In [None]:
tag_to_index

{'.': 1,
 'ADJ': 3,
 'ADP': 11,
 'ADV': 5,
 'CONJ': 12,
 'DET': 2,
 'NOUN': 7,
 'NUM': 6,
 'PRON': 8,
 'PRT': 4,
 'VERB': 9,
 'X': 10}

## 訓練集 (Train) 與測試集 (Test) 切割

因為後面需要將資料轉換為 `tf.data` 格式，所以這邊切訓練集以及測試集時需要使用 `' '` 將 `list` 轉換為字串，才能夠輸入給 `.from_tensor_slices`。

In [None]:
# Every sentence (and its tag sequence) is joined into one space-separated string so that
# tf.data.Dataset.from_tensor_slices can store it as a scalar string tensor.
# A fixed random_state keeps the train/test split identical across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    [' '.join(sentence_words) for sentence_words in words],
    [' '.join(sentence_tags) for sentence_tags in tags],
    test_size=0.2,
    random_state=42)

In [None]:
# Report how many sentences ended up in each split.
for template, split in (('Training data size: %d', X_train),
                        ('Testing data size: %d', X_test)):
  print(template % len(split))

Training data size: 57761
Testing data size: 14441


### Tensorflow data pipeline

`tf.data` 是 `tensorflow` 專用的訓練格式，能夠加速訓練過程。

In [None]:
# from_tensor_slices receives a (data, labels) tuple; each element is one (sentence, tags) pair.
train_tfdata, test_tfdata = (
    tf.data.Dataset.from_tensor_slices(split)
    for split in ((X_train, y_train), (X_test, y_test))
)

In [None]:
# A tf.data dataset can be consumed like a generator: iter() gives an iterator, next() one element.
x = iter(train_tfdata)
tmp_inp = next(x)

In [None]:
# Everything inside a tf.data dataset is stored as tf.Tensor objects.
tmp_inp

(<tf.Tensor: shape=(), dtype=string, numpy=b'As a groundwork for the proposal I give some attention to the first task enumerated above , the clarification of goal .'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'ADP DET NOUN ADP DET NOUN PRON VERB DET NOUN ADP DET ADJ NOUN VERB ADV . DET NOUN ADP NOUN .'>)

### 資料前處理

#### tf.py_function

若在 `pipeline` 中含有不屬於 `tensorflow` 的操作 (例如一般的 Python 字典查詢或字串處理)，就必須使用 `tf.py_function` 將該 Python 函數包裝起來，讓它的輸入與輸出都以 `tf.Tensor` 的形式傳遞，才能放進 `tf.data` 的 `.map` 中使用。

In [None]:
# Split the sentence and the tag string on ' ' and convert both to integer ids.
def encode(word, tag):
  """Map one sentence and its tag string to two lists of vocabulary indices.

  Both arguments must expose ``.numpy()`` returning bytes. The decoded text is
  split on single spaces and each piece is looked up in ``word_to_index`` /
  ``tag_to_index``; a piece missing from its table raises ``KeyError``.
  """
  tokens = word.numpy().decode().split(' ')
  word_ids = [word_to_index[token] for token in tokens]
  labels = tag.numpy().decode().split(' ')
  tag_ids = [tag_to_index[label] for label in labels]
  return word_ids, tag_ids

# Wrap encode in tf.py_function so that it can be used inside Dataset.map.
def tf_encode(word, tag):
  """Run ``encode`` on one (sentence, tags) element through ``tf.py_function``.

  Returns the two outputs of ``tf.py_function``, both declared as ``tf.int32``.
  """
  output_dtypes = [tf.int32, tf.int32]
  return tf.py_function(func=encode, inp=[word, tag], Tout=output_dtypes)

https://www.tensorflow.org/datasets/performances

* `.map`: 常常使用函數來資料前處理
* `.cache`: 預先將資料放進記憶體加速
* `.shuffle`: 指定 `buffer_size`，先將 `buffer_size` 筆資料放進記憶體中的緩衝區，再從緩衝區中隨機抽取資料；`buffer_size` 越大打亂得越均勻，等於或大於資料筆數時才是完全隨機打亂。
* `padded_batch`: 指定 `batch_size`，還能夠指定 `padded_shapes`；當維度設為 `None` 時，會將同一個 batch 內的句子補 0 至該 batch 中最長句子的長度。

In [None]:
batch_size = 32
# Shuffle over the whole (cached) training set: a buffer much smaller than the
# dataset only mixes neighbouring elements instead of shuffling all sentences.
buffer_size = len(X_train)

# Both components are 1-D sequences of unknown length; padded_batch pads every
# batch with 0 up to the longest sequence inside that batch.
padded_shapes = (tf.TensorShape([None]), tf.TensorShape([None]))

# Training stream: encode -> cache the encoded ids -> shuffle -> pad/batch -> repeat forever.
# prefetch lets the next batch be prepared while the current one is being used.
train_generator = (
    train_tfdata
    .map(tf_encode, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .cache()
    .shuffle(buffer_size)
    .padded_batch(batch_size, padded_shapes=padded_shapes)
    .repeat()
    .prefetch(tf.data.experimental.AUTOTUNE)
)
# Evaluation stream: same encoding and padding, but never shuffled, so batches keep the
# order of X_test. The encoded ids are cached because this dataset is iterated once per
# epoch for validation and again for the final evaluation.
test_generator = (
    test_tfdata
    .map(tf_encode, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .cache()
    .padded_batch(batch_size, padded_shapes=padded_shapes)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

In [None]:
# Pull one padded batch from the training pipeline to inspect it.
x = iter(train_generator)
tmp_inp = next(x)

In [None]:
tmp_inp

(<tf.Tensor: shape=(32, 55), dtype=int32, numpy=
 array([[21466, 46086, 48164, ...,     0,     0,     0],
        [16201, 48178, 45380, ...,     0,     0,     0],
        [21466, 40685, 54125, ...,     0,     0,     0],
        ...,
        [21466,  9326, 48178, ...,     0,     0,     0],
        [52813, 52823, 24361, ...,     0,     0,     0],
        [38169, 19411,  9013, ...,     0,     0,     0]], dtype=int32)>,
 <tf.Tensor: shape=(32, 55), dtype=int32, numpy=
 array([[1, 9, 8, ..., 0, 0, 0],
        [5, 1, 8, ..., 0, 0, 0],
        [1, 8, 9, ..., 0, 0, 0],
        ...,
        [1, 5, 1, ..., 0, 0, 0],
        [2, 3, 7, ..., 0, 0, 0],
        [7, 9, 9, ..., 0, 0, 0]], dtype=int32)>)

## 建立模型

tensorflow 提供三種建立模型的方法：

1. Sequential API
2. Functional API
3. Model Subclassing

以下為 Model Subclassing 的寫法。

In [None]:
class postag_rnn(tf.keras.Model):
  """
  LSTM part-of-speech tagger written with Keras model subclassing.

  Subclassing tf.keras.Model is what makes model.fit, model.predict, etc. available.

  Index 0 is the padding value produced by padded_batch, so the embedding layer is
  created with mask_zero=True and its mask is handed to the LSTM and to the
  TimeDistributed output layer. Padded time steps are then left out of the loss and
  the metrics instead of being counted as (trivially correct) predictions.

  Args:
    embedding_size: dimension of each word vector.
    rnn_units: number of LSTM units, i.e. the size of every hidden state.

  The notebook-level ``word_vocab_size`` and ``tag_vocab_size`` are read when the
  layers are created.
  """
  def __init__(self, embedding_size, rnn_units):
    super().__init__()
    # Word-embedding lookup table; mask_zero=True marks index 0 as padding.
    self.embedding = tf.keras.layers.Embedding(input_dim=word_vocab_size,
                                               output_dim=embedding_size,
                                               mask_zero=True)
    # LSTM that returns the hidden state of every time step, not just the last one.
    self.lstm = tf.keras.layers.LSTM(rnn_units, recurrent_initializer='glorot_uniform', return_sequences=True)
    # Output layer: a softmax over the tag vocabulary.
    output_layer = tf.keras.layers.Dense(units=tag_vocab_size, activation='softmax')
    # Every position needs a prediction, so TimeDistributed reuses output_layer at each time step.
    self.timedistributed = tf.keras.layers.TimeDistributed(output_layer)

  def call(self, x):
    """
    embedding: turns every word index into a vector, so a sentence becomes a matrix
    lstm: consumes the word vectors in order and emits one hidden state per position
    timedistributed: feeds every hidden state to the dense layer, giving a vector of
                     length tag_vocab_size (one probability per tag) at each position
    """
    embedded = self.embedding(x)
    # Pass the padding mask explicitly rather than relying on implicit propagation
    # inside a subclassed call().
    mask = self.embedding.compute_mask(x)
    hidden_states = self.lstm(embedded, mask=mask)
    outputs = self.timedistributed(hidden_states, mask=mask)

    return outputs

  def _model(self, seq_len=20):
    """Wrap the layers in a functional model so that .summary() can print output shapes.

    Args:
      seq_len: sequence length used for the symbolic input (only affects the
        shapes shown in the summary).
    """
    # shape must be a tuple; `(20)` without the trailing comma is just the int 20.
    x = tf.keras.layers.Input(shape=(seq_len,))
    return tf.keras.Model(inputs=[x], outputs=self.call(x))


In [None]:
# Throwaway instance used only to print the layer summary; the model that is trained is created in a later cell.
embedding_size = 256
rnn_units = 512

tmp_model = postag_rnn(embedding_size, rnn_units)
tmp_model._model().summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 20)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 20, 256)           17169408  
_________________________________________________________________
lstm (LSTM)                  (None, 20, 512)           1574912   
_________________________________________________________________
time_distributed (TimeDistri (None, 20, 13)            6669      
Total params: 18,750,989
Trainable params: 18,750,989
Non-trainable params: 0
_________________________________________________________________


## 編譯模型

In [None]:
# Hyper-parameters of the model that is actually trained.
embedding_size = 256
rnn_units = 512

model = postag_rnn(embedding_size, rnn_units)

optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

# The targets are integer tag ids rather than one-hot vectors, hence the sparse variant of categorical cross-entropy.
model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

## Callbacks

In [None]:
model_path = './save_model/checkpoints_postag_model.keras'  # where the model is saved

# Create the checkpoint callback
checkpoint = tf.keras.callbacks.ModelCheckpoint(model_path,
                                                verbose=1,
                                                monitor='val_loss',    # metric that decides when to save
                                                save_best_only=True,  # keep only the best model so far
                                                mode='min')           # lower monitored value counts as better

## 訓練模型

In [None]:
# train_generator repeats forever, so Keras needs an explicit epoch length: the number
# of batches in one pass over the training set, i.e. ceil(len(X_train) / batch_size).
# Ceiling division gives exactly that, also when the training size is an exact
# multiple of the batch size (where `// batch_size + 1` would run one step too many).
model.fit(train_generator,
          epochs=5,
          validation_data=test_generator,
          steps_per_epoch=(len(X_train) + batch_size - 1) // batch_size,
          callbacks=[checkpoint])

Epoch 1/5
Epoch 00001: val_loss improved from inf to 0.05211, saving model to ./save_model/checkpoints_postag_model.keras
Epoch 2/5
Epoch 00002: val_loss improved from 0.05211 to 0.04751, saving model to ./save_model/checkpoints_postag_model.keras
Epoch 3/5
Epoch 00003: val_loss did not improve from 0.04751
Epoch 4/5
Epoch 00004: val_loss did not improve from 0.04751
Epoch 5/5
Epoch 00005: val_loss did not improve from 0.04751


<tensorflow.python.keras.callbacks.History at 0x7f7140281748>

## 評估模型

In [None]:
# Predicted tag ids, one list per test sentence
testing_preds = list()
# True tag ids, one list per test sentence
testing_true = list()

# Outer loop: predict one padded batch at a time.
# The loop variables are named batch_* so they do not overwrite the notebook-level
# `words` / `tags` lists built during preprocessing.
for batch_words, batch_tags in tqdm(test_generator):
  batch_probs = model.predict_on_batch(batch_words)
  # Index 0 is padding, never a real tag, so it is excluded from the argmax;
  # the +1 maps the result back to the original tag ids.
  batch_pred_index = np.argmax(batch_probs[..., 1:], axis=-1) + 1
  batch_true_index = batch_tags.numpy()
  # Inner loop: store prediction and truth for every sentence of the batch.
  for pred_row, true_row in zip(batch_pred_index, batch_true_index):
    # Padding is located with the TRUE labels and the same mask is applied to both
    # rows. Filtering the predictions by their own value would drop real tokens
    # predicted as 0 and keep padded positions predicted as non-zero, leaving
    # predictions and labels misaligned.
    is_real_token = true_row != 0
    testing_preds.append(pred_row[is_real_token].tolist())
    testing_true.append(true_row[is_real_token].tolist())

100%|██████████| 452/452 [00:22<00:00, 19.75it/s]


In [None]:
# Look at the test sentence stored at index 5 (test_generator is not shuffled,
# so the collected predictions line up with X_test by position).
print_index = 5

word = X_test[print_index]
pred, true = testing_preds[print_index], testing_true[print_index]

# Translate the tag ids back to their tag names.
pred_tag, true_tag = (
    [index_to_tag[tag_id] for tag_id in tag_ids]
    for tag_ids in (pred, true)
)

In [None]:
# Print the sentence, the predicted tags and the true tags one after another.
for heading, content in (('Input words: \n', word),
                         ('Prediction: \n', pred_tag),
                         ('True: \n', true_tag)):
  print(heading, content)

Input words: 
 School teachers , all too unprepared for the job they must do , will need demonstrators .
Prediction: 
 ['NOUN', 'NOUN', '.', 'PRT', 'ADV', 'ADJ', 'ADP', 'DET', 'NOUN', 'PRON', 'VERB', 'VERB', '.', 'VERB', 'VERB', 'NOUN', '.']
True: 
 ['NOUN', 'NOUN', '.', 'PRT', 'ADV', 'ADJ', 'ADP', 'DET', 'NOUN', 'PRON', 'VERB', 'VERB', '.', 'VERB', 'VERB', 'NOUN', '.']
