# 양방향 LSTM & Attention Mechanism

## IMDB dataset

In [1]:
from tensorflow.keras.datasets import imdb
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
vocab_size = 10000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=vocab_size)

In [3]:
X_train

array([list([1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]),
       list([1, 194, 1153, 194, 8255, 78, 228,

이미 정수 인코딩 완. -> 패딩 필요

In [4]:
import numpy as np
print('Max length of Review : {}'.format(max(len(l) for l in X_train)))
print('AVG length of Review : {}'.format(sum(map(len, X_train)) / len(X_train)))
print('AVG length of Review : {}'.format(np.mean([ len(l) for l in X_train ])))

Max length of Review : 2494
AVG length of Review : 238.71364
AVG length of Review : 238.71364


In [5]:
# 평균보다 조금 크게 패딩 진행
max_len = 500
X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)

print(X_train.shape)
print(X_test.shape)

(25000, 500)
(25000, 500)


## Bahdanau Attention

스코어함수를 활용하여 어텐션 메커니즘 구현 가능
$ \text{score(query, key)} = V^T tanh(W_1 key + W_2 query) $

텍스트 분류에서 어텐션 메커니즘을 사용하는 이유? -> 과거 time step의 정보를 마지막에 한번 더 사용하기 위함!
- RNN의 마지막 은닉 상태는 예측을 위해 사용
- 하지만 이미 마지막 은닉 상태는 몇 가지 유용한 정보들을 손실한 상태임 -> RNN이 time step을 지나며 손실했던 정보들을 다시 참고함
- 다시 말해서 RNN의 모든 은닉 상태들을 다시 한 번 참고 하겠다는 소리임

In [6]:
import tensorflow as tf

In [7]:
class BahdanauAttention(tf.keras.Model):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = Dense(units)
        self.W2 = Dense(units)
        self.V = Dense(1)

    def call(self, values, query): # where key == value
        # query shape == (batch_size, hidden size)
        # hidden with time axis shape == (batch_size, 1, hidden size)
        # score 계산을 위해 뒤에서 할 덧셈을 위해서 차원 변경 진행
        hidden_with_time_axis = tf.expand_dims(query, 1)

        # score shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying score to self.V
        # the shape of the tensor before applying self.V is (batch_size, max_length, units)
        score = self.V(tf.nn.tanh(self.W1(values) + self.W2(hidden_with_time_axis)))

        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

In [8]:
from tensorflow.keras.layers import Dense, Embedding, Bidirectional, LSTM, Concatenate, Dropout
from tensorflow.keras import Input, Model
from tensorflow.keras import optimizers
import os

In [9]:
sequence_input = Input(shape=(max_len, ), dtype='int32')
embedded_sequences = Embedding(vocab_size, 128, input_length=max_len, mask_zero=True)(sequence_input)
# 10000개의 단어들을 128차원의 벡터로 임베딩

Metal device set to: Apple M1 Pro


2022-12-20 09:39:00.777457: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-12-20 09:39:00.777588: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [10]:
# BI LSTM 설계
# 양방향 LSTM 두 층 설계 --> 첫번째 층 return_sequences=True
lstm = Bidirectional(LSTM(64, dropout=0.5, return_sequences=True))(embedded_sequences)

In [11]:
lstm, forward_h, forward_c, backward_h, backward_c = Bidirectional(LSTM(64,
                                                                        dropout=0.5,
                                                                        return_sequences=True,
                                                                        return_state=True))(lstm)

In [12]:
print(lstm.shape, forward_h.shape, forward_c.shape, backward_h.shape, backward_c.shape)

(None, 500, 128) (None, 64) (None, 64) (None, 64) (None, 64)


순방향 LSTM의 은닉상태와 셀상태를 forward_h, forward_c에 저장하고, 역방향 LSTM의 은닉상태와 셀 상태를 backward_h, backward_c에 저장

각 은닉 상태나 셀 상태의 경우에는 128차원을 갖는데, lstm의 경우에는 (500 x 128)의 크기를 가짐. forward 방향과 backward 방향이 연결된 hidden_state벡터가 모든 시점에서 존재함을 의미.

양방향 LSTM을 사용할 경우에는 순방향 LSTM과 역방향 LSTM 각각 은닉 상태와 셀 상태를 가지므로, 양방향 LSTM의 은닉 상태와 셀 상태를 사용하려면 두 방향의 LSTM의 상태들을 연결해주면 됨.

In [13]:
state_h = Concatenate()([forward_h, backward_h]) # 은닉 상태
state_c = Concatenate()([forward_c, backward_c]) # 셀 상태

# 어텐션 메커니즘에서는 은닉 상태를 사용함. 이를 입력으로 컨텍스트 벡터를 어등ㄹ 수 있음

In [14]:
attention = BahdanauAttention(64) # 가중치 크기 정의

context_vector, attention_weights = attention(lstm, state_h)

컨텍스트 벡터를 밀집층에 통과시키고, 이진 분류이므로 최종 츨력층에 1개의 뉴런을 배치, 활성화 함수로 시그모이드를 사용

In [15]:
dense1 = Dense(20, activation='relu')(context_vector)
dropout = Dropout(0.5)(dense1)
output = Dense(1, activation='sigmoid')(dropout)
model = Model(inputs=sequence_input, outputs=output)

In [16]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
history = model.fit(X_train, y_train, epochs=3, batch_size=256, validation_data=(X_test, y_test), verbose=1)

Epoch 1/3


2022-12-20 09:39:27.471201: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-12-20 09:39:31.031277: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-12-20 09:39:31.678776: W tensorflow/core/common_runtime/forward_type_inference.cc:332] Type inference failed. This indicates an invalid graph that escaped type checking. Error message: INVALID_ARGUMENT: expected compatible input types, but input 1:
type_id: TFT_OPTIONAL
args {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_TENSOR
    args {
      type_id: TFT_INT32
    }
  }
}
 is neither a subtype nor a supertype of the combined inputs preceding it:
type_id: TFT_OPTIONAL
args {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_TENSOR
    args {
      type_id: TFT_FLOAT
    }
  }
}

	while inferring type of node 'cond_42/output/_24'
2022-12-20 09:39:31.682257: I tensorflow/core/grappler/optimizers/custom_grap

 4/98 [>.............................] - ETA: 35:51:02 - loss: 0.6932 - accuracy: 0.4932

In [None]:
print("\n 테스트 정확도: %.4f" % (model.evaluate(X_test, y_test)[1]))

In [None]:
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

In [1]:
print(1)

1
