### 16.2 시퀀스 모델링을 위한 RNN

In [1]:
import tensorflow as tf
# Fix the global seed so the randomly initialised weights are reproducible.
tf.random.set_seed(1)

# Recurrent layer with two units that returns the output of every time step.
rnn_layer = tf.keras.layers.SimpleRNN(
    2, return_sequences=True, use_bias=True)

# Create the weights for 5-feature inputs; batch size and sequence length
# are left unspecified.
rnn_layer.build(input_shape=(None, None, 5))

# The layer exposes three weights: input kernel, recurrent kernel and bias.
w_xh, w_oo, b_h = rnn_layer.weights
for caption, weight in (('W_xh 크기:', w_xh),
                        ('W_oo 크기:', w_oo),
                        ('b_h 크기:', b_h)):
    print(caption, weight.shape)

W_xh 크기: (5, 2)
W_oo 크기: (2, 2)
b_h 크기: (2,)


In [2]:
# Three time steps, each a 5-dimensional feature vector.
x_seq = tf.convert_to_tensor(
    [[1.0]*5, [2.0]*5, [3.0]*5],
    dtype=tf.float32)

## Output of the SimpleRNN layer (input reshaped to a batch of one sequence):
output = rnn_layer(tf.reshape(x_seq, shape=(1, 3, 5)))

## Compute the same output by hand, one time step at a time:
out_man = []
for t in range(len(x_seq)):
    xt = tf.reshape(x_seq[t], (1, 5))
    print('타임 스텝 {} =>'.format(t))
    print('    입력           :', xt.numpy())

    # Input contribution: x_t @ W_xh + b_h.
    ht = tf.matmul(xt, w_xh) + b_h
    print('    은닉           :', ht.numpy())

    # The previous output feeds back into the current step; at the first
    # step there is no previous output, so zeros are used.
    if t > 0:
        prev_o = out_man[t-1]
    else:
        prev_o = tf.zeros(shape=(ht.shape))
    ot = ht + tf.matmul(prev_o, w_oo)
    ot = tf.math.tanh(ot)
    out_man.append(ot)
    print('    출력 (수동)     :', ot.numpy())
    # The label has no placeholder, so the former no-op .format(t) is dropped.
    print('    SimpleRNN  출력 :',
          output[0][t].numpy())
    print()

타임 스텝 0 =>
    입력           : [[1. 1. 1. 1. 1.]]
    은닉           : [[0.41464037 0.96012145]]
    출력 (수동)     : [[0.39240566 0.74433106]]
    SimpleRNN  출력 : [0.39240566 0.74433106]

타임 스텝 1 =>
    입력           : [[2. 2. 2. 2. 2.]]
    은닉           : [[0.82928073 1.9202429 ]]
    출력 (수동)     : [[0.80116504 0.9912947 ]]
    SimpleRNN  출력 : [0.80116504 0.9912947 ]

타임 스텝 2 =>
    입력           : [[3. 3. 3. 3. 3.]]
    은닉           : [[1.243921  2.8803642]]
    출력 (수동)     : [[0.95468265 0.9993069 ]]
    SimpleRNN  출력 : [0.95468265 0.9993069 ]



### 16.3 텐서플로로 시퀀스 모델링을 위한 RNN 구현

In [31]:
!pip install tensorflow_datasets



You should consider upgrading via the 'c:\users\ad\appdata\local\programs\python\python38\python.exe -m pip install --upgrade pip' command.


In [3]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import pandas as pd
# Load the gzip-compressed movie-review CSV from the working directory.
df = pd.read_csv(
    './movie_data.csv.gz',
    encoding='utf-8',
)
# Preview the first rows.
df.head()

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


In [4]:
## Step 1: build the dataset
# Read the label column and drop it from a copy instead of calling
# df.pop(): pop() removed the column from `df` in place, so re-running this
# cell raised KeyError. `df` itself is now left untouched.
target = df['sentiment']
ds_raw = tf.data.Dataset.from_tensor_slices(
    (df.drop(columns='sentiment').values, target.values))

## Sanity check: first 50 bytes of the text and the label of three examples
for ex in ds_raw.take(3):
    tf.print(ex[0].numpy()[0][:50], ex[1])

b'In 1974, the teenager Martha Moxley (Maggie Grace)' 1
b'OK... so... I really like Kris Kristofferson and h' 0
b'***SPOILER*** Do not read this, if you think about' 0


In [5]:
# Seed the global generator so the shuffle order is reproducible.
tf.random.set_seed(1)

# Shuffle once and keep that order on every later pass; the take/skip
# splits below rely on each iteration seeing the same order.
ds_raw = ds_raw.shuffle(
    buffer_size=50000, reshuffle_each_iteration=False)

# First 25,000 examples -> test, next 20,000 -> train, the rest -> validation.
ds_raw_test = ds_raw.take(count=25000)
ds_raw_train_valid = ds_raw.skip(count=25000)
ds_raw_train = ds_raw_train_valid.take(count=20000)
ds_raw_valid = ds_raw_train_valid.skip(count=20000)

In [6]:
## Step 2: find the unique tokens (words)
from collections import Counter

tokenizer = tfds.deprecated.text.Tokenizer()
token_counts = Counter()
# Each element is a (text, label) pair; the review text is the first entry
# of the text tensor. Only the training split is used to count tokens.
for review, _label in ds_raw_train:
    token_counts.update(tokenizer.tokenize(review.numpy()[0]))
print('어휘 사전 크기:', len(token_counts))

어휘 사전 크기: 87007


In [7]:
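# Disabled (original note: "63장 모듈에러", i.e. module error): this variant uses tfds.features.text, presumably the source of the error; the cell above with tfds.deprecated.text is the working version.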
# ##2ST Find token
# from collections import Counter
# tokenizer = tfds.features.text.Tokenizer()
# token_counts = Counter()
# for example in ds_raw_train:
#     tokens = tokenizer.tokenize(example[0].numpy()[0])
#     token_counts.update(tokens)
# print(f'어휘 사전 크기 : {len(token_counts)}')

In [8]:
## Step 3: encode the unique tokens as integers
# The keys of the token counter serve as the vocabulary.
encoder = tfds.deprecated.text.TokenTextEncoder(
    vocab_list=token_counts)

# Try the encoder on a short sample sentence.
example_str = 'This is an example!'
print(encoder.encode(example_str))

[232, 9, 270, 1123]


In [9]:
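# Disabled (original note: "63장 모듈에러", i.e. module error): this variant uses tfds.features.text, presumably the source of the error; the cell above with tfds.deprecated.text is the working version.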
# ##3ST 고유토큰을 정수로 인코딩하기
# encoder = tfds.features.text.TokenTextEncoder(token_counts)
# example_str = 'This is an example'
# print(f'{encoder.encode(example_str)}')

In [10]:
## Step 3-A: define the function used for the transformation
def encode(text_tensor, label):
    """Encode one review into integer token ids.

    Args:
        text_tensor: eager tensor whose first element (``.numpy()[0]``)
            is the review text.
        label: returned unchanged.

    Returns:
        Tuple ``(encoded_text, label)``, where ``encoded_text`` is the
        result of ``encoder.encode`` applied to the review text.
    """
    raw_text = text_tensor.numpy()[0]
    return encoder.encode(raw_text), label

In [11]:
## Step 3-B: wrap the Python function as a TF operation
def encode_map_fn(text, label):
    """Run ``encode`` through ``tf.py_function`` for use with ``Dataset.map``.

    Args:
        text: text tensor forwarded to ``encode``.
        label: label tensor forwarded to ``encode``.

    Returns:
        The outputs of ``tf.py_function``, both declared as ``tf.int64``
        (encoded text, label).
    """
    output_types = (tf.int64, tf.int64)
    return tf.py_function(
        func=encode,
        inp=[text, label],
        Tout=output_types)
# Apply the encoding to every split.
ds_train = ds_raw_train.map(map_func=encode_map_fn)
ds_valid = ds_raw_valid.map(map_func=encode_map_fn)
ds_test = ds_raw_test.map(map_func=encode_map_fn)

# Check the shape of a few encoded examples:
tf.random.set_seed(1)
for encoded_review, _label in ds_train.shuffle(1000).take(5):
    print('시퀀스 길이:', encoded_review.shape)

시퀀스 길이: (24,)
시퀀스 길이: (179,)
시퀀스 길이: (262,)
시퀀스 길이: (535,)
시퀀스 길이: (130,)


In [12]:
# Batch every split in groups of 32. The first component (token ids) has an
# unknown length ([-1]) and is padded per batch; the label is a scalar ([]).
train_data, valid_data, test_data = (
    split.padded_batch(32, padded_shapes=([-1], []))
    for split in (ds_train, ds_valid, ds_test)
)

In [13]:
from tensorflow.keras.layers import Embedding
# Demo model with a single embedding layer: 100 possible input indices,
# each mapped to a 6-dimensional vector, for sequences of length 20.
model = tf.keras.Sequential([
    Embedding(
        input_dim=100,
        output_dim=6,
        input_length=20,
        name='embed-layer',
    ),
])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embed-layer (Embedding)      (None, 20, 6)             600       
Total params: 600
Trainable params: 600
Non-trainable params: 0
_________________________________________________________________


In [14]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import SimpleRNN
from tensorflow.keras.layers import Dense

In [15]:
# Embedding followed by two stacked SimpleRNN layers and a single-unit
# dense output. The first recurrent layer returns the full sequence so the
# second one receives an input for every time step.
model = Sequential([
    Embedding(input_dim=1000, output_dim=32),
    SimpleRNN(32, return_sequences=True),
    SimpleRNN(32),
    Dense(1),
])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 32)          32000     
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, None, 32)          2080      
_________________________________________________________________
simple_rnn_2 (SimpleRNN)     (None, 32)                2080      
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 36,193
Trainable params: 36,193
Non-trainable params: 0
_________________________________________________________________


In [None]:
embedding_dim = 20
# Two ids beyond the counted tokens; presumably reserved for padding and
# out-of-vocabulary tokens -- TODO confirm against the encoder.
vocab_size = len(token_counts) + 2

tf.random.set_seed(1)

## Build the model
bi_lstm_model = tf.keras.Sequential()
bi_lstm_model.add(tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    name='embed-layer'))
bi_lstm_model.add(tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(64, name='lstm-layer'),
    name='bidir-lstm'))
bi_lstm_model.add(tf.keras.layers.Dense(64, activation='relu'))
# Sigmoid output, so the loss below is configured with from_logits=False.
bi_lstm_model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
bi_lstm_model.summary()

## Compile and train
bi_lstm_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    metrics=['accuracy'])
history = bi_lstm_model.fit(
    train_data,
    validation_data=valid_data,
    epochs=10)

## Evaluate on the test data
# evaluate() returns the loss followed by the compiled metrics, so index 1
# is the accuracy.
test_results = bi_lstm_model.evaluate(test_data)
print('테스트 정확도: {:.2f}%'.format(test_results[1]*100))

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embed-layer (Embedding)      (None, None, 20)          1740180   
_________________________________________________________________
bidir-lstm (Bidirectional)   (None, 128)               43520     
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 1,792,021
Trainable params: 1,792,021
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
 55/625 [=>............................] - ETA: 2:38 - loss: 0.0711 - accuracy: 0.9736

In [None]:
from collections import Counter
def preprocess_datasets(
    ds_raw_train,
    ds_raw_valid,
    ds_raw_test,
    max_seq_length=None,
    batch_size = 32):
    
    ## 1단계: (데이터셋 만들기 이미 완료)
    ## 2단계: 고유 토큰 찾기
    tokenizer = tfds.deprecated.text.Tokenizer()
    token_counts = Counter()
    
    for example in ds_raw_train:
        tokens = tokenizer.tokenize(example[0].numpy()[0])
        if max_seq_length is not None:
            tokens = tokens[-max_seq_length:]
        token_counts.update(tokens)
        
    print('어휘 사전 크기:', len(token_counts))
    
    ## 3단계: 텍스트 인코딩하기
    encoder = tfds.deprecated.text.TokenTextEncorder(
                  token_counts)
    def encode(text_tensor,)