In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [2]:
tf.random.set_seed(42)

In [3]:
h = [1, 0, 0, 0]
e = [0, 1, 0, 0]
l = [0, 0, 1, 0]
o = [0, 0, 0, 1]

In [4]:
x_data = np.array([[h]], dtype=np.float32)
x_data

array([[[1., 0., 0., 0.]]], dtype=float32)

In [5]:
# sample 수(배치), token 수, token의 임베딩
x_data.shape

(1, 1, 4)

In [6]:
hidden_size = 2

# 마지막 레이어의 units가 2니까 output은 2
cell = layers.SimpleRNNCell(units = hidden_size)

# return_sequences: bool 형태로 T면 output 전체를 출력하는 것이고, F면 마지막 output만 출력
# return_state: bool 형태로 output에 더해 마지막 상태도 반환할지 결정하는 것
rnn = layers.RNN(cell, return_sequences = True, return_state = True)
output, states = rnn(x_data)

print('x_data : {} \t shape : {}'.format(x_data, x_data.shape))
print('output : {} \t shape : {}'.format(output, output.shape))
print('states: {} \t shape: {}'.format(states, states.shape))

x_data : [[[1. 0. 0. 0.]]] 	 shape : (1, 1, 4)
output : [[[ 0.31773362 -0.11744198]]] 	 shape : (1, 1, 2)
states: [[ 0.31773362 -0.11744198]] 	 shape: (1, 2)


2021-07-30 09:27:55.565770: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:
# cell 과 rnn 합치기

rnn = layers.SimpleRNN(units=hidden_size, return_sequences = True, return_state = True)

output, states = rnn(x_data)

print('x_data : {} \t shape : {}'.format(x_data, x_data.shape))
print('output : {} \t shape : {}'.format(output, output.shape))
print('states: {} \t shape: {}'.format(states, states.shape))

x_data : [[[1. 0. 0. 0.]]] 	 shape : (1, 1, 4)
output : [[[0.44843182 0.251574  ]]] 	 shape : (1, 1, 2)
states: [[0.44843182 0.251574  ]] 	 shape: (1, 2)


In [8]:
### 여러개 넣어보기
### shape를 이해하자!

In [9]:
x_data = np.array([[h,e,l,l,o]], dtype=np.float32)

In [10]:
x_data.shape

(1, 5, 4)

In [11]:
# cell 과 rnn 합치기

rnn = layers.SimpleRNN(units=hidden_size, return_sequences = True, return_state = True)

output, states = rnn(x_data)

print('x_data : {} \t shape : {}'.format(x_data, x_data.shape))
print('output : {} \t shape : {}'.format(output, output.shape))
print('states: {} \t shape: {}'.format(states, states.shape))

x_data : [[[1. 0. 0. 0.]
  [0. 1. 0. 0.]
  [0. 0. 1. 0.]
  [0. 0. 1. 0.]
  [0. 0. 0. 1.]]] 	 shape : (1, 5, 4)
output : [[[ 0.54152584 -0.00444528]
  [-0.24527906  0.8775963 ]
  [ 0.81667435 -0.7172945 ]
  [-0.39880708  0.19528553]
  [ 0.4313137  -0.69072646]]] 	 shape : (1, 5, 2)
states: [[ 0.4313137  -0.69072646]] 	 shape: (1, 2)


In [12]:
### batch 사이즈 바꿔보기

x_data = np.array([[h, e, l, l, o], [e, o, l, l, l], [l, l, e, e, l]], dtype=np.float32)

In [13]:
rnn = layers.SimpleRNN(units=hidden_size, return_sequences = True, return_state = True)

output, states = rnn(x_data)

print('x_data : {} \t shape : {}'.format(x_data, x_data.shape))
print('output : {} \t shape : {}'.format(output, output.shape))
print('states: {} \t shape: {}'.format(states, states.shape))

x_data : [[[1. 0. 0. 0.]
  [0. 1. 0. 0.]
  [0. 0. 1. 0.]
  [0. 0. 1. 0.]
  [0. 0. 0. 1.]]

 [[0. 1. 0. 0.]
  [0. 0. 0. 1.]
  [0. 0. 1. 0.]
  [0. 0. 1. 0.]
  [0. 0. 1. 0.]]

 [[0. 0. 1. 0.]
  [0. 0. 1. 0.]
  [0. 1. 0. 0.]
  [0. 1. 0. 0.]
  [0. 0. 1. 0.]]] 	 shape : (3, 5, 4)
output : [[[-0.12818007  0.04968958]
  [-0.12667213 -0.66418386]
  [ 0.13962443  0.6124754 ]
  [-0.57845664 -0.30636993]
  [ 0.4918073   0.28548098]]

 [[-0.00650635 -0.6992318 ]
  [ 0.83611715  0.11154988]
  [ 0.17084251 -0.4801117 ]
  [ 0.19048181  0.35132104]
  [-0.4037878  -0.18411745]]

 [[-0.2763234   0.18424074]
  [-0.5391817   0.27078918]
  [-0.5094133  -0.55813867]
  [ 0.08890549 -0.11569378]
  [-0.13709441  0.1911195 ]]] 	 shape : (3, 5, 2)
states: [[ 0.4918073   0.28548098]
 [-0.4037878  -0.18411745]
 [-0.13709441  0.1911195 ]] 	 shape: (3, 2)


In [14]:
# 한글도 해보자

idx2char = ['토', '마', '를', '먹', '자']
x_data = [[0, 0, 1, 2, 4, 3]] # 토 토 마 를 자 먹
y_data = [[0, 1, 0, 2, 3, 4]] # 토 마 토 를 먹 자

input_dim = 5
sequence_len = 6
learning_rate = 0.1

# onehot
x_one_hot = tf.keras.utils.to_categorical(x_data, num_classes=input_dim)
y_one_hot = tf.keras.utils.to_categorical(y_data, num_classes=input_dim)

In [17]:
from tensorflow.keras.models import Model

model = tf.keras.Sequential()
cell = layers.SimpleRNNCell(units=input_dim, input_shape=(sequence_len, input_dim))

model.add(layers.RNN(cell=cell, return_sequences=True, return_state=False, input_shape=(sequence_len, input_dim)))

# 모든 타임 스텝에서 출력을 Dense 층에 적용하는 역할을 한다. 쉽게 말해 매 스텝마다 FC가 연결된 것처럼 이해할 수 있다. 각 타임 스텝을 별개의 샘플처럼 다루도록 입력의 크기를 바꾸어 이를 효과적으로 수행한다. 
model.add(layers.TimeDistributed(layers.Dense(units = input_dim, activation='softmax')))

model.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate), metrics=['acc'])

In [18]:
model.summary()
model.fit(x_one_hot, y_one_hot, epochs=10)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
rnn_1 (RNN)                  (None, 6, 5)              55        
_________________________________________________________________
time_distributed (TimeDistri (None, 6, 5)              30        
Total params: 85
Trainable params: 85
Non-trainable params: 0
_________________________________________________________________


2021-07-30 09:28:41.929976: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fd72ad4f820>

## imdb

In [20]:
import tensorflow as tf
import tensorflow_datasets as tfds

imdb, info = tfds.load("imdb_reviews", with_info=True, as_supervised=True)

In [21]:
import numpy as np

train_data, test_data = imdb['train'], imdb['test']
training_sentences = []
training_labels = []

testing_sentences = []
testing_labels= []

# numpy array로 바꾼 후 utf-8로 디코딩
for s, l in train_data:
    training_sentences.append(s.numpy().decode('utf8'))
    training_labels.append(l.numpy())

for s, l in test_data:
    testing_sentences.append(s.numpy().decode('utf8'))
    testing_labels.append(l.numpy())

In [22]:
training_labels = np.array(training_labels)
testing_labels = np.array(testing_labels)

In [23]:
vocab_size = 10000
embedding_dim = 200
max_length = 120
trunc_type = 'post'
oov_tok = '<oov>'

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words = vocab_size, oov_token = oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(training_sentences)
padded = pad_sequences(sequences, maxlen = max_length, truncating = trunc_type)

# testing은 알지 못하는 자료이니 fit_on_text 하면 안 됨!
testing_sequence = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequence, maxlen = max_length, truncating = trunc_type)

In [24]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

def decode_review(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])

decode_review(padded[3])

'? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? this is the kind of film for a snowy sunday afternoon when the rest of the world can go ahead with its own business as you <oov> into a big arm chair and <oov> for a couple of hours wonderful performances from cher and nicolas cage as always gently row the plot along there are no <oov> to cross no dangerous waters just a warm and witty <oov> through new york life at its best a family film in every sense and one that deserves the praise it received'

In [33]:
model = tf.keras.Sequential([tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
                             tf.keras.layers.SimpleRNN(units=32, input_shape=(max_length, embedding_dim), return_sequences=True),
                             layers.TimeDistributed(layers.Dense(128, activation='relu')),
                            tf.keras.layers.Dense(1, activation='sigmoid')])
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 120, 200)          2000000   
_________________________________________________________________
simple_rnn_8 (SimpleRNN)     (None, 120, 32)           7456      
_________________________________________________________________
time_distributed_3 (TimeDist (None, 120, 128)          4224      
_________________________________________________________________
dense_12 (Dense)             (None, 120, 1)            129       
Total params: 2,011,809
Trainable params: 2,011,809
Non-trainable params: 0
_________________________________________________________________


In [34]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics = ['acc'])

In [35]:
NUM_EPOCHS = 5

history = model.fit(padded, training_labels, validation_data=(testing_padded, testing_labels),
                    epochs=NUM_EPOCHS, batch_size=100)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## Subwords

In [42]:
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import warnings
warnings.filterwarnings('ignore')

In [43]:
imdb, info = tfds.load('imdb_reviews/subwords8k', with_info=True, as_supervised=True)



In [44]:
train_data, test_data = imdb['train'], imdb['test']

In [45]:
for item in train_data:
    print(item)
    break

(<tf.Tensor: shape=(163,), dtype=int64, numpy=
array([  62,   18,   41,  604,  927,   65,    3,  644, 7968,   21,   35,
       5096,   36,   11,   43, 2948, 5240,  102,   50,  681, 7862, 1244,
          3, 3266,   29,  122,  640,    2,   26,   14,  279,  438,   35,
         79,  349,  384,   11, 1991,    3,  492,   79,  122,  188,  117,
         33, 4047, 4531,   14,   65, 7968,    8, 1819, 3947,    3,   62,
         27,    9,   41,  577, 5044, 2629, 2552, 7193, 7961, 3642,    3,
         19,  107, 3903,  225,   85,  198,   72,    1, 1512,  738, 2347,
        102, 6245,    8,   85,  308,   79, 6936, 7961,   23, 4981, 8044,
          3, 6429, 7961, 1141, 1335, 1848, 4848,   55, 3601, 4217, 8050,
          2,    5,   59, 3831, 1484, 8040, 7974,  174, 5773,   22, 5240,
        102,   18,  247,   26,    4, 3903, 1612, 3902,  291,   11,    4,
         27,   13,   18, 4092, 4008, 7961,    6,  119,  213, 2774,    3,
         12,  258, 2306,   13,   91,   29,  171,   52,  229,    2, 1245,
    

2021-07-30 10:24:12.523629: W tensorflow/core/kernels/data/cache_dataset_ops.cc:768] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [46]:
# subword 인코딩해서 가져오기
tokenizer = info.features['text'].encoder
print(tokenizer.subwords[:10])

['the_', ', ', '. ', 'a_', 'and_', 'of_', 'to_', 's_', 'is_', 'br']


In [51]:
string = 'TensorFlow, from basics to mastery'

tokenized_string = tokenizer.encode(string)
print(tokenized_string)

[6307, 2327, 4043, 2120, 2, 48, 4249, 4429, 7, 2652, 8050]


In [52]:
original_string = tokenizer.decode(tokenized_string)
print(original_string)

TensorFlow, from basics to mastery


In [53]:
for token in tokenized_string:
    print('{} -> {}'.format(token, tokenizer.decode([token])))

6307 -> Ten
2327 -> sor
4043 -> Fl
2120 -> ow
2 -> , 
48 -> from 
4249 -> basi
4429 -> cs 
7 -> to 
2652 -> master
8050 -> y


In [59]:
BUFFER_SIZE = 10000
BATCH_SIZE = 64

train_dataset = train_data.shuffle(BUFFER_SIZE)
# padded_batch? batch_size만큼 패딩 해서 가져오는 것
train_dataset = train_dataset.padded_batch(BATCH_SIZE, tf.compat.v1.data.get_output_shapes(train_dataset))
test_dataset = test_data.padded_batch(BATCH_SIZE)

In [89]:
embedding_dim = 16
max_length = 120
model = tf.keras.Sequential([tf.keras.layers.Embedding(tokenizer.vocab_size, embedding_dim),
#                              tf.keras.layers.SimpleRNN(32),
#                              tf.keras.layers.LSTM(32),
                             tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
                             tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
                             tf.keras.layers.Dense(64, activation='relu'),
                             tf.keras.layers.Dense(1, activation='sigmoid')

])

In [90]:
model.summary()

Model: "sequential_15"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_16 (Embedding)     (None, None, 16)          130960    
_________________________________________________________________
bidirectional_5 (Bidirection (None, None, 128)         41472     
_________________________________________________________________
bidirectional_6 (Bidirection (None, 64)                41216     
_________________________________________________________________
dense_31 (Dense)             (None, 64)                4160      
_________________________________________________________________
dense_32 (Dense)             (None, 1)                 65        
Total params: 217,873
Trainable params: 217,873
Non-trainable params: 0
_________________________________________________________________


In [None]:
# 매우 느림 (중단)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics = ['acc'])
NUM_EPOCHS = 10

history = model.fit(train_dataset, validation_data=(test_dataset),
                    epochs=NUM_EPOCHS, batch_size=500)

In [None]:
import matplotlib.pyplot as plt

def plot_graphs(history, string):
    plt.plot(history.history[string])
    plt.plot(history.history['val_'+string])
    plt.xlabel('Epochs')
    plt.ylabel(string)
    plt.legend([string, 'val_' + string])
    plt.show()

plot_graphs(history, 'acc')
plot_graphs(history, 'loss')

In [95]:
a = ["a","b","c"]
a.index("a")

0