<a href="https://colab.research.google.com/github/SeongilHeo/hufs_ai_camp/blob/master/Day9_2_rnn_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text classification with an RNN

In [None]:
# Ask for TensorFlow 2.x when the notebook runs in Colab.
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  # Best effort: any failure here is ignored so the rest of the notebook runs.
  pass

## 텐서플로와 다른 라이브러리 임포트

In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals

from tensorflow.keras.datasets import imdb

import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Allocate GPU memory on demand at runtime instead of reserving it all up front.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  try:
    # TensorFlow requires the memory-growth setting to be the same on every
    # visible GPU, so enable it on all of them rather than only the first.
    for gpu in gpus:
      tf.config.experimental.set_memory_growth(gpu, True)
  except RuntimeError as e:
    # Memory growth must be set at program start, before GPUs are initialized.
    print(e)

#### 데이터 로딩

In [None]:
# Load the IMDB train/test splits with the vocabulary limited via num_words=10000.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=10000)

In [None]:
# Summarise the dataset: split sizes first, then the label count.
n_train_reviews = len(x_train)
n_test_reviews = len(x_test)
print('훈련용 리뷰 개수 : {}'.format(n_train_reviews))
print('테스트용 리뷰 개수 : {}'.format(n_test_reviews))
# Largest label value plus one (assumes labels are 0-based integer ids).
num_classes = max(y_train) + 1
print('카테고리 : {}'.format(num_classes))

In [None]:
# Display the shape of the training array.
x_train.shape

In [None]:
# Print the first training review as stored in the dataset, then its label.
print(x_train[0])
print(y_train[0])

#### 리뷰의 통계정보 확인

In [None]:
# Measure every training review once and reuse the lengths for the
# summary figures and the histogram.
review_lengths = [len(review) for review in x_train]
longest_review = max(review_lengths)
mean_review_length = sum(review_lengths) / len(review_lengths)
print('리뷰의 최대 길이 : {}'.format(longest_review))
print('리뷰의 평균 길이 : {}'.format(mean_review_length))

# Histogram of review lengths across the training split.
plt.hist(review_lengths, bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()

In [None]:
# Count how many training samples carry each distinct label value.
unique_elements, counts_elements = np.unique(y_train, return_counts=True)
print("각 레이블에 대한 빈도수:")
# Stack labels and counts into one 2-row array: row 0 labels, row 1 counts.
print(np.asarray((unique_elements, counts_elements)))

In [None]:
# Fetch the word -> index vocabulary and build the reverse index -> word map.
word_to_index = imdb.get_word_index()
index_to_word = {index: word for word, index in word_to_index.items()}

In [None]:
# Look up the words stored under index 1 and index 3941 of the reverse map.
print('빈도수 상위 1번 단어 : {}'.format(index_to_word[1]))
print('빈도수 상위 3941번 단어 : {}'.format(index_to_word[3941]))

In [None]:
# Decode the first training review back into text.
# imdb.load_data() reserves ids 0 (padding), 1 (start) and 2 (unknown) and
# shifts every word index by index_from=3, so undo that shift before looking
# the word up; reserved ids have no word and are rendered as '?'.
print(' '.join([index_to_word.get(X - 3, '?') for X in x_train[0]]))

### 모델 훈련 및 검증

#### 훈련을 위한 데이터 선택 및 전처리

In [None]:
# Reload the dataset for training with the vocabulary limited to
# TRAIN_VOCABULARY; this replaces the splits loaded earlier.
TRAIN_VOCABULARY=5000
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=TRAIN_VOCABULARY)

In [None]:
# Bring every review to a common length of max_time_steps tokens
# (shorter reviews are padded, longer ones truncated).
max_time_steps=500
x_train = tf.keras.preprocessing.sequence.pad_sequences(x_train, maxlen=max_time_steps)
x_test = tf.keras.preprocessing.sequence.pad_sequences(x_test, maxlen=max_time_steps)

#### 모델 정의

In [None]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  """Assemble the Embedding -> LSTM -> sigmoid Dense classifier.

  Args:
    vocab_size: Input dimension passed to the embedding layer.
    embedding_dim: Output dimension of the embedding layer.
    rnn_units: Number of units in the LSTM layer.
    batch_size: Accepted for interface compatibility; not used by the model.

  Returns:
    An uncompiled ``tf.keras.Sequential`` model with one sigmoid output unit.
  """
  embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
  recurrent = tf.keras.layers.LSTM(rnn_units)
  classifier = tf.keras.layers.Dense(1, activation='sigmoid')
  return tf.keras.Sequential([embedding, recurrent, classifier])

#### 모델 생성

In [None]:
# Hyperparameters for the classifier.
embedding_dim = 120
rnn_units = 120
BATCH_SIZE=64

# Build the model; the vocabulary size matches the num_words used when the
# training data was loaded. batch_size is passed through, but build_model
# does not use it.
model = build_model(
  vocab_size = TRAIN_VOCABULARY,
  embedding_dim=embedding_dim,
  rnn_units=rnn_units,
  batch_size=BATCH_SIZE)

In [None]:
# Print the model's layer-by-layer summary.
model.summary()

#### 모델 훈련 설정

In [None]:
# Configure training: binary cross-entropy on probabilities (from_logits=False,
# matching the sigmoid output layer), Adam with learning rate 1e-4, and
# accuracy as the reported metric.
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy']) 

#### 모델 훈련

In [None]:
# Train for EPOCHS epochs, holding out 10% of the training data for validation.
# BATCH_SIZE is passed explicitly so the batch size declared with the other
# hyperparameters is actually used instead of the Keras default.
EPOCHS = 10
history = model.fit(x_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE,
                    validation_split=0.1)

#### 모델 평가

In [None]:
# Evaluate on the test split; evaluate() returns the loss followed by the
# compiled metric (accuracy).
test_loss, test_acc = model.evaluate(x_test, y_test)
print('Test Loss: {}'.format(test_loss))
print('Test Accuracy: {}'.format(test_acc)) 

In [None]:
def plot_graphs(history, metric):
  """Plot a training metric and its validation counterpart against epochs.

  Args:
    history: Object whose ``history`` mapping holds the per-epoch values
      (here, the value returned by ``model.fit``).
    metric: Key of the training series; the validation series is read from
      ``'val_' + metric``.
  """
  val_metric = 'val_' + metric
  curves = history.history
  plt.plot(curves[metric])
  plt.plot(curves[val_metric], '')
  plt.xlabel("Epochs")
  plt.ylabel(metric)
  plt.legend([metric, val_metric])
  plt.show()
    
# Plot training vs. validation curves for accuracy and for loss.
plot_graphs(history, 'accuracy')
plot_graphs(history, 'loss')

#### 모델 테스트

In [None]:
def sample_predict(pred_text):
  """Predict the sentiment of a raw review string with the trained model.

  The text is tokenized and encoded the same way ``imdb.load_data`` encodes
  the training reviews, then padded to the training sequence length.

  Args:
    pred_text: Review text to classify.

  Returns:
    The array returned by ``model.predict`` for the single encoded review.
  """
  list_of_words = tf.keras.preprocessing.text.text_to_word_sequence(
                    pred_text,
                    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                    lower=True,
                    split=' '
                )

  # imdb.load_data() prepends a start token (1), shifts every word index by
  # index_from=3 and replaces out-of-vocabulary ids with the unknown token (2).
  index_from, start_id, oov_id = 3, 1, 2
  encoded_sample = [start_id]
  for word in list_of_words:
    rank = word_to_index.get(word)
    # Words missing from the index become the unknown token instead of
    # raising KeyError.
    token_id = oov_id if rank is None else rank + index_from
    # Ids at or above the training vocabulary size are outside the embedding
    # layer's input range, so map them to the unknown token as well.
    if token_id >= TRAIN_VOCABULARY:
      token_id = oov_id
    encoded_sample.append(token_id)

  # Pad to the length used for training so inference input matches the
  # training input; this also yields the (1, max_time_steps) batch shape.
  x_pred = tf.keras.preprocessing.sequence.pad_sequences(
      [encoded_sample], maxlen=max_time_steps)

  predictions = model.predict(x_pred)

  return predictions

In [None]:
# Run the model on a hand-written review and print the raw prediction
# (the output of the model's sigmoid unit).
sample_pred_text = ('The movie was cool. The animation and the graphics '
                    'were out of this world. I would recommend this movie.')
predictions = sample_predict(sample_pred_text)
print(predictions)