<a href="https://colab.research.google.com/github/RogerHeederer/DeepLearning_Heo/blob/master/RNN_SentenceClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [35]:
from IPython.display import Image

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, SimpleRNN
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd

In [3]:
tf.random.set_seed(1)
np.random.seed(1)

In [4]:
movie_reviews = [
         {'review': 'this is the best movie', 'sentiment': 'positive'},
         {'review': 'i recommend you watch this movie', 'sentiment': 'positive'},
         {'review': 'it was waste of money and time', 'sentiment': 'negative'},
         {'review': 'the worst movie ever', 'sentiment': 'negative'}
    ]
df = pd.DataFrame(movie_reviews)

In [5]:
df

Unnamed: 0,review,sentiment
0,this is the best movie,positive
1,i recommend you watch this movie,positive
2,it was waste of money and time,negative
3,the worst movie ever,negative


In [6]:
def get_vocab2int(df):
  d={}
  vocab = set()
  df['review'].str.split().apply(vocab.update)
  for idx, word in enumerate(vocab):
    d[word] = idx
  return d

vocab2_int = get_vocab2int(df)
vocab_size = len(vocab2_int)

In [25]:
vocab2_int, vocab_size

({'and': 10,
  'best': 5,
  'ever': 0,
  'i': 16,
  'is': 14,
  'it': 15,
  'money': 2,
  'movie': 13,
  'of': 1,
  'recommend': 9,
  'the': 8,
  'this': 7,
  'time': 3,
  'was': 17,
  'waste': 12,
  'watch': 6,
  'worst': 4,
  'you': 11},
 18)

In [9]:
reviews = df['review'].tolist()

In [10]:
reviews

['this is the best movie',
 'i recommend you watch this movie',
 'it was waste of money and time',
 'the worst movie ever']

In [11]:
encoded_reviews = []
for review in reviews:
  tokens = review.split(" ")
  review_encoding = []
  for token in tokens:
    review_encoding.append(vocab2_int[token])
  encoded_reviews.append(review_encoding)

In [12]:
encoded_reviews

[[7, 14, 8, 5, 13],
 [16, 9, 11, 6, 7, 13],
 [15, 17, 12, 1, 2, 10, 3],
 [8, 4, 13, 0]]

In [14]:
def get_max_length(df):
  max_length = 0
  for row in df['review']:
    if len(row.split(" ")) > max_length:
      max_length = len(row.split(" "))
  return max_length

max_length = get_max_length(df)

In [15]:
max_length

7

In [16]:
# max_length 길이에 맞게, 리뷰가 짧은 array에는 0으로 패딩해서 길이 맞춰준다
padded_reviews_encoding = pad_sequences(encoded_reviews, maxlen=max_length, padding='post')

In [32]:
padded_reviews_encoding

array([[ 7, 14,  8,  5, 13,  0,  0],
       [16,  9, 11,  6,  7, 13,  0],
       [15, 17, 12,  1,  2, 10,  3],
       [ 8,  4, 13,  0,  0,  0,  0]], dtype=int32)

In [21]:
sentiments = df['sentiment'].tolist()
sentiments

['positive', 'positive', 'negative', 'negative']

In [22]:
def sentiment_encode(sentiment):
  if sentiment == 'positive':
    return [1,0]
  else:
    return [0,1]

encoded_sentiment = [sentiment_encode(sentiment) for sentiment in sentiments]

In [24]:
encoded_sentiment

[[1, 0], [1, 0], [0, 1], [0, 1]]

In [36]:
model = Sequential()
#vocab_size = 18, input_length = 7
model.add(Embedding(vocab_size, 3, input_length=max_length))
model.add(SimpleRNN(32))
model.add(Dense(2, activation='softmax'))

In [37]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [38]:
train_X = np.array(padded_reviews_encoding)
train_Y = np.array(encoded_sentiment)

In [39]:
train_X

array([[ 7, 14,  8,  5, 13,  0,  0],
       [16,  9, 11,  6,  7, 13,  0],
       [15, 17, 12,  1,  2, 10,  3],
       [ 8,  4, 13,  0,  0,  0,  0]], dtype=int32)

In [40]:
train_Y

array([[1, 0],
       [1, 0],
       [0, 1],
       [0, 1]])

In [41]:
print('Train...')
model.fit(train_X, train_Y,epochs=50)

Train...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f320eaacef0>

In [42]:
score, acc = model.evaluate(train_X, train_Y, verbose=2)
print('Train score:', score)
print('Train accuracy:', acc)

1/1 - 0s - loss: 0.0103 - accuracy: 1.0000
Train score: 0.010290705598890781
Train accuracy: 1.0
