In [4]:
import tensorflow as tf
import os
import numpy as np
from tensorflow import keras
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras_preprocessing.sequence import pad_sequences
from keras.utils import pad_sequences


VOCAB_SIZE = 88584
MAXLEN = 250
BATCH_SIZE = 64

(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words = VOCAB_SIZE)

In [5]:
train_data = keras.utils.pad_sequences(train_data, MAXLEN) # Apabila ada review yang lebih dari 250 kata, maka kata yang berlebih akan dihapus 
test_data = keras.utils.pad_sequences(test_data, MAXLEN) # Apabila ada review yang kurang dari 250 kata, tambahkan 0 hingga sampai 250
train_data[9]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     1,    14,    20,    47,   111,   439,
        3445,    19,

**Membuat Model**

In [6]:
model = tf.keras.Sequential(
    [
        tf.keras.layers.Embedding(VOCAB_SIZE, 32),
        tf.keras.layers.LSTM(32),
        tf.keras.layers.Dense(1, activation = 'sigmoid')
    ]
)

In [7]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          2834688   
                                                                 
 lstm (LSTM)                 (None, 32)                8320      
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 2,843,041
Trainable params: 2,843,041
Non-trainable params: 0
_________________________________________________________________


**Training**

In [8]:
model.compile(loss="binary_crossentropy", optimizer = "rmsprop", metrics = ['acc']) # Binary Cross Entropy digunakan untuk melihat output kita sejauh mana apakah 0 atau 1
history = model.fit(train_data, train_labels, epochs = 10, validation_split=0.2) # Validation split berarti kita akan menggunakan 20% data saja 

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [9]:
hasil = model.evaluate(test_data, test_labels) # 80%
print(hasil)

[0.7779424786567688, 0.8042799830436707]


**Prediksi**

In [11]:
word_index = imdb.get_word_index()

def encode_text(kata):
  tokens = keras.preprocessing.text.text_to_word_sequence(kata)
  tokens = [word_index[word] if word in word_index else 0 for word in tokens]
  return keras.utils.pad_sequences([tokens], MAXLEN)[0]

kata = "that movie was just amazing, so amazing"
encoded = encode_text(kata)
print(encoded)

[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0  12  17  13  4

In [16]:
# mengubah angka menjadi kalimat sesuai inputan diatas

reverse = {value: key for (key,value) in word_index.items()}

def decode_integers(integers):
  pad = 0 
  kata = ""
  for num in integers:
    if num != pad:
      kata += reverse[num] + " "
  return kata[:-1]

print(decode_integers(encoded))

that movie was just amazing so amazing


In [17]:
def predict(kata):
  encoded_text = encode_text(kata)
  pred =np.zeros((1,250))
  pred[0] = encoded_text
  hasil = model.predict(pred)
  print(hasil[0])

review_positif = "that movie was awesome, i enjoyed it very much, really recommended"
predict(review_positif)

review_negatif = "that movie was terrible, i wasted my time watching it, should have just watched something else"
predict(review_negatif)

[0.7454071]
[0.5135381]
