In [5]:
import tensorflow as tf
from tensorflow import keras

from keras.models import Sequential
from keras.layers import Dense, Embedding, GlobalAveragePooling1D, LSTM
from keras.preprocessing.sequence import pad_sequences

import nltk
from nltk import word_tokenize
nltk.download('punkt')

import numpy as np

[nltk_data] Downloading package punkt to C:\Users\Oscar
[nltk_data]     Pang\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
imdb = keras.datasets.imdb

# load IMDB dataset as lists of integers
words = 20000
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=words)

In [7]:
# Cut texts after this number of words 
max_len = 100

# pad the arrays so they all have the same length
train_data = keras.preprocessing.sequence.pad_sequences(train_data, maxlen=max_len)
test_data = keras.preprocessing.sequence.pad_sequences(test_data, maxlen=max_len)

In [9]:
# Create a model
embedding_dimension = 16
model = Sequential()

# The shape is: (samples, vocab_size, embedding_dimension)
model.add(Embedding(words, embedding_dimension, input_length=max_len))

# Add 2 bidirectional LSTMS
model.add(keras.layers.Bidirectional(LSTM(10, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)))
model.add(keras.layers.Bidirectional(LSTM(10, dropout=0.2, recurrent_dropout=0.2)))

# Add a classifier on top
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

# Train the model
model.fit(
    train_data,
    train_labels,
    epochs=5,
    batch_size=50,
    validation_split=0.2
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x18d588211e0>

In [11]:
sentence = "The movie was entertaining enough, but the elements themselves were horrible. Save your money. this was a disappointment and it is not looking good for the MCU."

word2index = imdb.get_word_index()

# tokenize the sentence
tokened = word_tokenize(sentence)
test = []

# for each word in the sentence, get the corresponding index
for word in tokened:
    if (word not in word2index):
      continue
    if (word2index[word] > words):
      continue
    test.append(word2index[word] + 3)

# pad sequences taking consideration of the max length
test = keras.preprocessing.sequence.pad_sequences([test], maxlen=max_len) 

[[0.05290915]]


In [None]:
if (model.predict(test) > 0.5):
    print("Positive")
else:
    print("Negative")