In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import re
# !pip install nltk
import nltk
nltk.download('stopwords')
from tensorflow.keras.preprocessing.text import Tokenizer  # to encode text to int
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import TextVectorization
from nltk.corpus import stopwords 

from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers import LSTM,Dense,Activation,Bidirectional

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sanja\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:

english_stops = set(stopwords.words('english'))
def load_dataset():
    df = pd.read_csv('D://Notebooks//Datasets//IMDB Dataset.csv//IMDB Dataset.csv')
    x_data = df['review']       # Reviews/Input
    y_data = df['sentiment']    # Sentiment/Output

    # PRE-PROCESS REVIEW
    x_data = x_data.replace({'<.*?>': ''}, regex = True)          # remove html tag
    x_data = x_data.replace({'[^A-Za-z]': ' '}, regex = True)     # remove non alphabet
    x_data = x_data.apply(lambda review: [w for w in review.split() if w not in english_stops])  # remove stop words
    x_data = x_data.apply(lambda review: [w.lower() for w in review])   # lower case
    
    # ENCODE SENTIMENT -> 0 & 1
    y_data = y_data.replace('positive', 1)
    y_data = y_data.replace('negative', 0)

    return x_data, y_data

x_data, y_data = load_dataset()

No need to down or oversample

In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(x_data, y_data, test_size = 0.02, random_state = 40)

In [4]:
X_test.shape

(1000,)

In [4]:
def get_max_length():
    review_length = []
    for review in X_train:
        review_length.append(len(review))

    return int(np.ceil(np.mean(review_length)))
get_max_length()

130

In [6]:
# ENCODE REVIEW
token = Tokenizer(lower=False)    # no need lower, because already lowered the data in load_data()
token.fit_on_texts(X_train)
x_train = token.texts_to_sequences(X_train)
x_test = token.texts_to_sequences(X_test)
# x_test

In [7]:
max_length = get_max_length()

x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1   # add 1 because of 0 padding

print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum review length: ', max_length)

Encoded X Train
 [[   16  1386     1 ...     1   285   123]
 [  116   706    21 ...     0     0     0]
 [    1    38     3 ...     0     0     0]
 ...
 [   58     6    61 ...     0     0     0]
 [22099 18075   177 ...     0     0     0]
 [  109   104    19 ...     0     0     0]] 

Encoded X Test
 [[   1  335  298 ... 1903  357   10]
 [   2   41   89 ...    0    0    0]
 [ 884   97  902 ...    0    0    0]
 ...
 [   9  121  815 ...    0    0    0]
 [  23    1   90 ...    0    0    0]
 [   8  633    5 ...    0    0    0]] 

Maximum review length:  130


In [8]:
model = Sequential()
model.add(Embedding(total_words, 32, input_length = max_length))
model.add(LSTM(64))
model.add(Dense(1, activation='sigmoid'))

In [9]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 130, 32)           3215808   
_________________________________________________________________
lstm (LSTM)                  (None, 64)                24832     
_________________________________________________________________
dense (Dense)                (None, 1)                 65        
Total params: 3,240,705
Trainable params: 3,240,705
Non-trainable params: 0
_________________________________________________________________


In [10]:
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [11]:
from tensorflow.keras.callbacks import ModelCheckpoint
model.fit(x_train, Y_train, batch_size = 128, epochs = 5, callbacks=[ModelCheckpoint(
    'models/LSTM.h5',
    monitor='accuracy',
    save_best_only=True,
    verbose=1)],
validation_data=(x_test, Y_test))

Epoch 1/5

Epoch 00001: accuracy improved from -inf to 0.77533, saving model to models\LSTM.h5
Epoch 2/5

Epoch 00002: accuracy improved from 0.77533 to 0.92751, saving model to models\LSTM.h5
Epoch 3/5

Epoch 00003: accuracy improved from 0.92751 to 0.96149, saving model to models\LSTM.h5
Epoch 4/5

Epoch 00004: accuracy improved from 0.96149 to 0.97712, saving model to models\LSTM.h5
Epoch 5/5

Epoch 00005: accuracy improved from 0.97712 to 0.98351, saving model to models\LSTM.h5


<keras.callbacks.History at 0x21d3936fbb0>

In [22]:
review = str(input('Movie Review: '))
regex = re.compile(r'[^a-zA-Z\s]')
review = regex.sub('', review)
words = review.split(' ')
filtered = [w for w in words if w not in english_stops]
filtered = ' '.join(filtered)
filtered = [filtered.lower()]

print('Filtered: ', filtered)

Movie Review:  the movie has the worst direction and screenplay.....The comedy scenes are a headache


Filtered:  ['movie worst direction screenplaythe comedy scenes headache']


In [23]:
tokenize_words = token.texts_to_sequences(filtered)
tokenize_words = pad_sequences(tokenize_words, maxlen=max_length, padding='post', truncating='post')

In [24]:
def predict_result(n):
    if n >= 0.6:
        return 'positive'
    else:
        return 'negative'

In [25]:
from keras.models import load_model
loaded_model = load_model('models/LSTM.h5')
result = loaded_model.predict(tokenize_words)
print(result)

[[0.00232938]]


In [26]:
predict_result(result)

'negative'