In [1]:
import pandas as pd    # to load dataset
import numpy as np     # for mathematic equation
from nltk.corpus import stopwords   # to get collection of stopwords
from sklearn.model_selection import train_test_split       # for splitting dataset
from tensorflow.keras.preprocessing.text import Tokenizer  # to encode text to int
from tensorflow.keras.preprocessing.sequence import pad_sequences   # to do padding or truncating
from tensorflow.keras.models import Sequential     # the model
from tensorflow.keras.layers import Embedding, LSTM, Dense # layers of the architecture
from tensorflow.keras.callbacks import ModelCheckpoint   # save model
from tensorflow.keras.models import load_model   # load saved model
import re

In [2]:
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/IMDB Dataset of 50K Movie Reviews/IMDB Dataset.csv')

print(data)

                                                  review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative

[50000 rows x 2 columns]


In [3]:
import nltk
nltk.download('stopwords')
english_stops = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
def load_dataset():
    df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/IMDB Dataset of 50K Movie Reviews/IMDB Dataset.csv')
    x_data = df['review']       # Reviews/Input
    y_data = df['sentiment']    # Sentiment/Output

    # PRE-PROCESS REVIEW
    x_data = x_data.replace({'<.*?>': ''}, regex = True)          # remove html tag
    x_data = x_data.replace({'[^A-Za-z]': ' '}, regex = True)     # remove non alphabet
    x_data = x_data.apply(lambda review: [w for w in review.split() if w not in english_stops])  # remove stop words
    x_data = x_data.apply(lambda review: [w.lower() for w in review])   # lower case
    
    # ENCODE SENTIMENT -> 0 & 1
    y_data = y_data.replace('positive', 1)
    y_data = y_data.replace('negative', 0)

    return x_data, y_data

x_data, y_data = load_dataset()

print('Reviews')
print(x_data, '\n')
print('Sentiment')
print(y_data)

Reviews
0        [one, reviewers, mentioned, watching, oz, epis...
1        [a, wonderful, little, production, the, filmin...
2        [i, thought, wonderful, way, spend, time, hot,...
3        [basically, family, little, boy, jake, thinks,...
4        [petter, mattei, love, time, money, visually, ...
                               ...                        
49995    [i, thought, movie, right, good, job, it, crea...
49996    [bad, plot, bad, dialogue, bad, acting, idioti...
49997    [i, catholic, taught, parochial, elementary, s...
49998    [i, going, disagree, previous, comment, side, ...
49999    [no, one, expects, star, trek, movies, high, a...
Name: review, Length: 50000, dtype: object 

Sentiment
0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 50000, dtype: int64


In [5]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.2)

print('Train Set')
print(x_train, '\n')
print(x_test, '\n')
print('Test Set')
print(y_train, '\n')
print(y_test)

Train Set
18444    [i, remember, watching, child, part, children,...
1810     [i, never, seen, comedy, much, chore, sit, thr...
1067     [i, never, laughed, giggled, much, life, the, ...
920      [first, foremost, i, would, like, say, watched...
9935     [robot, jox, great, little, film, ok, sets, ba...
                               ...                        
21831    [based, true, story, fbis, hunt, responsible, ...
32150    [this, delectable, fusion, new, age, babble, l...
18169    [i, understand, many, comments, seem, indicate...
1821     [at, beginning, film, might, double, check, dv...
17024    [the, kind, b, movies, schlocky, yet, much, fu...
Name: review, Length: 40000, dtype: object 

25589    [mediocre, best, slow, probably, entertaining,...
10159    [the, disturbing, thing, film, load, hogwash, ...
24765    [despite, potentially, fascinating, premise, s...
27487    [after, looking, monkeys, oops, apes, one, hou...
41438    [watch, movie, see, shahrukh, khan, say, i, lo...
 

In [6]:
def get_max_length():
    review_length = []
    for review in x_train:
        review_length.append(len(review))

    return int(np.ceil(np.mean(review_length)))

In [7]:
# ENCODE REVIEW
token = Tokenizer(lower=False)    # no need lower, because already lowered the data in load_data()
token.fit_on_texts(x_train)
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

max_length = get_max_length()

x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1   # add 1 because of 0 padding

print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum review length: ', max_length)

Encoded X Train
 [[   1  284   66 ...    0    0    0]
 [   1   42   38 ...    0    0    0]
 [   1   42 1378 ...    0    0    0]
 ...
 [   1  290   37 ...    0    0    0]
 [ 285  359    4 ...    2 9950 4296]
 [   2  147  377 ...    0    0    0]] 

Encoded X Test
 [[ 1491    45   465 ...     0     0     0]
 [    2  1115    65 ...     0     0     0]
 [  363  4383  1319 ...     0     0     0]
 ...
 [    2   785  1453 ...     0     0     0]
 [    1  2748     3 ...     0     0     0]
 [    8     4 14437 ...     0     0     0]] 

Maximum review length:  130


In [11]:
#Run it only for AdamW
!pip install tensorflow-addons

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow-addons
  Downloading tensorflow_addons-0.19.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting typeguard>=2.7
  Downloading typeguard-3.0.1-py3-none-any.whl (30 kB)
Installing collected packages: typeguard, tensorflow-addons
Successfully installed tensorflow-addons-0.19.0 typeguard-3.0.1


In [13]:
#Run it only for AdamW
from tensorflow_addons.optimizers import AdamW

In [16]:
#Run it only for AdamW

from keras.optimizers import Adam
#from keras.utils.generic_utils import get_custom_objects
from keras.utils import get_custom_objects


# Define the AdamW optimizer
class AdamW(Adam):
    def __init__(self, *args, **kwargs):
        super(AdamW, self).__init__(*args, **kwargs)

    def get_updates(self, loss, params):
        lr = self.lr
        beta_1 = self.beta_1
        beta_2 = self.beta_2
        epsilon = self.epsilon
        t = K.cast(self.iterations + 1, K.floatx())
        wd = self.weight_decay
        wd_t = wd * t

        ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        gs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]

        self.weights = [self.iterations] + ms + vs

        for p, g, m, v, gg in zip(params, loss, ms, vs, gs):
            gg_t = gg + g
            m_t = beta_1 * m + (1. - beta_1) * g
            v_t = beta_2 * v + (1. - beta_2) * K.square(g)
            p_t = p - lr * (m_t / (K.sqrt(v_t) + epsilon) + wd_t * p)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            self.updates.append(K.update(p, p_t))
            self.updates.append(K.update(gg, gg_t))

        self.updates.append(K.update(self.iterations, self.iterations + 1))
        return self.updates

# Register the AdamW optimizer with Keras
get_custom_objects().update({'AdamW': AdamW})



In [17]:
# ARCHITECTURE
EMBED_DIM = 32
LSTM_OUT = 64

model = Sequential()
model.add(Embedding(total_words, EMBED_DIM, input_length = max_length))
model.add(LSTM(LSTM_OUT))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer = 'AdamW', loss = 'binary_crossentropy', metrics = ['accuracy'])

print(model.summary())

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 130, 32)           2957600   
                                                                 
 lstm_3 (LSTM)               (None, 64)                24832     
                                                                 
 dense_3 (Dense)             (None, 1)                 65        
                                                                 
Total params: 2,982,497
Trainable params: 2,982,497
Non-trainable params: 0
_________________________________________________________________
None


In [18]:
checkpoint = ModelCheckpoint(
    'models/LSTM.h5',
    monitor='accuracy',
    save_best_only=True,
    verbose=1
)

In [19]:
import time

start_time = time.time()

model.fit(x_train, y_train, batch_size = 128, epochs = 5, callbacks=[checkpoint])

end_time = time.time()
total_time = end_time - start_time
print("Total training time:", total_time, "seconds")

Epoch 1/5
Epoch 1: accuracy improved from -inf to 0.75533, saving model to models/LSTM.h5
Epoch 2/5
Epoch 2: accuracy improved from 0.75533 to 0.92567, saving model to models/LSTM.h5
Epoch 3/5
Epoch 3: accuracy improved from 0.92567 to 0.96293, saving model to models/LSTM.h5
Epoch 4/5
Epoch 4: accuracy improved from 0.96293 to 0.97613, saving model to models/LSTM.h5
Epoch 5/5
Epoch 5: accuracy improved from 0.97613 to 0.98663, saving model to models/LSTM.h5
Total training time: 444.78208470344543 seconds


In [20]:
# Get predicted probabilities from the model
y_pred = model.predict(x_test, batch_size=128)

# Set a custom threshold
threshold = 0.3

# Convert probabilities to class labels based on threshold
y_pred_classes = (y_pred > threshold).astype(int)

# Calculate accuracy
true = 0
for i, y in enumerate(y_test):
    if y == y_pred_classes[i]:
        true += 1

print('Correct Prediction: {}'.format(true))
print('Wrong Prediction: {}'.format(len(y_pred_classes) - true))
print('Accuracy: {}'.format(true/len(y_pred_classes)*100))


Correct Prediction: 8682
Wrong Prediction: 1318
Accuracy: 86.82


In [21]:
loaded_model = load_model('models/LSTM.h5')

In [22]:
review = str(input('Movie Review: '))

Movie Review: Nothing was typical about this. Everything was beautifully done in this movie, the story, the flow, the scenario, everything. I highly recommend it for mystery lovers, for anyone who wants to watch a good movie!


In [23]:
# Pre-process input
regex = re.compile(r'[^a-zA-Z\s]')
review = regex.sub('', review)
print('Cleaned: ', review)

words = review.split(' ')
filtered = [w for w in words if w not in english_stops]
filtered = ' '.join(filtered)
filtered = [filtered.lower()]

print('Filtered: ', filtered)

Cleaned:  Nothing was typical about this Everything was beautifully done in this movie the story the flow the scenario everything I highly recommend it for mystery lovers for anyone who wants to watch a good movie
Filtered:  ['nothing typical everything beautifully done movie story flow scenario everything i highly recommend mystery lovers anyone wants watch good movie']


In [24]:
tokenize_words = token.texts_to_sequences(filtered)
tokenize_words = pad_sequences(tokenize_words, maxlen=max_length, padding='post', truncating='post')
print(tokenize_words)

[[  78  696  173 1223  128    3   13 2725 2622  173    1  450  282  678
  1695  152  399   34    9    3    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0]]


In [25]:
result = loaded_model.predict(tokenize_words)
print(result)

[[0.99453264]]


In [26]:
if result >= 0.7:
    print('positive')
else:
    print('negative')

positive
