In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

In [2]:
df = pd.read_csv(r"E:\AI\IMDB Dataset.csv")

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming t...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
train_data = df.sample(int(len(df)*0.75))
test_data = df.sample(int(len(df)*0.25))

In [5]:
review = train_data["review"]
review.head()

27624    Charlotte's deadly beauty and lethal kicks mak...
30209    War Inc. is a funny but strange film. The acto...
15884    I was so surprised by how great The Man In The...
33907    This is quite possibly the worst sequel ever m...
47789    In 1932, Humphrey Bogart was a relative unknow...
Name: review, dtype: object

In [6]:
sentiment = train_data["sentiment"]
sentiment.head()

27624    negative
30209    positive
15884    positive
33907    negative
47789    negative
Name: sentiment, dtype: object

In [7]:
tokens = Tokenizer()

In [8]:
tokens.fit_on_texts(review)

In [9]:
vocab = len(tokens.word_index)+1

In [10]:
vocab

109430

In [11]:
seq = tokens.texts_to_sequences(review)

In [12]:
seq

[[28768,
  2416,
  922,
  2,
  5861,
  3364,
  93,
  1,
  14,
  11287,
  70,
  7782,
  25,
  911,
  1,
  61,
  6,
  33,
  323,
  2,
  28768,
  107,
  1489,
  7,
  138,
  3,
  5738,
  94,
  11,
  21,
  576,
  42,
  63303,
  1,
  270,
  32,
  65,
  191,
  4,
  15,
  10,
  14,
  9,
  640,
  1804,
  96,
  369,
  11,
  31,
  4229,
  270,
  2937,
  17,
  9,
  1164,
  8,
  148,
  13,
  15,
  1,
  371,
  4,
  1,
  63304,
  68,
  487,
  131,
  32,
  324,
  150,
  499,
  51,
  68,
  260,
  1,
  1749,
  141,
  25,
  265,
  19,
  1,
  323,
  227,
  10,
  14,
  1828,
  67,
  4,
  3251,
  521,
  230,
  116,
  8868,
  19361,
  1489,
  7,
  1928,
  94,
  236,
  57,
  65,
  77,
  1,
  60,
  26,
  34,
  65,
  2601,
  1,
  119,
  294,
  1,
  219,
  14,
  9,
  96,
  196,
  10,
  14,
  230,
  455],
 [298,
  12310,
  6,
  3,
  153,
  17,
  680,
  18,
  1,
  149,
  22,
  1506,
  1,
  18,
  6,
  1506,
  79,
  17,
  9,
  88,
  117,
  84,
  5,
  1680,
  1,
  111,
  9,
  75,
  136,
  81,
  1,
  111,
  300,
  19,

In [13]:
max_length=500
max_length

500

In [14]:
pad_seq = sequence.pad_sequences(seq, maxlen=max_length, padding="post")

In [15]:
labels = np.array([1 if i[0]=="p" else 0 for i in sentiment])

In [16]:
labels

array([0, 1, 1, ..., 1, 1, 1])

In [17]:
test_seq = tokens.texts_to_sequences(test_data["review"])
pad_test_seq = sequence.pad_sequences(test_seq, maxlen=max_length, padding="post")

In [18]:
test_labels = np.array([1 if i[0]=="p" else 0 for i in test_data["sentiment"]])

In [19]:
model = Sequential([
    Embedding(vocab, 50, input_length=max_length),
    SimpleRNN(20),
    Dense(1, activation="sigmoid")    
])

In [20]:
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [21]:
model.fit(pad_seq, labels, epochs=10, validation_data=(pad_test_seq, test_labels))

Epoch 1/10
[1m1172/1172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m171s[0m 143ms/step - accuracy: 0.5077 - loss: 0.6934 - val_accuracy: 0.5275 - val_loss: 0.6814
Epoch 2/10
[1m1172/1172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m146s[0m 125ms/step - accuracy: 0.5344 - loss: 0.6829 - val_accuracy: 0.5328 - val_loss: 0.6735
Epoch 3/10
[1m1172/1172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m134s[0m 114ms/step - accuracy: 0.5357 - loss: 0.6657 - val_accuracy: 0.5310 - val_loss: 0.6625
Epoch 4/10
[1m1172/1172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m124s[0m 106ms/step - accuracy: 0.5376 - loss: 0.6513 - val_accuracy: 0.5378 - val_loss: 0.6561
Epoch 5/10
[1m1172/1172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 106ms/step - accuracy: 0.5409 - loss: 0.6536 - val_accuracy: 0.5350 - val_loss: 0.6587
Epoch 6/10
[1m1172/1172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 105ms/step - accuracy: 0.5358 - loss: 0.6441 - val_accuracy: 0.5365 - val_loss:

<keras.src.callbacks.history.History at 0x264c7e9cc50>

In [22]:
new_review = "Kind of drawn in by the erotic scenes, only to realize this was one of the most amateurish and unbelievable bits of film I've ever seen. Sort of like a high school film project. What was Rosanna Arquette thinking?? And what was with all those stock characters in that bizarre supposed Midwest town? Pretty hard to get involved with this one. No lessons to be learned from it, no brilliant insights, just stilted and quite ridiculous (but lots of skin, if that intrigues you) videotaped nonsense....What was with the bisexual relationship, out of nowhere, after all the heterosexual encounters. And what was with that absurd dance, with everybody playing their stereotyped roles? Give this one a pass, it's like a million other miles of bad, wasted film, money that could have been spent on starving children or Aids in Africa....."

In [23]:
new_seq = tokens.texts_to_sequences([new_review])

In [24]:
new_seq

[[242,
  4,
  1438,
  7,
  30,
  1,
  2492,
  134,
  60,
  5,
  952,
  10,
  12,
  26,
  4,
  1,
  87,
  2249,
  2,
  1344,
  1704,
  4,
  18,
  197,
  122,
  106,
  438,
  4,
  36,
  3,
  295,
  358,
  18,
  1116,
  47,
  12,
  12932,
  7522,
  529,
  2,
  47,
  12,
  16,
  28,
  143,
  1942,
  101,
  7,
  11,
  1174,
  442,
  17625,
  517,
  178,
  252,
  5,
  73,
  583,
  16,
  10,
  26,
  53,
  2982,
  5,
  25,
  1856,
  35,
  8,
  53,
  512,
  6910,
  38,
  4580,
  2,
  176,
  648,
  17,
  743,
  4,
  2372,
  42,
  11,
  15726,
  21,
  18965,
  1899,
  47,
  12,
  16,
  1,
  13692,
  632,
  40,
  4,
  1252,
  98,
  28,
  1,
  10323,
  3353,
  2,
  47,
  12,
  16,
  11,
  1928,
  884,
  16,
  1348,
  392,
  64,
  8413,
  577,
  196,
  10,
  26,
  3,
  1334,
  43,
  36,
  3,
  1442,
  76,
  1960,
  4,
  74,
  1016,
  18,
  287,
  11,
  96,
  24,
  77,
  973,
  19,
  9831,
  457,
  37,
  4996,
  7,
  2440]]

In [25]:
new_padded_seq = sequence.pad_sequences(new_seq, maxlen=max_length, padding="post")

In [26]:
new_padded_seq

array([[  242,     4,  1438,     7,    30,     1,  2492,   134,    60,
            5,   952,    10,    12,    26,     4,     1,    87,  2249,
            2,  1344,  1704,     4,    18,   197,   122,   106,   438,
            4,    36,     3,   295,   358,    18,  1116,    47,    12,
        12932,  7522,   529,     2,    47,    12,    16,    28,   143,
         1942,   101,     7,    11,  1174,   442, 17625,   517,   178,
          252,     5,    73,   583,    16,    10,    26,    53,  2982,
            5,    25,  1856,    35,     8,    53,   512,  6910,    38,
         4580,     2,   176,   648,    17,   743,     4,  2372,    42,
           11, 15726,    21, 18965,  1899,    47,    12,    16,     1,
        13692,   632,    40,     4,  1252,    98,    28,     1, 10323,
         3353,     2,    47,    12,    16,    11,  1928,   884,    16,
         1348,   392,    64,  8413,   577,   196,    10,    26,     3,
         1334,    43,    36,     3,  1442,    76,  1960,     4,    74,
      

In [27]:
prob = model.predict(new_padded_seq)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 183ms/step


In [28]:
if prob>=0.5:
    print("Positive")
else:
    print("Negative")

Negative


In [29]:
def data_gen(text):
    a = tokens.texts_to_sequences([text])
    pad_a = sequence.pad_sequences(a, maxlen=max_length, padding="post")
    return pad_a

In [30]:
t = "My guess would be this was originally going to be at least two parts, and thus at least a quarter longer, because otherwise how can one explain its confused, abbreviated storyline. I was never completely lost, but I was often partially lost and usually unclear on character motivation. The movie feels as though joining plot points were dropped to squeeze it into its time slot.  If it were longer, it might make more sense, but it still wouldn't be much good. The movie's most interesting idea is of the war between Zeus and Hera as being a war between the male and female, but the movie drops the ball on this, making Hera's followers fairly horrible while not being clear on what Zeus' followers do or believe. The movie is also interesting because you don't see the gods and there's no real certainty that they exist. So it's got a couple of intriguing ideas, but it doesn't do anything useful with them.  Bad dialog, cardboard characters, and one interesting scene involving Hercules and his three antagonistic sons. Not unwatchable but also not worth watching."

In [31]:
dummy = data_gen(t)

In [32]:
dummy

array([[   54,   463,    58,    25,    10,    12,  1842,   163,     5,
           25,    29,   221,   104,   514,     2,  1397,    29,   221,
            3,  6483,  1173,    83,   911,    84,    66,    26,  1225,
           89,  1447, 24861,   753,     9,    12,   112,   330,   423,
           17,     9,    12,   404,  5713,   423,     2,   647,  7203,
           19,   107,  3651,     1,    14,   782,    13,   151,  7271,
          111,   818,    69,  3480,     5,  8935,     8,    81,    89,
           56,  9723,    42,     8,    69,  1173,     8,   229,    93,
           49,   281,    17,     8,   128,   580,    25,    71,    48,
            1,  1418,    87,   216,   320,     6,     4,     1,   298,
          199, 26101,     2, 20661,    13,   108,     3,   298,   199,
            1,   930,     2,   677,    17,     1,    14,  4322,     1,
         1936,    19,    10,   232, 82485,  7819,   944,   480,   135,
           20,   108,   756,    19,    47, 82486,  7819,    78,    37,
      

In [33]:
def predict(var):
    prob = model.predict(new_padded_seq)
    if prob>=0.5:
        return "Positive"
    else:
        return "Negative"

In [34]:
predict(dummy)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step


'Negative'

In [35]:
predict(data_gen("the movie was great"))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step


'Negative'

In [36]:
t = "If anyone is wondering why no one makes movies like they used to, with conversation, character and a simple theme of friendship struggling to evolve into something new, better and different, those folks need to take in this film and see top notch writing, directing, and acting that melds into a wonderful evening of observation on how things used to be in Italy and England. Other days, other times funneled into a terrific comedy of entertainment, made in 1992 with Alfred Molina, Joan Plowright, Polly Walker, Josie Lawrence, Jim Broadbent, Miranda Richardson, and Michael Kitchens in the major roles. Under the brush stroke direction of Mike Newell, these actors accomplish vividly memorable performances that are photographed with a sublimely subtle painter's eye. Reminiscent of the theatrical bedroom farce of the turn of the century, this film might be called a friendship farce that becomes a worthwhile experience in the growth of the romantic nature within each character, and the viewer, too. An artistic telegram on the importance of caring about those around us."

In [37]:
predict(data_gen(t))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step


'Negative'

In [38]:
# Try to create original max length 
# add more RNN neuron (50) and Embedding(200)
# epoch increase (20)

In [39]:
max_length=len(max(seq, key=len))
max_length

2246

In [40]:
model = Sequential([
    Embedding(vocab, 200, input_length=max_length),
    SimpleRNN(50),
    Dense(1, activation="sigmoid")
])

In [41]:
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [42]:
model.fit(pad_seq, labels, epochs=20, validation_data=(pad_test_seq, test_labels))

Epoch 1/20
[1m1172/1172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m300s[0m 255ms/step - accuracy: 0.5017 - loss: 0.6950 - val_accuracy: 0.5259 - val_loss: 0.6864
Epoch 2/20
[1m1172/1172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m315s[0m 269ms/step - accuracy: 0.5197 - loss: 0.6866 - val_accuracy: 0.5280 - val_loss: 0.6733
Epoch 3/20
[1m1172/1172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m314s[0m 268ms/step - accuracy: 0.5329 - loss: 0.6681 - val_accuracy: 0.5326 - val_loss: 0.6613
Epoch 4/20
[1m1172/1172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m313s[0m 267ms/step - accuracy: 0.5412 - loss: 0.6520 - val_accuracy: 0.5358 - val_loss: 0.6585
Epoch 5/20
[1m1172/1172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m313s[0m 267ms/step - accuracy: 0.5361 - loss: 0.6450 - val_accuracy: 0.5283 - val_loss: 0.6642
Epoch 6/20
[1m1172/1172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m315s[0m 269ms/step - accuracy: 0.5358 - loss: 0.6523 - val_accuracy: 0.5346 - val_loss:

<keras.src.callbacks.history.History at 0x264cc896450>

In [43]:
predict(data_gen(t))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 280ms/step


'Positive'