In [1]:
from IPython.display import HTML, display

def set_css():
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
get_ipython().events.register('pre_run_cell', set_css)

In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import pandas as pd    # to load dataset
import numpy as np     # for mathematic equation
from nltk.corpus import stopwords   # to get collection of stopwords
from sklearn.model_selection import train_test_split       # for splitting dataset
from tensorflow.keras.preprocessing.text import Tokenizer  # to encode text to int
from tensorflow.keras.preprocessing.sequence import pad_sequences   # to do padding or truncating
from tensorflow.keras.models import Sequential     # the model
from tensorflow.keras.layers import Embedding, LSTM, Dense,Dropout,Conv1D,MaxPooling1D # layers of the architecture
from tensorflow.keras.callbacks import ModelCheckpoint   # save model
from tensorflow.keras.models import load_model   # load saved model
import re

In [None]:

english_stops = stopwords.words('english')

In [None]:
def load_dataset():
    df = pd.read_csv('/IMDB Dataset.csv')
    x_data = df['review']       # Reviews/Input
    y_data = df['sentiment']    # Sentiment/Output

    # PRE-PROCESS REVIEW
    x_data = x_data.replace({'<.*?>': ''}, regex = True)          # remove html tag
    x_data = x_data.replace({'[^A-Za-z]': ' '}, regex = True)     # remove non alphabet
    x_data = x_data.apply(lambda review: [w for w in review.split() if w not in english_stops])  # remove stop words and spliting
    x_data = x_data.apply(lambda review: [w.lower() for w in review])   # lower case

    # ENCODE SENTIMENT -> 0 & 1
    y_data = y_data.replace('positive', 1)
    y_data = y_data.replace('negative', 0)

    return x_data, y_data

x_data, y_data = load_dataset()

print('Reviews')
print(x_data, '\n')
print('Sentiment')
print(y_data)

Reviews
0        [one, reviewers, mentioned, watching, oz, epis...
1        [a, wonderful, little, production, the, filmin...
2        [i, thought, wonderful, way, spend, time, hot,...
3        [basically, family, little, boy, jake, thinks,...
4        [petter, mattei, love, time, money, visually, ...
                               ...                        
49995    [i, thought, movie, right, good, job, it, crea...
49996    [bad, plot, bad, dialogue, bad, acting, idioti...
49997    [i, catholic, taught, parochial, elementary, s...
49998    [i, going, disagree, previous, comment, side, ...
49999    [no, one, expects, star, trek, movies, high, a...
Name: review, Length: 50000, dtype: object 

Sentiment
0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 50000, dtype: int64


In [None]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.2)

print('Train Set')
print(x_train, '\n')
print(x_test, '\n')
print('Test Set')
print(y_train, '\n')
print(y_test)

Train Set
41810    [i, huge, fan, original, assault, on, precinct...
22559    [george, c, scott, gives, finest, funniest, wo...
25110    [excellent, movie, big, media, firm, goings, c...
31438    [i, saw, film, sneak, preview, delightful, the...
45541    [what, indonesian, musical, movies, never, i, ...
                               ...                        
47974    [ahem, i, think, i, one, saying, yes, i, lil, ...
1814     [what, fun, bucketfuls, good, humor, terrific,...
4346     [like, i, said, hidden, surprise, it, well, wr...
49645    [i, pleasantly, surprised, find, movie, showin...
23933    [i, never, heard, aussie, horror, prior, micha...
Name: review, Length: 40000, dtype: object 

31072                          [what, script, story, mess]
11644    [it, best, movie, acting, i, ever, seen, all, ...
2707     [in, order, describe, seriously, wrong, movie,...
35758    [this, movie, deviated, bible, fell, bar, movi...
12947    [given, christopher, nolan, string, successful...
 

In [None]:
def get_max_length():
    review_length = []
    for review in x_train:
        review_length.append(len(review))

    return int(np.ceil(np.mean(review_length)))

In [None]:
token = Tokenizer(lower=False)    # no need lower, because already lowered the data in load_data()
token.fit_on_texts(x_train)
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

max_length = get_max_length()

x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1   # add 1 because of 0 padding

print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum review length: ', max_length)

Encoded X Train
 [[   1  554  233 ...    0    0    0]
 [ 644  898  971 ...    0    0    0]
 [ 225    3   96 ...    0    0    0]
 ...
 [   6    1  208 ...    0    0    0]
 [   1 3662  676 ...    0    0    0]
 [   1   42  461 ...  170  591 2068]] 

Encoded X Test
 [[  106   135    14 ...     0     0     0]
 [    7    46     3 ...     0     0     0]
 [   49   531  1655 ...  5183  2480   149]
 ...
 [35233   147   460 ...     0     0     0]
 [  144     5   316 ...     0     0     0]
 [  960   960     1 ...  3478  4722     3]] 

Maximum review length:  130


In [None]:
EMBED_DIM = 32
LSTM_OUT = 64
model = Sequential()
model.add(Embedding(total_words, EMBED_DIM, input_length = max_length))
model.add(LSTM(LSTM_OUT))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 130, 32)           2957216   
                                                                 
 lstm (LSTM)                 (None, 64)                24832     
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 2982113 (11.38 MB)
Trainable params: 2982113 (11.38 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [None]:
model.fit(x_train, y_train,validation_data=(x_test, y_test), batch_size = 128, epochs = 2)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7982186a3760>

In [None]:
y_pred =  (model.predict(x_test) > 0.7).astype("int32")

true = 0
for i, y in enumerate(y_test):
    if y == y_pred[i]:
        true += 1

print('Correct Prediction: {}'.format(true))
print('Wrong Prediction: {}'.format(len(y_pred) - true))
print('Accuracy: {}'.format(true/len(y_pred)*100))

Correct Prediction: 8557
Wrong Prediction: 1443
Accuracy: 85.57000000000001


In [None]:
model.save('LSTM_IMDB_T1.keras')
loaded_model = load_model('LSTM_IMDB_T1.keras')


In [None]:
loaded_model = load_model('LSTM_IMDB_T1.keras')


In [2]:
review = "Ray's \"Pather Panchali,\" the first of his unforgettable \"Apu Trilogy,\" is a remarkable film experience. The acting is strong, the direction and script, sure, and the total work, eloquent and moving. A film which one can return to again and again, and each time one can discover new elements. This is a staple of my video library, along with Ray's other two films which complete the trio, \"Aparajito\" and \"The World of Apu.\" I have watched the trilogy in a continuous sitting on two occasions, and the experience was emotionally overwhelming."
print("Pather Panchali review:")
print(review)

Pather Panchali review:
Ray's "Pather Panchali," the first of his unforgettable "Apu Trilogy," is a remarkable film experience. The acting is strong, the direction and script, sure, and the total work, eloquent and moving. A film which one can return to again and again, and each time one can discover new elements. This is a staple of my video library, along with Ray's other two films which complete the trio, "Aparajito" and "The World of Apu." I have watched the trilogy in a continuous sitting on two occasions, and the experience was emotionally overwhelming.


In [None]:
review = "Honestly I don't really understand how this movie could get a place in the top 250 movies list. I was cringing the whole time which as expected from an Indian movie was not a short time to be cringing about. The alien part was really stupid and not really important at all actually in the movie as it focused more on religion which I don't mind but the questions asked by PK were mostly oversimplified. The jokes were uncomfortably lame and the acting just plain bad. Otherwise the production isn't bad at all as I expected and the songs are good enough. I mostly like Indian movies for I'm learning about their culture more and understand it better but really that only doesn't justify its place for me in the top list."
print("PK movie review:")
print(review)

PK movie review:
Honestly I don't really understand how this movie could get a place in the top 250 movies list. I was cringing the whole time which as expected from an Indian movie was not a short time to be cringing about. The alien part was really stupid and not really important at all actually in the movie as it focused more on religion which I don't mind but the questions asked by PK were mostly oversimplified. The jokes were uncomfortably lame and the acting just plain bad. Otherwise the production isn't bad at all as I expected and the songs are good enough. I mostly like Indian movies for I'm learning about their culture more and understand it better but really that only doesn't justify its place for me in the top list.


In [None]:
print("hello")

hello


In [None]:
regex = re.compile(r'[^a-zA-Z\s]')
review = regex.sub('', review)
print('Cleaned: ', review)

words = review.split(' ')
filtered = [w for w in words if w not in english_stops]
filtered = ' '.join(filtered)
filtered = [filtered.lower()]

print('Filtered: ', filtered)

Cleaned:  Honestly I dont really understand how this movie could get a place in the top  movies list I was cringing the whole time which as expected from an Indian movie was not a short time to be cringing about The alien part was really stupid and not really important at all actually in the movie as it focused more on religion which I dont mind but the questions asked by PK were mostly oversimplified The jokes were uncomfortably lame and the acting just plain bad Otherwise the production isnt bad at all as I expected and the songs are good enough I mostly like Indian movies for Im learning about their culture more and understand it better but really that only doesnt justify its place for me in the top list
Filtered:  ['honestly i dont really understand movie could get place top  movies list i cringing whole time expected indian movie short time cringing the alien part really stupid really important actually movie focused religion i dont mind questions asked pk mostly oversimplified th

In [None]:
tokenize_words = token.texts_to_sequences(filtered)
tokenize_words = pad_sequences(tokenize_words, maxlen=max_length, padding='post', truncating='post')
print(tokenize_words)

[[ 1121     1  5140    13   291     3    27    19   171   243    28   925
      1  9653   128    10   758  1296     3   240    10  9653     2  1048
     83    13   274    13   559    73     3  2604  2086     1  5140   230
   1086  1580 64334   566     2   509 14924   771    43   924    18   803
    260 13206    18     1   758   622     9   104     1   566     6  1296
     28  4350  2739  1083   291    53    13 15673  4250   171     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0]]


In [None]:
d

In [None]:
result = loaded_model.predict(tokenize_words)
print(result)

[[0.03413663]]


In [None]:
if result >= 0.7:
    print('positive')
else:
    print('negative')

negative
