In [None]:
import nltk
# Fetch the NLTK 'stopwords' corpus so that stopwords.words('english') can be
# read later in the notebook. Needs network access the first time it runs.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import pandas as pd    # to load dataset
import numpy as np     # for mathematic equation
from nltk.corpus import stopwords   # to get collection of stopwords
from sklearn.model_selection import train_test_split       # for splitting dataset
from tensorflow.keras.preprocessing.text import Tokenizer  # to encode text to int
from tensorflow.keras.preprocessing.sequence import pad_sequences   # to do padding or truncating
from tensorflow.keras.models import Sequential     # the model
from tensorflow.keras.layers import Embedding, LSTM, Dense,Dropout,Conv1D,MaxPooling1D # layers of the architecture
from tensorflow.keras.callbacks import ModelCheckpoint   # save model
from tensorflow.keras.models import load_model   # load saved model
import re

In [None]:

# English stop words from the NLTK corpus; load_dataset() filters review tokens against this collection.
english_stops = stopwords.words('english')

In [None]:
def load_dataset(path='IMDB Dataset.csv', stop_words=None):
    """Load the review CSV and return tokenised reviews with binary labels.

    Cleaning steps, in order:
      1. HTML tags are replaced by a space (not an empty string) so that the
         words on either side of a tag such as ``<br />`` are not fused.
      2. Every non-alphabetic character is replaced by a space.
      3. The text is lower-cased *before* stop-word filtering, so capitalised
         stop words ("I", "The", "This", ...) are removed as well.
      4. The text is split on whitespace and stop words are dropped.

    Parameters
    ----------
    path : str, optional
        CSV file with a ``review`` and a ``sentiment`` column.
    stop_words : iterable of str, optional
        Lower-case words to drop. When omitted, the notebook-level
        ``english_stops`` collection is used.

    Returns
    -------
    x_data : pandas.Series
        One list of lower-case tokens per review.
    y_data : pandas.Series
        Integer labels: 1 for 'positive', 0 for 'negative'.

    Raises
    ------
    ValueError
        If the ``sentiment`` column holds a value other than 'positive' or
        'negative'.
    """
    if stop_words is None:
        stop_words = english_stops
    # A set gives constant-time membership tests; the filter below runs once
    # per token of every review.
    stop_words = set(stop_words)

    df = pd.read_csv(path)

    # PRE-PROCESS REVIEW
    x_data = (
        df['review']                                        # Reviews/Input
        .str.replace(r'<.*?>', ' ', regex=True)             # remove html tag
        .str.replace(r'[^A-Za-z]', ' ', regex=True)         # remove non alphabet
        .str.lower()                                        # lower case
        .apply(lambda review: [w for w in review.split() if w not in stop_words])
    )

    # ENCODE SENTIMENT -> 0 & 1
    y_data = df['sentiment'].map({'positive': 1, 'negative': 0})   # Sentiment/Output
    if y_data.isna().any():
        raise ValueError("sentiment column must contain only 'positive' or 'negative'")
    y_data = y_data.astype('int64')

    return x_data, y_data

# Run the loading/cleaning step once and keep both outputs at notebook scope.
x_data, y_data = load_dataset()

# Preview the tokenised reviews and their labels (pandas truncates the repr).
print('Reviews')
print(f'{x_data} \n')
print('Sentiment')
print(f'{y_data}')

Reviews
0        [one, reviewers, mentioned, watching, oz, epis...
1        [a, wonderful, little, production, the, filmin...
2        [i, thought, wonderful, way, spend, time, hot,...
3        [basically, family, little, boy, jake, thinks,...
4        [petter, mattei, love, time, money, visually, ...
                               ...                        
49995    [i, thought, movie, right, good, job, it, crea...
49996    [bad, plot, bad, dialogue, bad, acting, idioti...
49997    [i, catholic, taught, parochial, elementary, s...
49998    [i, going, disagree, previous, comment, side, ...
49999    [no, one, expects, star, trek, movies, high, a...
Name: review, Length: 50000, dtype: object 

Sentiment
0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 50000, dtype: int64


In [None]:
# Hold out 20% of the reviews for testing. A fixed random_state makes the
# split (and therefore every later cell) reproducible on Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    x_data, y_data, test_size = 0.2, random_state = 42
)

# Show inputs and labels of each subset under the matching heading
# (previously both x splits were printed under 'Train Set' and both
# y splits under 'Test Set').
print('Train Set')
print(x_train, '\n')
print(y_train, '\n')
print('Test Set')
print(x_test, '\n')
print(y_test)

Train Set
40961    [strangers, candy, overacts, wrong, context, s...
49012    [this, christopher, guest, movie, rivals, spin...
32108    [i, saw, film, edinburgh, film, festival, woul...
27671    [one, look, rating, ought, tell, movie, voted,...
45688    [i, watched, first, episode, the, war, home, i...
                               ...                        
13670    [such, highly, anticipated, remake, cherished,...
8791     [ok, cons, first, the, obligatory, alligator, ...
35733    [my, yardstick, measuring, movie, watch, abili...
25444    [i, expecting, love, movie, film, noir, serial...
26190    [granny, definitely, one, worst, horror, movie...
Name: review, Length: 40000, dtype: object 

10518    [i, huge, john, denver, fan, i, large, collect...
7914     [this, one, would, term, happy, tale, the, tit...
21671    [between, twentieth, century, fox, made, ton, ...
39727    [surely, best, film, directed, claude, lelouch...
2319     [some, films, manage, survive, almost, origina...
 

In [None]:
def get_max_length(reviews=None):
    """Return the sequence length used for padding/truncating reviews.

    Despite the name, this is the *mean* review length rounded up to the next
    integer, not the maximum.

    Parameters
    ----------
    reviews : iterable of sized items, optional
        The reviews to measure (token lists or encoded sequences). When
        omitted, the notebook-level ``x_train`` is used, which keeps existing
        ``get_max_length()`` calls working.

    Returns
    -------
    int
        ``ceil(mean(len(review)))`` over all reviews.

    Raises
    ------
    ValueError
        If there are no reviews to measure (previously this surfaced as an
        opaque "cannot convert float NaN to integer" error).
    """
    if reviews is None:
        reviews = x_train

    review_length = [len(review) for review in reviews]
    if not review_length:
        raise ValueError('cannot compute a review length from an empty collection')

    return int(np.ceil(np.mean(review_length)))

In [None]:
# Encode reviews as integer sequences and bring them all to one fixed length.
# NOTE: this cell overwrites x_train / x_test in place, so it is not
# idempotent -- re-run the train/test split cell before running it again.
token = Tokenizer(lower=False)    # no need lower, because already lowered the data in load_dataset()
token.fit_on_texts(x_train)       # vocabulary is built from the training reviews only
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

# get_max_length() reads the global x_train, which at this point holds the
# encoded sequences. Despite its name it returns the ceiling of the *mean*
# review length, so longer reviews are truncated below.
max_length = get_max_length()

# Pad with zeros at the end, and cut from the end, so every row has max_length entries.
x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1   # add 1 because of 0 padding

print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum review length: ', max_length)

Encoded X Train
 [[ 4217  1951  8567 ...     0     0     0]
 [    8  1328  2622 ...   399     3   859]
 [    1   122     4 ...     0     0     0]
 ...
 [  219 28483 24779 ...   275   303   364]
 [    1   905    41 ...     0     0     0]
 [ 7897   313     5 ...     0     0     0]] 

Encoded X Test
 [[   1  538  217 ...    0    0    0]
 [   8    5   12 ...  494  240 4126]
 [7482 9570  941 ...  930   40  109]
 ...
 [   1  881 4278 ... 3270  443  752]
 [  78    1   47 ...  693 1617 7707]
 [ 107  348   39 ...    0    0    0]] 

Maximum review length:  130


In [None]:
# Hyper-parameters of the CNN + LSTM classifier.
EMBED_DIM = 32      # size of each word embedding vector
LSTM_OUT = 64       # number of LSTM units
FILTER_SIZE = 3     # Conv1D kernel width (in tokens)
NUM_FILTERS = 64    # Conv1D filters; was declared as 32 but never used -- the
                    # layer hard-coded 64, so 64 keeps the architecture unchanged
POOL_SIZE = 2       # max-pooling window
DROPOUT_RATE = 0.2  # dropout applied after the CNN stage and after the LSTM

model = Sequential()

# Embedding layer: maps each word index to a dense EMBED_DIM vector
model.add(Embedding(total_words, EMBED_DIM, input_length = max_length))

# Add a CNN layer
model.add(Conv1D(filters = NUM_FILTERS, kernel_size = FILTER_SIZE, strides = 1,
                 padding = 'same', activation = 'relu'))
model.add(MaxPooling1D(pool_size = POOL_SIZE))
model.add(Dropout(DROPOUT_RATE))

# Add the LSTM layer
model.add(LSTM(LSTM_OUT))
model.add(Dropout(DROPOUT_RATE))

# Add the output layer: a single sigmoid unit for the positive-class probability
model.add(Dense(1, activation = 'sigmoid'))

# Compile the model
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

# Print the model summary. summary() prints by itself and returns None, so
# wrapping it in print() only appended a stray "None" line.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 130, 32)           2957824   
                                                                 
 conv1d_1 (Conv1D)           (None, 130, 64)           6208      
                                                                 
 max_pooling1d_1 (MaxPoolin  (None, 65, 64)            0         
 g1D)                                                            
                                                                 
 dropout_2 (Dropout)         (None, 65, 64)            0         
                                                                 
 lstm_1 (LSTM)               (None, 64)                33024     
                                                                 
 dropout_3 (Dropout)         (None, 64)                0         
                                                      

In [None]:
# Train for three epochs in mini-batches of 128. The held-out test split is
# passed as validation data, so the per-epoch validation metrics are computed
# on the same rows that are scored in the evaluation cell.
model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    batch_size=128,
    epochs=3,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7f2e48fb2080>

In [None]:
# Probability above which a review is labelled positive. This is stricter
# than the 0.5 cut-off behind Keras' built-in 'accuracy' metric, so the figure
# printed here is not directly comparable to the accuracy logged by fit().
DECISION_THRESHOLD = 0.7

# model.predict returns one probability per review; threshold it to 0/1.
y_pred = (model.predict(x_test) > DECISION_THRESHOLD).astype("int32")

# Count matches with one vectorised comparison instead of a Python loop that
# compared each label against a one-element array.
true = int(np.sum(y_pred.ravel() == np.asarray(y_test)))

print('Correct Prediction: {}'.format(true))
print('Wrong Prediction: {}'.format(len(y_pred) - true))
print('Accuracy: {}'.format(true/len(y_pred)*100))

Correct Prediction: 8769
Wrong Prediction: 1231
Accuracy: 87.69


In [None]:
# Persist the trained model to a single file in the working directory.
model.save('CNN_LSTM_IMDB_T1.keras')

In [None]:
# Read the model back from the file written by model.save() above.
loaded_model = load_model('CNN_LSTM_IMDB_T1.keras')