In [None]:
import pandas as pd    # to load dataset
import numpy as np     # for mathematic equation
from nltk.corpus import stopwords   # to get collection of stopwords
from sklearn.model_selection import train_test_split       # for splitting dataset
from tensorflow.keras.preprocessing.text import Tokenizer  # to encode text to int
from tensorflow.keras.preprocessing.sequence import pad_sequences   # to do padding or truncating
from tensorflow.keras.models import Sequential     # the model
from tensorflow.keras.layers import Embedding, LSTM, Dense,Dropout,Conv1D,GlobalMaxPooling1D # layers of the architecture
from tensorflow.keras.callbacks import ModelCheckpoint   # save model
from tensorflow.keras.models import load_model   # load saved model
import re

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import nltk
nltk.download('stopwords')
english_stops = stopwords.words('english')
print(english_stops)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
def load_dataset():
    df = pd.read_csv('/content/train.csv',header=None, nrows=100000)
    df.columns = ['Polarity', 'Title', 'Review']
    df = df[['Polarity', 'Review']].reset_index(drop=True)
    x_data = df['Review']       # Reviews/Input
    y_data = df['Polarity']    # Sentiment/Output

    # PRE-PROCESS REVIEW
    x_data = x_data.replace({'<.*?>': ''}, regex = True)          # remove html tag
    x_data = x_data.replace({'[^A-Za-z]': ' '}, regex = True)     # remove non alphabet
    x_data = x_data.apply(lambda review: [w for w in review.split() if w not in english_stops])  # remove stop words
    x_data = x_data.apply(lambda review: [w.lower() for w in review])   # lower case

    # #ENCODE SENTIMENT -> 0 & 1
    y_data = y_data.replace(1, 0)
    y_data = y_data.replace(2, 1)


    return x_data, y_data

x_data, y_data = load_dataset()

print('Review')
print(x_data, '\n')
print('Sentiment')
print(y_data)

Review
0        [this, sound, track, beautiful, it, paints, se...
1        [i, reading, lot, reviews, saying, best, game,...
2        [this, soundtrack, favorite, music, time, hand...
3        [i, truly, like, soundtrack, i, enjoy, video, ...
4        [if, played, game, know, divine, music, every,...
                               ...                        
99995    [good, excelent, fantastic, wonderful, muy, bu...
99996    [these, are, the, real, adventures, of, the, r...
99997    [sylvain, beauregard, widely, known, die, hard...
99998    [i, think, i, know, person, wrote, book, i, ce...
99999    [this, call, overkill, how, many, books, need,...
Name: Review, Length: 100000, dtype: object 

Sentiment
0        1
1        1
2        1
3        1
4        1
        ..
99995    1
99996    0
99997    0
99998    0
99999    0
Name: Polarity, Length: 100000, dtype: int64


In [None]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.2)

print('Train Set')
print(x_train, '\n')
print(x_test, '\n')
print('Test Set')
print(y_train, '\n')
print(y_test)

Train Set
26942    [i, really, impressed, reviews, i, read, decid...
15487    [seriously, first, review, title, nailed, i, s...
82520    [the, book, i, reviewing, matilda, written, ro...
62719    [i, using, preserver, past, months, it, consis...
93277    [the, jacket, nice, size, i, wear, xxl, shirts...
                               ...                        
22677    [this, plots, plans, action, while, might, pic...
51253    [dear, sir, madam, i, ordered, book, order, so...
82093    [countryman, glimpse, world, small, fishing, v...
62557    [this, outstanding, movie, years, ago, ran, th...
65470    [it, good, basic, sign, dictionary, i, thought...
Name: Review, Length: 80000, dtype: object 

95111    [wow, when, look, songs, cd, see, list, tmbg, ...
92433    [great, series, terrific, actors, a, series, i...
37062    [this, book, tripe, feed, vampire, community, ...
25881    [unoriginal, top, think, pop, punk, thing, run...
60876    [if, like, pirates, caribbean, columbia, ship,...
 

In [None]:
def get_max_length():
    review_length = []
    for review in x_train:
        review_length.append(len(review))

    return int(np.ceil(np.mean(review_length)))

In [None]:
token = Tokenizer(lower=False)    # no need lower, because already lowered the data in load_data()
token.fit_on_texts(x_train)
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

max_length = get_max_length()
print(max_length)
x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1   # add 1 because of 0 padding

print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum review length: ', max_length)

44
Encoded X Train
 [[    1    15   916 ...     0     0     0]
 [  909    17   216 ...     0     0     0]
 [    2     3     1 ...   121  4445   655]
 ...
 [46483  4185   121 ...     0     0     0]
 [    5  1231     8 ...     0     0     0]
 [    6     9   724 ...     0     0     0]] 

Encoded X Test
 [[  965   154   112 ...    26    84 10795]
 [   10    94  1524 ...     0     0     0]
 [    5     3  6017 ...     0     0     0]
 ...
 [    5     3   650 ...     0     0     0]
 [    5     3  1983 ...     0     0     0]
 [    5    17     3 ...     0     0     0]] 

Maximum review length:  44


In [None]:
EMBED_DIM = 32

model = Sequential()
model.add(Embedding(total_words, EMBED_DIM, input_length = max_length))
model.add(Conv1D(filters = 64, kernel_size = 3, strides= 1, padding='same', activation= 'relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(units = 256, activation= 'relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

print(model.summary())

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 44, 32)            2890656   
                                                                 
 conv1d_4 (Conv1D)           (None, 44, 64)            6208      
                                                                 
 global_max_pooling1d_4 (Gl  (None, 64)                0         
 obalMaxPooling1D)                                               
                                                                 
 dense_8 (Dense)             (None, 256)               16640     
                                                                 
 dropout_4 (Dropout)         (None, 256)               0         
                                                                 
 dense_9 (Dense)             (None, 1)                 257       
                                                      

In [None]:
model.fit(x_train, y_train,validation_data=(x_test, y_test), batch_size = 128, epochs = 2)

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x7fed0dd47160>

In [None]:
y_pred =  (model.predict(x_test) > 0.5).astype("int32")

true = 0
for i, y in enumerate(y_test):
    if y == y_pred[i]:
        true += 1

print('Correct Prediction: {}'.format(true))
print('Wrong Prediction: {}'.format(len(y_pred) - true))
print('Accuracy: {}'.format(true/len(y_pred)*100))

Correct Prediction: 16898
Wrong Prediction: 3102
Accuracy: 84.49


In [None]:
model.save('CNN_AmazonReview_T1.keras')



In [None]:
loaded_model = load_model('CNN_IMDB_T1.keras')

In [None]:
review = str(input('Movie Review: '))

Movie Review:  It was good


In [None]:
# Pre-process input
regex = re.compile(r'[^a-zA-Z\s]')
review = regex.sub('', review)
print('Cleaned: ', review)

words = review.split(' ')
filtered = [w for w in words if w not in english_stops]
filtered = ' '.join(filtered)
filtered = [filtered.lower()]

print('Filtered: ', filtered)

Cleaned:  It was good
Filtered:  ['it good']


In [None]:
   # no need lower, because already lowered the data in load_data()
tokenize_words = token.texts_to_sequences(filtered)
tokenize_words = pad_sequences(tokenize_words, maxlen=max_length , padding='post', truncating='post')
print(tokenize_words)

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [None]:
result = loaded_model.predict(tokenize_words)
print(result)

[[0.06568262]]


In [None]:
if result >= 0.5:
    print('positive')
else:
    print('negative')

negative
