**Embedding + CNN model**

In [1]:
import string
import re
import os
from os import listdir
from collections import Counter
from nltk.corpus import stopwords
from pickle import dump, load
import numpy as np

Data Preparation

In [2]:
def load_doc(filename):
    """Read a whole text file and return its contents as a single string.

    Args:
        filename: Path of the file to read (opened in text mode, 'r').

    Returns:
        The complete text of the file.
    """
    # The context manager closes the handle even when read() raises,
    # which the manual open/read/close sequence did not guarantee.
    with open(filename, 'r') as file:
        return file.read()

In [3]:
def clean_doc(doc):
    """Convert raw text into a space-separated string of cleaned tokens.

    The text is split on whitespace, every ``string.punctuation`` character
    is stripped from each token, and a token is kept only if it is purely
    alphabetic, is not an NLTK English stop word, and is longer than one
    character. No lowercasing is applied, so stop-word matching is
    case-sensitive.

    Args:
        doc: The raw document text.

    Returns:
        The surviving tokens joined by single spaces.
    """
    raw_tokens = doc.split()
    punctuation_pattern = re.compile(f"[{re.escape(string.punctuation)}]")
    english_stop_words = set(stopwords.words('english'))

    kept = []
    for raw_token in raw_tokens:
        word = punctuation_pattern.sub('', raw_token)
        # Drop anything that is not purely alphabetic (also drops empty strings).
        if not word.isalpha():
            continue
        if word in english_stop_words:
            continue
        # Single-character tokens are discarded.
        if len(word) <= 1:
            continue
        kept.append(word)
    return ' '.join(kept)

In [4]:
def process_docs(directory, is_train):
    """Load and clean the documents of one split from ``directory``.

    Files whose name starts with 'cv9' form the test split; every other
    file belongs to the training split.

    Args:
        directory: Folder containing one review per file.
        is_train: Truthy to collect the training files, falsy to collect
            the 'cv9' test files.

    Returns:
        A list with one cleaned document string per selected file.
    """
    wants_train = bool(is_train)
    cleaned_reviews = []
    for entry in listdir(directory):
        held_out = entry.startswith('cv9')
        # Training skips the 'cv9' files; testing skips everything else.
        if wants_train == held_out:
            continue
        raw_text = load_doc('/'.join((directory, entry)))
        cleaned_reviews.append(clean_doc(raw_text))
    return cleaned_reviews

In [5]:
def load_clean_dataset(is_train):
    """Load the cleaned negative and positive reviews for one split.

    Args:
        is_train: Passed through to ``process_docs`` to select the split.

    Returns:
        A ``(docs, labels)`` tuple: ``docs`` lists the negative documents
        followed by the positive ones, and ``labels`` is a NumPy array with
        0 for each negative document and 1 for each positive document.
    """
    negative_docs = process_docs('review_polarity/txt_sentoken/neg', is_train)
    positive_docs = process_docs('review_polarity/txt_sentoken/pos', is_train)
    # Labels follow the same neg-then-pos order as the concatenated documents.
    label_list = [0] * len(negative_docs) + [1] * len(positive_docs)
    return negative_docs + positive_docs, np.array(label_list)

In [6]:
def save_dataset(dataset, filename):
    """Pickle ``dataset`` to ``filename`` and print a confirmation line.

    Args:
        dataset: Any picklable object.
        filename: Destination path; the file is opened in binary write mode.
    """
    # Use a context manager so the file is flushed and closed before the
    # function returns, instead of leaving the handle to the garbage collector.
    with open(filename, 'wb') as handle:
        dump(dataset, handle)
    print(f"Saved: {filename}")

In [7]:
# Load and clean all reviews: first the training split, then the test split.
train_docs, ytrain = load_clean_dataset(True)
test_docs, ytest = load_clean_dataset(False)
# Save both splits, each as a [docs, labels] pair, to separate pickle files.
save_dataset([train_docs, ytrain], 'train.pkl')
save_dataset([test_docs, ytest], 'test.pkl')

Saved: train.pkl
Saved: test.pkl


Train CNN with Embedding layer

In [8]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers import Dense, Flatten, Embedding, Input, Dropout
from keras.layers import Conv1D, MaxPooling1D
from keras.models import load_model
from keras.layers import concatenate
from keras.utils import plot_model

In [9]:
def load_dataset(filename):
    """Load and return the object pickled in ``filename``.

    Args:
        filename: Path of a pickle file; opened in binary read mode.

    Returns:
        Whatever object was stored in the file.
    """
    # NOTE: unpickling can execute arbitrary code, so only load files that
    # this notebook (or another trusted source) produced.
    # The context manager closes the handle, which load(open(...)) did not.
    with open(filename, 'rb') as handle:
        return load(handle)

In [10]:
def create_tokenizer(lines):
    """Create a ``Tokenizer`` with default settings and fit it on ``lines``.

    Args:
        lines: The texts passed to ``fit_on_texts``.

    Returns:
        The fitted tokenizer.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

In [11]:
def max_length(lines):
    """Return the largest whitespace-separated token count among ``lines``.

    Args:
        lines: An iterable of strings.

    Returns:
        The token count of the longest entry.

    Raises:
        ValueError: If ``lines`` is empty (raised by ``max``).
    """
    token_counts = (len(text.split()) for text in lines)
    return max(token_counts)

In [12]:
def encode_text(tokenizer, lines, length):
    """Encode ``lines`` as integer sequences padded to ``length``.

    Args:
        tokenizer: Object providing ``texts_to_sequences``.
        lines: The texts to encode.
        length: Value passed as ``maxlen`` to ``pad_sequences``.

    Returns:
        The result of ``pad_sequences`` with ``padding='post'``.
    """
    sequences = tokenizer.texts_to_sequences(lines)
    return pad_sequences(sequences, maxlen=length, padding='post')

In [13]:
def _build_channel(length, vocab_size, kernel_size):
    """Build one branch: Input -> Embedding -> Conv1D -> Dropout -> MaxPooling1D -> Flatten.

    Args:
        length: Number of token ids per input sequence.
        vocab_size: First argument of the ``Embedding`` layer.
        kernel_size: Width of the ``Conv1D`` kernel for this branch.

    Returns:
        An ``(input_tensor, flattened_output_tensor)`` pair.
    """
    inputs = Input(shape=(length,))
    embedding = Embedding(vocab_size, 100)(inputs)
    conv = Conv1D(filters=32, kernel_size=kernel_size, activation='relu')(embedding)
    drop = Dropout(0.5)(conv)
    pool = MaxPooling1D(pool_size=2)(drop)
    return inputs, Flatten()(pool)


def define_model(length, vocab_size):
    """Define, compile and summarise the three-channel CNN classifier.

    Each channel has its own input and embedding and differs only in the
    ``Conv1D`` kernel size (4, 6 and 8). The flattened channel outputs are
    concatenated and passed through a 10-unit ReLU layer and a single
    sigmoid output unit.

    Args:
        length: Number of token ids per input sequence.
        vocab_size: First argument of each ``Embedding`` layer.

    Returns:
        The compiled model, which expects a list of three inputs.
    """
    # Channels are created in kernel-size order 4, 6, 8, matching the order
    # of the model's three inputs.
    channels = [_build_channel(length, vocab_size, size) for size in (4, 6, 8)]
    channel_inputs = [channel_input for channel_input, _ in channels]
    channel_outputs = [channel_output for _, channel_output in channels]
    # merge
    merged = concatenate(channel_outputs)
    # interpretation
    dense1 = Dense(10, activation='relu')(merged)
    outputs = Dense(1, activation='sigmoid')(dense1)
    model = Model(inputs=channel_inputs, outputs=outputs)
    # compile
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summarize
    model.summary()
    return model

In [14]:
# Reload the pickled training split and fit the tokenizer on the training text.
train_lines, train_labels = load_dataset('train.pkl')
tokenizer = create_tokenizer(train_lines)

# Inspect the loaded structure: the container type, the element type,
# and the first cleaned review.
print(type(train_lines))
print(type(train_lines[0]))
print(train_lines[0])

<class 'list'>
<class 'str'>
plot two teen couples go church party drink drive get accident one guys dies girlfriend continues see life nightmares whats deal watch movie sorta find critique mindfuck movie teen generation touches cool idea presents bad package makes review even harder one write since generally applaud films attempt break mold mess head lost highway memento good bad ways making types films folks didnt snag one correctly seem taken pretty neat concept executed terribly problems movie well main problem simply jumbled starts normal downshifts fantasy world audience member idea whats going dreams characters coming back dead others look like dead strange apparitions disappearances looooot chase scenes tons weird things happen simply explained personally dont mind trying unravel film every give clue get kind fed films biggest problem obviously got big secret hide seems want hide completely final five minutes make things entertaining thrilling even engaging meantime really sad 

In [29]:
# Every document is padded to the token count of the longest training document.
length = max_length(train_lines)
print(f"Max Document length: {length}")

# One more than the number of distinct words seen by the tokenizer
# (presumably because index 0 is kept for padding — confirm against the Tokenizer docs).
vocab_size = len(tokenizer.word_index) + 1
print(f"Vocabulary size: {vocab_size}")

trainx = encode_text(tokenizer, train_lines, length)

# The same encoded matrix is fed to each of the model's three inputs.
# NOTE(review): no random seed is set in the visible cells and no validation
# data is passed to fit(), so results may vary between runs.
model = define_model(length, vocab_size)
model.fit([trainx, trainx, trainx], train_labels, epochs = 7, batch_size = 16)
model.save('model.h5')

Max Document length: 1380
Vocabulary size: 44277


Epoch 1/7
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 210ms/step - accuracy: 0.5395 - loss: 0.6945
Epoch 2/7
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 210ms/step - accuracy: 0.6938 - loss: 0.6084
Epoch 3/7
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 211ms/step - accuracy: 0.9954 - loss: 0.0646
Epoch 4/7
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 221ms/step - accuracy: 0.9999 - loss: 0.0046
Epoch 5/7
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 208ms/step - accuracy: 1.0000 - loss: 0.0020
Epoch 6/7
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 213ms/step - accuracy: 1.0000 - loss: 0.0011
Epoch 7/7
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 212ms/step - accuracy: 1.0000 - loss: 7.0749e-04




Evaluate the model

In [30]:
# Reload both pickled splits (the training split is loaded again under new names).
trainLines, trainLabels = load_dataset('train.pkl')
testLines, testLabels = load_dataset('test.pkl')

# Encode with the `tokenizer` and `length` defined by the earlier training
# cells; this cell fails on a fresh kernel unless those cells have run.
trainX = encode_text(tokenizer, trainLines, length)
testX = encode_text(tokenizer, testLines, length)

# Evaluate the model as saved to disk, replacing the in-memory `model`.
model = load_model('model.h5')

# evaluate() returns (loss, accuracy) here; only the accuracy is reported.
_, acc = model.evaluate([trainX, trainX, trainX], trainLabels, verbose = 0)
print(f"Train Accuracy: {acc*100}")
_, acc = model.evaluate([testX, testX, testX], testLabels, verbose=0)
print(f"Test Accuracy: {acc*100}")



Train Accuracy: 100.0
Test Accuracy: 86.00000143051147


In [31]:
def predict_sentiment(review, tokenizer, length, model):
    """Classify one raw review as 'POSITIVE' or 'NEGATIVE'.

    The review is cleaned with ``clean_doc``, encoded with ``encode_text``
    and passed to ``model.predict``.

    Args:
        review: Raw review text.
        tokenizer: Tokenizer forwarded to ``encode_text``.
        length: Padding length forwarded to ``encode_text``.
        model: Model whose ``predict`` accepts a list of three inputs.

    Returns:
        A ``(score, label)`` tuple. ``score`` is the model output when the
        label is 'POSITIVE' and one minus the model output when the label
        is 'NEGATIVE'.
    """
    cleaned = clean_doc(review)
    encoded = encode_text(tokenizer, [cleaned], length)
    # The same encoded review is supplied to each of the three model inputs.
    probabilities = model.predict([encoded, encoded, encoded], verbose=0)
    positive_score = probabilities[0, 0]
    is_negative = positive_score < 0.5
    if is_negative:
        # Report the score of the predicted (negative) class.
        return (1 - positive_score), 'NEGATIVE'
    return positive_score, 'POSITIVE'

In [32]:
# Try the model on a hand-written review that reads as positive.
text = 'Everyone will enjoy this film. I love it, recommended!'
percent, sentiment = predict_sentiment(text, tokenizer, length, model)
print(f"Sentiment: {sentiment} ({percent*100})")

Sentiment: POSITIVE (52.2366464138031)


In [33]:
# Try the model on a hand-written review that reads as negative.
text = 'Disgusted by this movie. climax was the worst. action sequences were so bad'
percent, sentiment = predict_sentiment(text, tokenizer, length, model)
print(f"Sentiment: {sentiment} ({percent*100})")

Sentiment: NEGATIVE (54.92013096809387)
