In [135]:
#CSC620
#HA15
#Paula Abigail Tam <921850992>

#Lesson 7: Movie Review Sentiment Analysis Project
#For this assignment, I also followed these blog posts (as they were suggested in the original blog):
#https://machinelearningmastery.com/prepare-movie-review-data-sentiment-analysis/
#https://machinelearningmastery.com/develop-word-embedding-model-predicting-movie-review-sentiment/

In [136]:
from string import punctuation
from nltk.corpus import stopwords
from os import listdir
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers import Conv1D
from keras.layers import MaxPooling1D
from gensim.models import Word2Vec

In [137]:
def load_doc(filename): #this fucntion loads the document
    file = open(filename, 'r') #open file as read only
    text = file.read() #read the text in the file
    file.close() #close the file
    return text #return the text in the file

In [138]:
def clean_doc(doc): #function to clean the documents
    tokens = doc.split() #split into tokens by white space
    table = str.maketrans('', '', punctuation) #to remove punctuation from each token
    tokens = [word.translate(table) for word in tokens] #apply it to all the words; https://python-reference.readthedocs.io/en/latest/docs/str/translate.html
    #remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    #filter stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if not word in stop_words]
    #filter out short tokens
    tokens = [word for word in tokens if len(word) > 1]
    return tokens

In [139]:
# load doc, clean and return line of tokens
def doc_to_line(filename, vocab):
    doc = load_doc(filename)
    tokens = clean_doc(doc)
    tokens = [w for w in tokens if w in vocab]
    return ' '.join(tokens)

In [140]:
def process_docs(directory, vocab, is_train): #load all docs in a directory
    lines = list()
    for filename in listdir(directory): #all files in a folder
        if not filename.endswith(".txt"): #skip files that isn't a .txt
            continue
        if is_train and filename.startswith('cv9'):
            continue
        if not is_train and not filename.startswith('cv9'):
             continue
        path = directory + '/' + filename  #create the full path of the file to open
        doc = load_doc(path)  #load the document
        line = doc_to_line(path, vocab)
        #add to list
        lines.append(line)
        return lines

In [141]:
# load embedding as a dict
def load_embedding(filename):
 # load embedding into memory, skip first line
    file = open(filename,'r')
    lines = file.readlines()[1:]
    file.close()
 # create a map of words to vectors
    embedding = dict()
    for line in lines:
        parts = line.split()
 # key is string word, value is numpy array for vector
    embedding[parts[0]] = np.asarray(parts[1:], dtype='float32')
    return embedding

In [142]:
# create a weight matrix for the Embedding layer from a loaded embedding
def get_weight_matrix(embedding, vocab):
 # total vocabulary size plus 0 for unknown words
    vocab_size = len(vocab) + 1
 # define weight matrix dimensions with all 0
    weight_matrix = np.zeros((vocab_size, 100))
 # step vocab, store vectors using the Tokenizer's integer mapping
    for word, i in vocab.items():
        weight_matrix[i] = embedding.get(word)
    return weight_matrix

In [143]:
# load vocabulary
vocab_filename = 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
vocab = set(vocab)

In [144]:
# load all training reviews
positive_docs = process_docs('./review_polarity/txt_sentoken/pos', vocab, True)
negative_docs = process_docs('./review_polarity/txt_sentoken/neg', vocab, True)
train_docs = negative_docs + positive_docs

In [145]:
# train word2vec model
model = Word2Vec(train_docs, vector_size=100, window=5, workers=8, min_count=1)
# summarize vocabulary size in model
words = list(model.wv.key_to_index)
print('Vocabulary size: %d' % len(words))
 
# save model in ASCII (word2vec) format
filename = 'embedding_word2vec.txt'
model.wv.save_word2vec_format(filename, binary=False)

Vocabulary size: 27


In [146]:
# create the tokenizer
tokenizer = Tokenizer()
# fit the tokenizer on the documents
tokenizer.fit_on_texts(train_docs)

In [147]:
# sequence encode
encoded_docs = tokenizer.texts_to_sequences(train_docs)
# pad sequences
max_length = max([len(s.split()) for s in train_docs])
Xtrain = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
# define training labels
ytrain = array([0 for _ in range(900)] + [1 for _ in range(900)])

In [148]:
# load all test reviews
positive_docs = process_docs('./review_polarity/txt_sentoken/pos', vocab, False)
negative_docs = process_docs('./review_polarity/txt_sentoken/neg', vocab, False)
test_docs = negative_docs + positive_docs
# sequence encode
encoded_docs = tokenizer.texts_to_sequences(test_docs)
# pad sequences
Xtest = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
# define test labels
ytest = array([0 for _ in range(100)] + [1 for _ in range(100)])

In [149]:
# define vocabulary size (largest integer value)
vocab_size = len(tokenizer.word_index) + 1
 
# load embedding from file
raw_embedding = load_embedding('embedding_word2vec.txt')
# get vectors in the right order
embedding_vectors = get_weight_matrix(raw_embedding, tokenizer.word_index)
# create the embedding layer
embedding_layer = Embedding(vocab_size, 100, weights=[embedding_vectors], input_length=max_length, trainable=False)

In [150]:
# define model
model = Sequential()
model.add(embedding_layer)
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
print(model.summary())
# compile network
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network
model.fit(Xtrain, ytrain, epochs=10, verbose=2)
# evaluate
loss, acc = model.evaluate(Xtest, ytest, verbose=0)
print('Test Accuracy: %f' % (acc*100))

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 520, 100)          58500     
                                                                 
 conv1d (Conv1D)             (None, 516, 128)          64128     
                                                                 
 max_pooling1d (MaxPooling1D  (None, 258, 128)         0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, 33024)             0         
                                                                 
 dense (Dense)               (None, 1)                 33025     
                                                                 
Total params: 155,653
Trainable params: 97,153
Non-trainable params: 58,500
____________________________________________

ValueError: Data cardinality is ambiguous:
  x sizes: 2
  y sizes: 1800
Make sure all arrays contain the same number of samples.