# Sentiment Analysis with Pre-trained Word Embeddings

In [1]:
from string import punctuation
from os import listdir
from numpy import array
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

Using TensorFlow backend.


# Let's Define some Helper Functions

In [2]:
# load doc into memory
def load_doc(filename):
    """Read a text file and return its entire contents as one string.

    Args:
        filename: Path of the file to read.

    Returns:
        The full text of the file.
    """
    # the context manager closes the file even if read() raises
    with open(filename, 'r') as file:
        return file.read()

# turn a doc into clean tokens
def clean_doc(doc, vocab):
    """Strip punctuation from a document and keep only in-vocabulary words.

    Args:
        doc: Raw document text.
        vocab: Collection of permitted tokens (membership is tested with ``in``).

    Returns:
        The surviving tokens joined into a single space-separated string.
    """
    # translation table that deletes every punctuation character
    strip_punct = str.maketrans('', '', punctuation)
    kept = []
    for raw in doc.split():
        word = raw.translate(strip_punct)
        # discard anything the vocabulary does not contain
        if word in vocab:
            kept.append(word)
    return ' '.join(kept)

# load all docs in a directory
def process_docs(directory, vocab, is_trian):
    """Load and clean the reviews of one split found in a directory.

    Files whose name starts with 'cv9' form the test split; all other files
    form the training split.

    Args:
        directory: Folder containing one review per file.
        vocab: Vocabulary passed through to clean_doc().
        is_trian: Truthy to return the training reviews, falsy to return
            the test reviews.

    Returns:
        A list with one cleaned, space-joined token string per selected file.
    """
    documents = []
    for filename in listdir(directory):
        in_test_split = filename.startswith('cv9')
        # skip the file unless it belongs to the requested split
        if bool(is_trian) == in_test_split:
            continue
        text = load_doc(directory + '/' + filename)
        documents.append(clean_doc(text, vocab))
    return documents

# load embedding as a dict
def load_embedding(filename):
    """Load a text-format word embedding file into a dict.

    Each data line is expected to hold a word followed by its vector
    components, separated by whitespace. A word2vec-style header line
    (exactly two integers: vocabulary size and vector dimension) is skipped
    when present; files without such a header (e.g. GloVe) are read from
    the first line so that no word vector is lost.

    Args:
        filename: Path of the UTF-8 encoded embedding file.

    Returns:
        A dict mapping each word to a float32 numpy array.
    """
    embedding = dict()
    # the context manager closes the file even if parsing fails, and iterating
    # the handle avoids holding every line of a large file in memory at once
    with open(filename, 'r', encoding="utf8") as file:
        for line_number, line in enumerate(file):
            parts = line.split()
            # ignore blank lines (e.g. a trailing newline at end of file)
            if not parts:
                continue
            # word2vec text files start with a "<vocab_size> <dimension>" header
            if line_number == 0 and len(parts) == 2 and all(p.isdigit() for p in parts):
                continue
            # key is string word, value is numpy array for vector
            embedding[parts[0]] = asarray(parts[1:], dtype='float32')
    return embedding

# create a weight matrix for the Embedding layer from a loaded embedding
def get_weight_matrix(embedding, vocab, embedding_dim=100):
    """Build the weight matrix for an Embedding layer from loaded vectors.

    Args:
        embedding: Dict mapping words to their vectors.
        vocab: Dict mapping words to integer row indices (the Tokenizer's
            ``word_index``); indices are expected to lie in 1..len(vocab).
        embedding_dim: Length of each word vector. Defaults to 100, the
            width previously hard-coded here.

    Returns:
        A numpy array of shape (len(vocab) + 1, embedding_dim). Row ``i``
        holds the vector of the word with index ``i``; rows for words
        missing from ``embedding`` (and row 0) stay all zeros.
    """
    # one extra row because index 0 is not assigned to any vocabulary word
    vocab_size = len(vocab) + 1
    weight_matrix = zeros((vocab_size, embedding_dim))
    # copy each known vector into the row given by the word's integer index
    for word, i in vocab.items():
        vector = embedding.get(word)
        if vector is not None:
            weight_matrix[i] = vector
    return weight_matrix


# Retrieve Movie Reviews Vocabulary

In [4]:
# read the whitespace-separated vocabulary file and keep the unique words as a set
vocab_filename = 'data/movie_vocab.txt'
vocab = set(load_doc(vocab_filename).split())

# Let's prepare our training examples

### Note that we pass "True" to process_docs() so that we only return the training cases

In [5]:
# load all training reviews (is_trian=True keeps the files that do NOT start with 'cv9')
positive_docs = process_docs('data/txt_sentoken/pos', vocab, True)
negative_docs = process_docs('data/txt_sentoken/neg', vocab, True)
# negatives come first, matching the label vector built later (zeros followed by ones)
train_docs = negative_docs + positive_docs

# Fit tokenizer to our document set

### We will use the tokenizer to sequence our documents based on a word index of our vocabulary.

In [7]:
# create the tokenizer
tokenizer = Tokenizer()
# fit the tokenizer on the training documents only
tokenizer.fit_on_texts(train_docs)
# show the vocabulary size and a small sample instead of dumping the whole
# word index, which would flood the notebook output
print('Vocabulary size: %d' % len(tokenizer.word_index))
print(list(tokenizer.word_index.items())[:10])



# Text Encoding

### We will first encode our text with our vocabulary word index we just created. 

In [8]:
# sequence encode: replace every word with its integer id from the tokenizer's word index
encoded_docs = tokenizer.texts_to_sequences(train_docs)
# inspect the first encoded review
print(encoded_docs[0])

[25, 16, 899, 3475, 46, 2103, 690, 3381, 1284, 14, 1164, 2, 377, 1149, 612, 1439, 18, 24, 4105, 445, 522, 136, 3, 9981, 94, 2000, 13894, 3, 899, 1297, 2068, 553, 256, 1454, 27, 3686, 56, 349, 5, 3476, 2, 1053, 92, 1175, 9143, 9, 409, 748, 6618, 754, 285, 222, 3563, 7910, 6, 27, 619, 149, 2484, 9, 1517, 137, 19605, 2, 6619, 138, 492, 159, 3382, 1133, 2756, 2273, 355, 3, 20, 233, 226, 199, 7005, 311, 1092, 19606, 1251, 59, 72, 990, 256, 445, 71, 914, 13, 402, 52, 273, 325, 79, 4, 273, 604, 13895, 16192, 815, 34, 4564, 1265, 77, 482, 199, 1872, 2608, 39, 218, 150, 9144, 1, 60, 147, 2683, 14, 152, 4251, 9, 776, 226, 476, 182, 61, 662, 3029, 51, 154, 3029, 200, 253, 419, 122, 21, 77, 326, 2883, 5, 1640, 5168, 22, 1176, 107, 4401, 5402, 1455, 4, 78, 3564, 1922, 113, 11016, 329, 21, 23, 151, 142, 50, 137, 21, 1, 326, 493, 1611, 229, 30, 4, 143, 21, 177, 72, 5, 173, 662, 19607, 1736, 59, 2069, 497, 838, 7006, 9145, 308, 114, 3935, 122, 350, 3, 1587, 3192, 924, 14, 26, 2806, 39, 38, 22, 320, 18

# Let's Prepare our Training Set

### We will also pad every encoded document to a fixed size: the length of the longest document in our training corpus.

In [9]:
# pad sequences to the length (in tokens) of the longest training document
max_length = max([len(s.split()) for s in train_docs])
Xtrain = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
# define training labels: 0 = negative, 1 = positive, in the same order as
# train_docs (negatives first); the counts come from the loaded documents
# rather than a hard-coded 900 so the labels always match the data
ytrain = array([0] * len(negative_docs) + [1] * len(positive_docs))
# fail fast if this cell is re-run after the doc lists were overwritten
assert len(Xtrain) == len(ytrain), 'training samples and labels are out of sync'

# Let's Prepare our Test Set

### Note that we pass "False" to process_docs() so that we only return the test cases

In [11]:
# load all test reviews (is_trian=False keeps only the files starting with 'cv9')
positive_docs = process_docs('data/txt_sentoken/pos', vocab, False)
negative_docs = process_docs('data/txt_sentoken/neg', vocab, False)
test_docs = negative_docs + positive_docs
# sequence encode with the tokenizer fitted on the training documents
encoded_docs = tokenizer.texts_to_sequences(test_docs)
# pad sequences to the same length used for the training set
Xtest = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
# define test labels: 0 = negative, 1 = positive, in the same order as
# test_docs; the counts come from the loaded documents rather than a
# hard-coded 100 so the labels always match the data
ytest = array([0] * len(negative_docs) + [1] * len(positive_docs))
assert len(Xtest) == len(ytest), 'test samples and labels are out of sync'

# Vocabulary Size

### Why are we adding 1 to our vocabulary size?

In [12]:
# define vocabulary size (largest integer value)
# +1 because the Tokenizer's word indices start at 1 and index 0 is reserved
# (pad_sequences fills with 0), so the Embedding layer needs one extra row
vocab_size = len(tokenizer.word_index) + 1

# Load Pre-trained Word Embeddings

In [13]:
# read the pre-trained word2vec vectors from disk
raw_embedding = load_embedding('data/movie_embeddings_word2vec.txt')
# arrange the vectors as rows indexed by the tokenizer's integer ids
embedding_vectors = get_weight_matrix(raw_embedding, tokenizer.word_index)
# frozen embedding layer initialised with the pre-trained weights
embedding_layer = Embedding(
    vocab_size,
    100,
    weights=[embedding_vectors],
    input_length=max_length,
    trainable=False,
)

# Convolutional Neural Network Model

In [15]:
# define model: frozen embeddings -> 1D convolution -> max pooling -> sigmoid classifier
model = Sequential()
model.add(embedding_layer)
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
# summary() prints the table itself and returns None, so wrapping it in
# print() only appends a stray "None" to the output
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1317, 100)         2576800   
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 1313, 128)         64128     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 656, 128)          0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 83968)             0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 83969     
Total params: 2,724,897
Trainable params: 148,097
Non-trainable params: 2,576,800
_________________________________________________________________
None


# Compile and Train the model

### Adam Optimizer: Computes individual adaptive learning rates for different parameters from estimates of first and second moments of the gradients. 

In [16]:
# compile network: binary cross-entropy pairs with the single sigmoid output unit
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network on the training set only (no validation data is passed here);
# verbose=2 logs one line per epoch
model.fit(Xtrain, ytrain, epochs=10, verbose=2)

Epoch 1/10
 - 4s - loss: 0.7161 - acc: 0.5228
Epoch 2/10
 - 3s - loss: 0.6701 - acc: 0.5989
Epoch 3/10
 - 3s - loss: 0.6089 - acc: 0.6744
Epoch 4/10
 - 3s - loss: 0.4944 - acc: 0.7678
Epoch 5/10
 - 3s - loss: 0.3852 - acc: 0.8361
Epoch 6/10
 - 3s - loss: 0.2876 - acc: 0.9033
Epoch 7/10
 - 3s - loss: 0.2192 - acc: 0.9328
Epoch 8/10
 - 3s - loss: 0.1475 - acc: 0.9622
Epoch 9/10
 - 3s - loss: 0.1122 - acc: 0.9767
Epoch 10/10
 - 3s - loss: 0.0749 - acc: 0.9967


<keras.callbacks.History at 0x67ddcf8>

# Let's Test on our Holdout Set

In [17]:
# evaluate on the held-out test reviews; yields the loss and the accuracy metric set in compile()
loss, acc = model.evaluate(Xtest, ytest, verbose=0)
# acc is multiplied by 100 to display it as a percentage
print('Test Accuracy: %f' % (acc*100))

Test Accuracy: 52.500000


# GloVe

In [28]:
# read the pre-trained 100-dimensional GloVe vectors from disk
raw_embedding = load_embedding('glove.6B/glove.6B.100d.txt')
# arrange the vectors as rows indexed by the tokenizer's integer ids
embedding_vectors = get_weight_matrix(raw_embedding, tokenizer.word_index)
# frozen embedding layer initialised with the GloVe weights
embedding_layer = Embedding(
    vocab_size,
    100,
    weights=[embedding_vectors],
    input_length=max_length,
    trainable=False,
)