Basic sentiment analysis trained on imdb dataset.

In [1]:
import keras
import tensorflow as tf
import numpy as np
import pickle
import bcolz
import re
import h5py

Using TensorFlow backend.


# Pickle glove word embeddings

In [2]:
def save_array(fname, arr):
    c=bcolz.carray(arr, rootdir=fname, mode='w')
    c.flush()

In [3]:
def load_array(fname):
    return bcolz.open(fname)[:]

In [4]:
def unpack_glove():
    path = 'glove/'
    name = '6B.50d'
    res_path = 'glove/pickled/'
    with open(path+ 'glove.' + name + '.txt', 'r') as f: lines = [line.split() for line in f]
    words = [d[0] for d in lines]
    vecs = np.stack(np.array(d[1:], dtype=np.float32) for d in lines)
    wordidx = {o:i for i,o in enumerate(words)}
    save_array(res_path+name+'.dat', vecs)
    pickle.dump(words, open(res_path+name+'_words.pkl','wb'))
    pickle.dump(wordidx, open(res_path+name+'_idx.pkl','wb'))

In [11]:
# Only run this once
# unpack_glove()

# Unpack embeddings

In [5]:
def load_glove():
    loc = 'glove/pickled/6B.50d'
    return (load_array(loc+'.dat'),
        pickle.load(open(loc+'_words.pkl','rb')),
        pickle.load(open(loc+'_idx.pkl','rb')))

In [6]:
glove_vecs, glove_words, glove_word2id = load_glove()

In [19]:
# Get word embeddings like this:
# embed = glove_vecs[glove_word2id['doctor']]

# Load test data (imdb)

In [7]:
# Get the imdb embeddings
from keras.datasets import imdb
# These are arranged by frequency
imdb_word2id = imdb.get_word_index()
imdb_id2word = {v: k for k, v in imdb_word2id.items()}

In [8]:
# Get the imdb review data
from keras.utils.data_utils import get_file
reviews_path = get_file('imdb_full.pkl',
                origin='https://s3.amazonaws.com/text-datasets/imdb_full.pkl',
                md5_hash='d091312047c43cf9e4e38fef92437263')
with open(reviews_path, 'rb') as f:
    (x_train, labels_train), (x_test, labels_test) = pickle.load(f)
# X_train has 25,000 reviews and X_test also has 25,000

# Truncate data

In [9]:
# Reduce vocab size by setting all rare words to max_index
vocab_size = 5000

trn = [np.array([i if i<vocab_size-1 else vocab_size-1 for i in s]) for s in x_train]
test = [np.array([i if i<vocab_size-1 else vocab_size-1 for i in s]) for s in x_test]

In [10]:
from itertools import chain
# Check dataset sentence length
lengths = list(chain(map(len, trn)))
(np.max(lengths), np.min(lengths), np.mean(lengths))

(2493, 10, 237.71364)

In [11]:
from keras.preprocessing import sequence
# Reduce sentence length and pad up to that length
seq_len = 500

trn = sequence.pad_sequences(trn, maxlen=seq_len, value=0)
test = sequence.pad_sequences(test, maxlen=seq_len, value=0)

In [12]:
from numpy.random import normal
# Create embedding to join keras/imdb embeddings to glove's
def create_emb():
    n_fact = glove_vecs.shape[1]
    emb = np.zeros((vocab_size, n_fact))

    for i in range(1,len(emb)):
        word = imdb_id2word[i]
        if word and re.match(r"^[a-zA-Z0-9\-]*$", word):
            # If word is real, in imdb and not in glove, it will
            # cause a problem here
            src_id = glove_word2id[word]
            emb[i] = glove_vecs[src_id]
        else:
            # If we can't find the word in glove, randomly initialize
            emb[i] = normal(scale=0.6, size=(n_fact,))

    # This is our "rare word" id - we want to randomly initialize
    emb[-1] = normal(scale=0.6, size=(n_fact,))
    emb/=3
    return emb

In [13]:
emb = create_emb()

# Construct Keras CNN

In [16]:
from keras import backend as K
from keras.models import Sequential
from keras.layers.core import Dropout, Dense, Flatten
from keras.layers.convolutional import Convolution1D
from keras.layers import Embedding, merge
from keras.layers.pooling import MaxPooling1D
from keras.optimizers import Adam

In [17]:
K.clear_session()

In [18]:
model = Sequential([
    Embedding(vocab_size, 50, input_length=seq_len, 
              weights=[emb], trainable=False),
    Dropout(0.25),
    Convolution1D(64, 5, activation='relu', padding='same'),
    Dropout(0.25),
    MaxPooling1D(),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    Dense(1, activation='sigmoid')])

In [19]:
# Note: may have to weight this loss if the classes aren't balanced
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

In [20]:
model.fit(trn, labels_train, validation_data=(test, labels_test), epochs=4, batch_size=64)

Train on 25000 samples, validate on 25000 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f8628090668>

In [21]:
# Now allow it to train embedding as well
model.layers[0].trainable=True
model.optimizer.lr=1e-4

In [22]:
model.fit(trn, labels_train, validation_data=(test, labels_test), epochs=4, batch_size=64)

Train on 25000 samples, validate on 25000 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f86333b90b8>

In [23]:
model_path = 'model/'

In [24]:
model.save_weights(model_path+'glove50.h5')

In [25]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 50)           250000    
_________________________________________________________________
dropout_1 (Dropout)          (None, 500, 50)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 500, 64)           16064     
_________________________________________________________________
dropout_2 (Dropout)          (None, 500, 64)           0         
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 250, 64)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 16000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               1600100   
__________

In [54]:
def preprocess_for_prediction(text):
    
    text = text.lower()
    text = text.split()
    lst = []
    for i, word in enumerate(text):
        if word in imdb_word2id:
            lst.append(imdb_word2id[word])
        else:
            lst.append(vocab_size - 1)
    out = [np.array([i if i<vocab_size-1 else vocab_size-1 for i in lst])]
    out = sequence.pad_sequences(out, maxlen=seq_len, value=0)
    return out

In [58]:
sample_bad_review = "I really didn't like this one at all. It stunk. That's the god honest truth. It was a real stinker."

In [59]:
sample_in = preprocess_for_prediction(sample_bad_review)

In [60]:
prediction = model.predict(sample_in)

In [61]:
print(prediction)

[[ 0.43753284]]


In [62]:
sample_good_review = "What a film! Wow! I loved the actors, the actresses, the special effects. And holy moly the writing was a thing of beauty!"

In [63]:
sample_in = preprocess_for_prediction(sample_good_review)

In [64]:
prediction = model.predict(sample_in)

In [65]:
print(prediction)

[[ 0.55831337]]


Meh. It does work, though it's not quite as confident as I would have liked.