In [1]:
import json
import random
# Load the IMDB training set. The code below treats it as a list of dicts and
# reads the "text" and "class" keys of every item.
# The encoding is stated explicitly so decoding does not depend on the
# platform's default locale encoding.
with open("data/imdb_train.json", encoding="utf-8") as f:
    data=json.load(f)

# Shuffle with a private, seeded generator: the example order is the same on
# every run and the global `random` state is left untouched.
SHUFFLE_SEED=0
random.Random(SHUFFLE_SEED).shuffle(data)
print(data[0])

# Split the examples into two parallel lists: review texts and class labels.
texts=[one_example["text"] for one_example in data]
labels=[one_example["class"] for one_example in data]
print(texts[:2])
print(labels[:2])

{'text': "I started to watch this movie expecting nothing, just another movie to watch, but since the first twenty minutes, the artwork and main character, who is enigmatic, doesn't talk much, really got me in this movie.  I really liked this movie, it was dark, beautifully acted and really touching. It's a bit slow but the immersion was complete. The directing was awesome by letting us know bits by bits the story leading to the conviction of Joey and his life behind bars. The music was really great and very well incorporated into the scenes. The ending was unexpected with a twist I didn't see coming. It's not the kind of movie we see often.", 'class': 'pos'}
["I started to watch this movie expecting nothing, just another movie to watch, but since the first twenty minutes, the artwork and main character, who is enigmatic, doesn't talk much, really got me in this movie.  I really liked this movie, it was dark, beautifully acted and really touching. It's a bit slow but the immersion was 

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy
# Take only the analyzer (tokenizer + preprocessing) out of a CountVectorizer.
# lowercase=False keeps the original casing of every token.
count_vectorizer=CountVectorizer(lowercase=False)
analyzer=count_vectorizer.build_analyzer()
first_tokens=analyzer(texts[0])
print(first_tokens)


#vectorizer=vectorizer.fit(texts) # learns a vocabulary dictionary
#print("Vocabulary size:",len(vectorizer.vocabulary_))
#print("First 5 items in the vocabulary:",list(vectorizer.vocabulary_.keys())[:5])
#print("How many words are recognized from the data:",numpy.count_nonzero(vectorizer.transform(["i went today to new_york"]).todense()))

['started', 'to', 'watch', 'this', 'movie', 'expecting', 'nothing', 'just', 'another', 'movie', 'to', 'watch', 'but', 'since', 'the', 'first', 'twenty', 'minutes', 'the', 'artwork', 'and', 'main', 'character', 'who', 'is', 'enigmatic', 'doesn', 'talk', 'much', 'really', 'got', 'me', 'in', 'this', 'movie', 'really', 'liked', 'this', 'movie', 'it', 'was', 'dark', 'beautifully', 'acted', 'and', 'really', 'touching', 'It', 'bit', 'slow', 'but', 'the', 'immersion', 'was', 'complete', 'The', 'directing', 'was', 'awesome', 'by', 'letting', 'us', 'know', 'bits', 'by', 'bits', 'the', 'story', 'leading', 'to', 'the', 'conviction', 'of', 'Joey', 'and', 'his', 'life', 'behind', 'bars', 'The', 'music', 'was', 'really', 'great', 'and', 'very', 'well', 'incorporated', 'into', 'the', 'scenes', 'The', 'ending', 'was', 'unexpected', 'with', 'twist', 'didn', 'see', 'coming', 'It', 'not', 'the', 'kind', 'of', 'movie', 'we', 'see', 'often']


In [3]:
from gensim.models import KeyedVectors

# Read at most 50000 vectors (limit=50000) from the binary word2vec file.
vector_model=KeyedVectors.load_word2vec_format("data/GoogleNews-vectors-negative300.bin", binary=True, limit=50000)

# Collect the model's words ordered by their index in the model.
# Casing is kept as-is: nothing is lowercased here, and the analyzer used in
# this notebook is built with lowercase=False as well.
if hasattr(vector_model, "index_to_key"):
    # gensim >= 4.0: the index-ordered word list is available directly.
    words=list(vector_model.index_to_key)
else:
    # gensim < 4.0: `vocab` maps word -> entry carrying an `.index` attribute.
    words=[k for k,v in sorted(vector_model.vocab.items(), key=lambda x:x[1].index)]
print("Words from embedding model:",len(words))
print("First 50 words:",words[:50])

Words from embedding model: 50000
First 50 words: ['</s>', 'in', 'for', 'that', 'is', 'on', '##', 'The', 'with', 'said', 'was', 'the', 'at', 'not', 'as', 'it', 'be', 'from', 'by', 'are', 'I', 'have', 'he', 'will', 'has', '####', 'his', 'an', 'this', 'or', 'their', 'who', 'they', 'but', '$', 'had', 'year', 'were', 'we', 'more', '###', 'up', 'been', 'you', 'its', 'one', 'about', 'would', 'which', 'out']


In [4]:
# init the vectorizer vocabulary using words from the embedding model

def init_vocabulary(vocab, text, text_analyzer):
    """Add every token of `text` to `vocab` and return `vocab`.

    vocab: dict mapping token -> integer index; it is updated in place.
    text: string to be split into tokens.
    text_analyzer: callable that turns a string into an iterable of tokens.

    A token that is already present keeps its index; a new token receives
    len(vocab) at the moment it is inserted.
    """
    # Use the analyzer that was passed in, not a notebook-level global, so the
    # function works with whatever tokenizer the caller supplies.
    for word in text_analyzer(text):
        vocab.setdefault(word, len(vocab))
    return vocab

# Seed the vocabulary with the embedding model's words. They are joined into
# one string because init_vocabulary runs its analyzer over a text, so the
# resulting entry count may differ from len(words).
words_txt=" ".join(words)
# Index 0 is reserved for a special entry so that no real word is mapped to it.
vocabulary=init_vocabulary({"<SPECIAL>": 0},words_txt,analyzer)
print("Words from embedding model:",len(vocabulary))
#for word in words:
#    if word not in vocab:
#        print(word)
#    vocab.setdefault(word, len(vectorizer.vocabulary_)) # setdefault adds the word if it does not already exist
#vectorizer.vocabulary_=vocab
#print("Vocabulary size:",len(vectorizer.vocabulary_))
#print("First 5 items in the vocabulary:",list(vectorizer.vocabulary_.keys())[:5])
#print("How many words are recognized from the data:",numpy.count_nonzero(vectorizer.transform(["i went today to new_york"]).todense()))

Words from embedding model: 49412


In [5]:
def vectorizer(vocab, texts, text_analyzer=None):
    """Turn each text into a list of vocabulary indices.

    vocab: dict mapping token -> integer index. It is updated in place: a
        token that is not yet present is added with index len(vocab).
    texts: iterable of strings.
    text_analyzer: optional callable turning a string into an iterable of
        tokens. When omitted, the notebook-level `analyzer` is used.

    Returns a numpy array with one entry per text. When the texts produce
    different numbers of tokens the result is a 1-D object array whose
    elements are Python lists; otherwise it is whatever numpy.array builds
    from the equally sized lists.
    """
    if text_analyzer is None:
        text_analyzer=analyzer # fall back to the analyzer defined in the notebook

    vectorized_data=[] # one list of indices per text
    for one_example in texts:
        # setdefault adds an unseen word with the next free index and returns
        # the index in either case
        vectorized_data.append([vocab.setdefault(word, len(vocab)) for word in text_analyzer(one_example)])

    if len({len(example) for example in vectorized_data}) > 1:
        # Ragged input: recent NumPy releases refuse to build an array from
        # lists of different lengths unless dtype=object is requested.
        return numpy.array(vectorized_data, dtype=object)
    return numpy.array(vectorized_data)

# Convert every review into a list of vocabulary indices. The call also adds
# previously unseen tokens to `vocabulary`, which is why its size is reported
# again below.
vectorized_data=vectorizer(vocabulary, texts)

# Each review is now a sequence of word indices.
for caption, shown in (
    ("Words in vocabulary:", len(vocabulary)),
    ("Vectorized data shape:", vectorized_data.shape),
    ("Firs example vectorized:", vectorized_data[0]),
):
    print(caption,shown)
        

Words in vocabulary: 111151
Vectorized data shape: (25000,)
Firs example vectorized: [416, 49412, 1187, 25, 1081, 3339, 740, 70, 183, 1081, 49412, 1187, 30, 131, 10, 51, 7392, 424, 10, 9830, 49413, 795, 1929, 28, 4, 30515, 49414, 808, 139, 213, 179, 157, 1, 25, 1081, 213, 3903, 25, 1081, 14, 9, 2789, 13220, 5281, 49413, 213, 8267, 46, 705, 1754, 30, 10, 27730, 9, 889, 6, 7864, 9, 5981, 17, 4750, 152, 163, 9896, 17, 9896, 10, 520, 430, 49412, 10, 4291, 49415, 11044, 49413, 23, 239, 446, 4921, 6, 603, 9, 213, 247, 49413, 129, 104, 8078, 64, 10, 3887, 6, 1966, 9, 5137, 7, 7977, 49416, 146, 443, 46, 12, 10, 593, 49415, 1081, 34, 146, 606]


In [6]:
#feature_matrix=vectorizer.transform(texts)
#print(feature_matrix.shape)

# labels
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Two encoders: the first maps class labels to integers, the second maps those
# integers to one-hot rows.
label_encoder=LabelEncoder()
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`
# -- confirm against the installed version.
one_hot_encoder=OneHotEncoder(sparse=False)

class_numbers=label_encoder.fit_transform(labels)
print("class_numbers shape=",class_numbers.shape)
print("class_numbers",class_numbers)
print("class labels",label_encoder.classes_)

# The one-hot encoder is fed a single column, hence the reshape to (-1,1).
class_column=class_numbers.reshape(-1,1)
classes_1hot=one_hot_encoder.fit_transform(class_column)
print("classes_1hot",classes_1hot)


class_numbers shape= (25000,)
class_numbers [1 0 0 ... 1 1 1]
class labels ['neg' 'pos']
classes_1hot [[0. 1.]
 [1. 0.]
 [1. 0.]
 ...
 [0. 1.]
 [0. 1.]
 [0. 1.]]


## Network

In [7]:

def load_pretrained_embeddings(vocab, embedding_model):
    """Build an embedding matrix whose rows follow the indices in `vocab`.

    vocab: vocabulary from our data vectorizer (dict token -> row index).
    embedding_model: model loaded with gensim.

    Returns a numpy array of shape (len(vocab), embedding dimension). Rows of
    tokens that have no vector in `embedding_model` are left as zeros. The
    number of tokens for which a vector was found is printed.
    """
    # gensim >= 4.0 exposes the token lookup as `key_to_index`; older releases
    # expose it as `vocab`. Both support the `in` test used below.
    known_words=getattr(embedding_model, "key_to_index", None)
    if known_words is None:
        known_words=embedding_model.vocab

    embedding_dim=embedding_model.vectors.shape[1]
    pretrained_embeddings=numpy.zeros((len(vocab),embedding_dim)) # (words x embedding dim)
    found=0
    for word,idx in vocab.items():
        if word in known_words:
            pretrained_embeddings[idx]=embedding_model.get_vector(word)
            found+=1

    print("Found pretrained vectors for {found} words.".format(found=found))
    return pretrained_embeddings

pretrained=load_pretrained_embeddings(vocabulary, vector_model)
print("Shape of pretrained embeddings:",pretrained.shape)


Found pretrained vectors for 48923 words.
Shape of pretrained embeddings: (111151, 300)


In [8]:
from keras.models import Model
from keras.layers import Input, Dense, Embedding, Flatten
from keras.layers.pooling import AveragePooling1D

from keras.preprocessing.sequence import pad_sequences

# Bring every example to exactly 100 indices; both padding and truncation are
# applied at the end of the sequence ('post').
vectorized_data_padded=pad_sequences(
    vectorized_data,
    maxlen=100,
    padding='post',
    truncating='post',
)
print("New shape:", vectorized_data_padded.shape)

# Sizes used to build the network: sequence length from the padded inputs,
# example and class counts from the one-hot label matrix.
sequence_len=vectorized_data_padded.shape[1]
example_count,class_count=classes_1hot.shape

#example_count,feature_count=feature_matrix.shape

#example_count,class_count=classes_1hot.shape

# Network: word indices -> embeddings -> average over the sequence -> softmax.
inp=Input(shape=(sequence_len,))
# NOTE(review): the `pretrained` matrix built earlier in this notebook is not
# passed to this layer, and the layer is 200-dimensional -- confirm whether
# the pretrained vectors were meant to be used here.
embeddings=Embedding(len(vocabulary),200)(inp)
pooling=AveragePooling1D(pool_size=sequence_len)(embeddings) # one averaged vector per example
flattened=Flatten()(pooling) # removes extra dimension
#hidden=Dense(200, activation="tanh")(flattened)
outp=Dense(class_count,activation="softmax")(flattened)
model=Model(inputs=[inp], outputs=[outp])
model.compile(optimizer="adam",loss="categorical_crossentropy",metrics=['accuracy'])

# summary() prints the layer table itself and returns None, so it is called
# directly rather than wrapped in print().
model.summary()

# validation_split=0.1 holds out a tenth of the examples for validation.
hist=model.fit(vectorized_data_padded,classes_1hot,batch_size=100,verbose=1,epochs=1,validation_split=0.1)

Using TensorFlow backend.


New shape: (25000, 100)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 200)          22230200  
_________________________________________________________________
average_pooling1d_1 (Average (None, 1, 200)            0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 402       
Total params: 22,230,602
Trainable params: 22,230,602
Non-trainable params: 0
_________________________________________________________________
None
Train on 22500 samples, validate on 2500 samples
Epoch 1/1
