Implementing a Word2Vec Model <br>
&#9673; Source: https://adventuresinmachinelearning.com/word2vec-keras-tutorial/

In [12]:
from keras.models import Model
from keras.layers import Input, Dense, Reshape, merge
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import skipgrams
from keras.preprocessing import sequence

import urllib
import collections
import os
import zipfile

import numpy as np
import pandas as pd
import tensorflow as tf

def maybe_download(filename, url, expected_bytes):
    """Download a file if not present, and make sure it's the right size.

    Args:
        filename: Local path of the file; also appended to ``url`` to form
            the download address when the file does not exist yet.
        url: Base URL that ``filename`` is appended to.
        expected_bytes: Size in bytes the file on disk must have.

    Returns:
        The path of the verified local file.

    Raises:
        Exception: If the size of the file on disk differs from
            ``expected_bytes`` (the actual size is printed first).
    """
    # ``import urllib`` alone does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly where it is used.
    import urllib.request

    if not os.path.exists(filename):
        filename, _ = urllib.request.urlretrieve(url + filename, filename)

    actual_bytes = os.stat(filename).st_size
    if actual_bytes != expected_bytes:
        print(actual_bytes)
        raise Exception(
            'Failed to verify ' + filename + '. Can you get to it with a browser?')

    print('Found and verified', filename)
    return filename

In [4]:
# Standard setup - no need to run every time: maybe_download skips the
# download when the file already exists locally.
###########################
# maybe_download also checks that the size of the file on disk equals expected_bytes.
dataDir = "Datasets/"
plotDir = "Datasets/Plots/"

# Data source: base URL that the archive name is appended to
url = 'http://mattmahoney.net/dc/'
filename = maybe_download('text8.zip', url, 31344016)

Found and verified text8.zip


In [23]:
# Read the data into a list of strings.
# Note: ZipFile.read returns bytes, so the content is decoded to text (UTF-8)
# before it is split on whitespace.
def read_data(filename):
    """Extract the first file enclosed in a zip file as a list of words.

    Args:
        filename: Path of the zip archive to read.

    Returns:
        A list of the whitespace-separated tokens of the first archive member.
    """
    with zipfile.ZipFile(filename) as archive:
        first_member = archive.namelist()[0]
        # Plain bytes.decode is enough here; no third-party helper is needed.
        text = archive.read(first_member).decode('utf-8')
    return text.split()

def build_dataset(words, n_words):
    """Turn a sequence of words into integer ids plus lookup tables.

    Args:
        words: Iterable of word tokens (the corpus, in order).
        n_words: Vocabulary size; the ``n_words - 1`` most common words are
            kept and the 'UNK' entry takes the remaining slot (index 0).

    Returns:
        A tuple ``(data, count, dictionary, reversed_dictionary)`` where
        ``data`` is the corpus as a list of word ids, ``count`` lists
        ``['UNK', unk_count]`` followed by the ``(word, frequency)`` pairs,
        ``dictionary`` maps word -> id and ``reversed_dictionary`` maps
        id -> word.
    """
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(n_words - 1))

    # Ids are handed out in frequency order, with 'UNK' first.
    dictionary = {}
    for token, _ in count:
        dictionary[token] = len(dictionary)

    data = []
    unk_count = 0
    lookup = dictionary.get
    for token in words:
        token_id = lookup(token)
        if token_id is None:
            # Out-of-vocabulary word: map to dictionary['UNK'] and tally it.
            token_id = 0
            unk_count += 1
        data.append(token_id)

    count[0][1] = unk_count
    reversed_dictionary = {token_id: token for token, token_id in dictionary.items()}
    return data, count, dictionary, reversed_dictionary

#This function ties together maybe_download, read_data and build_dataset
def collect_data(vocabulary_size):
    """Fetch the text8 archive if needed and convert it into an indexed dataset.

    Args:
        vocabulary_size: Vocabulary size forwarded to ``build_dataset``.

    Returns:
        The ``(data, count, dictionary, reverse_dictionary)`` tuple produced
        by ``build_dataset``.
    """
    archive = maybe_download('text8.zip', 'http://mattmahoney.net/dc/', 31344016)
    vocabulary = read_data(archive)
    print(vocabulary[:7])
    dataset = build_dataset(vocabulary, vocabulary_size)
    del vocabulary  # Hint to reduce memory.
    data, count, dictionary, reverse_dictionary = dataset
    return data, count, dictionary, reverse_dictionary


In [24]:
# Vocabulary size: build_dataset keeps the (vocab_size - 1) most common words plus the 'UNK' entry.
vocab_size = 10000
data, count, dictionary, reverse_dictionary = collect_data(vocabulary_size=vocab_size)
print(data[:7])  # ids of the first seven words of the corpus

Found and verified text8.zip
['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse']
[5234, 3081, 12, 6, 195, 2, 3134]


In [26]:
window_size = 3 # window of words around the target word from which the context words are drawn
vector_dim = 300 # size of each word embedding vector (the embedding layer is vocab_size x vector_dim, i.e. 10,000 x 300)
epochs = 1000000 # number of training-loop iterations; each iteration trains on one randomly picked sample

# Validation words: their nearest neighbours are printed periodically to see which words grow in similarity.
valid_size = 16     # Number of validation words to evaluate similarity on.
valid_window = 100  # Only pick validation ids below 100, i.e. from the head of the distribution (ids follow frequency order).
valid_examples = np.random.choice(valid_window, valid_size, replace=False)  # distinct ids; no random seed is set

In [None]:
# Keras skipgrams function - returns the word couples in the form (target, context)
# and a matching label of 1 or 0, depending on whether the context word is a true context word or a negative sample.
# By default, it returns randomly shuffled couples and labels.
sampling_table = sequence.make_sampling_table(vocab_size)
couples, labels = skipgrams(data, vocab_size, window_size=window_size, sampling_table=sampling_table)
# Split the (target, context) pairs into two parallel int32 arrays.
word_target, word_context = zip(*couples)
word_target = np.array(word_target, dtype="int32")
word_context = np.array(word_context, dtype="int32")

# Show the first ten couples and their labels as a sanity check.
print(couples[:10], labels[:10])

In [None]:
# Using the Keras functional API: a single embedding layer is shared between two
# input tensors, and an auxiliary output measures similarity, so a straightforward
# Sequential model cannot be used.

# Create the input variables: one word id per sample for the target and for the context word.
input_target = Input((1,))
input_context = Input((1,))

# One embedding layer (vocab_size x vector_dim); it is applied to both inputs in a later cell.
embedding = Embedding(vocab_size, vector_dim, input_length=1, name='embedding')

In [None]:
# The embedding vector is retrieved by calling the previously created embedding layer
# on the word-id inputs (input_target and input_context). Each word vector is then
# reshaped with a Keras Reshape layer to (vector_dim, 1), ready for the dot product
# and similarity operations that follow.

target = embedding(input_target)
target = Reshape((vector_dim, 1))(target)
# The same embedding layer object is reused, so both inputs share its weights.
context = embedding(input_context)
context = Reshape((vector_dim, 1))(context)

In [None]:
# Set up a cosine similarity operation between the two embedded words; it becomes
# the output of the secondary (validation) model and is not part of the training output.
similarity = merge([target, context], mode='cos', dot_axes=0)

In [None]:
# Perform the dot product of the target and context vectors to get a similarity measure,
# then reshape it to a single value per sample.
dot_product = merge([target, context], mode='dot', dot_axes=1)
dot_product = Reshape((1,))(dot_product)

# Add the sigmoid output layer; its output is trained against the 0/1 skip-gram labels.
output = Dense(1, activation='sigmoid')(dot_product)

In [None]:
# Create the primary training model: two word-id inputs, one sigmoid output.
model = Model(input=[input_target, input_context], output=output)
# Binary cross-entropy matches the 0/1 labels.
model.compile(loss='binary_crossentropy', optimizer='rmsprop')

In [None]:
# Create a secondary validation model to run similarity checks during training.

# This validation_model exposes the cosine similarity operation and is built on the
# same input tensors and embedding layer as the primary model. Because this model
# is not involved in training, no Keras compile step is run on it.

validation_model = Model(input=[input_target, input_context], output=similarity)

In [None]:
# We want to create a “callback” which we can use to figure out which words are closest in 
# similarity to our validation examples, so we can monitor the training progress of our embedding layer.

class SimilarityCallback:
    """Prints the nearest neighbours of the validation words.

    Relies on notebook-level names: ``valid_size``, ``valid_examples``,
    ``reverse_dictionary``, ``vocab_size`` and ``validation_model``.
    """

    def run_sim(self):
        """Print, for every validation word, its ``top_k`` most similar words."""
        top_k = 8  # number of nearest neighbors
        for i in range(valid_size):
            valid_word = reverse_dictionary[valid_examples[i]]
            sim = self._get_sim(valid_examples[i])
            # Sort by descending similarity and skip the first entry, which
            # is expected to be the validation word itself.
            nearest = (-sim).argsort()[1:top_k + 1]
            log_str = 'Nearest to %s:' % valid_word
            for k in range(top_k):
                close_word = reverse_dictionary[nearest[k]]
                log_str = '%s %s,' % (log_str, close_word)
            print(log_str)

    @staticmethod
    def _get_sim(valid_word_idx):
        """Return the similarity of ``valid_word_idx`` to every vocabulary id.

        Args:
            valid_word_idx: Integer id of the word to compare against.

        Returns:
            A float array of length ``vocab_size``; entry ``i`` holds the
            validation model's output for the pair ``(valid_word_idx, i)``.
        """
        sim = np.zeros((vocab_size,))
        in_arr1 = np.zeros((1,))
        in_arr2 = np.zeros((1,))
        # The query word never changes inside the loop, so set it once.
        in_arr1[0,] = valid_word_idx
        for i in range(vocab_size):
            in_arr2[0,] = i
            out = validation_model.predict_on_batch([in_arr1, in_arr2])
            # The prediction is a size-1 array; extract the scalar explicitly
            # instead of relying on implicit array-to-scalar conversion.
            sim[i] = np.asarray(out).item()
        return sim
sim_cb = SimilarityCallback()

In [None]:
#Main Training loop in the model

# In this loop, we run through the total number of epochs.  First, we select a random index from our word_target, 
# word_context and labels arrays and place the values in dummy numpy arrays.  Then we supply the input 
# ([word_target, word_context]) and outputs (labels) to the primary model and run a train_on_batch() operation.  
# This returns the current loss evaluation, loss, of the model and prints it. Every 10,000 iterations we also run functions 
# in the SimilarityCallback.

# Single-element buffers reused on every iteration: target id, context id, label.
arr_1 = np.zeros((1,))
arr_2 = np.zeros((1,))
arr_3 = np.zeros((1,))
for cnt in range(epochs):
    # randint's upper bound is exclusive, so len(labels) makes every sample
    # (including the last one) eligible.
    idx = np.random.randint(0, len(labels))
    arr_1[0,] = word_target[idx]
    arr_2[0,] = word_context[idx]
    arr_3[0,] = labels[idx]
    loss = model.train_on_batch([arr_1, arr_2], arr_3)
    # Report the loss every 100 iterations, keyed on the loop counter.
    if cnt % 100 == 0:
        print("Iteration {}, loss={}".format(cnt, loss))
    # Every 10,000 iterations, print the nearest neighbours of the validation words.
    if cnt % 10000 == 0:
        sim_cb.run_sim()