In [1]:
import numpy as np
import tensorflow as tf
import math
from tensorflow.contrib.tensorboard.plugins import projector
import os

In [2]:
# Hyper-parameters and output location for the word2vec demo.
batch_size = 64            # number of (center, context) pairs fed per training step
embedding_dimension = 5    # length of each learned word vector
negative_samples = 5       # noise classes sampled per batch by the NCE loss
# Build the path with os.path.join so the separator matches the host OS
# (a hard-coded backslash only forms a nested directory on Windows).
LOG_DIR = os.path.join("logs", "word2vec_intro")

In [3]:
# Lookup table from a digit (1-9) to its capitalised English name.
digit_to_word_map = dict(enumerate(
    ["One", "Two", "Three", "Four", "Five", "Six", "Seven", "Eight", "Nine"],
    start=1,
))

# Synthetic corpus: 10000 three-word sentences, each word drawn (with
# replacement) from the odd digits 1, 3, 5, 7, 9.
odd_digits = range(1, 10, 2)
sentences = []
for _ in range(10000):
    drawn = np.random.choice(odd_digits, 3)
    sentences.append(" ".join(digit_to_word_map[d] for d in drawn))

In [4]:
# Preview the first ten generated sentences.
sentences[:10]

['Three Seven Seven',
 'Five Seven Five',
 'Five Five Seven',
 'Nine Three Nine',
 'Five Three Three',
 'Seven Five Five',
 'One Nine One',
 'One Three Five',
 'Three Nine Three',
 'Three Three Seven']

In [5]:
# Give every distinct lower-cased token an integer id, numbered from 0 in
# order of first appearance in the corpus.
word2index_map = {}
for sentence in sentences:
    for token in sentence.lower().split():
        # setdefault keeps an existing id; a new token receives the next free
        # id, which equals the number of tokens registered so far.
        word2index_map.setdefault(token, len(word2index_map))

# Reverse lookup (id -> token) and the resulting vocabulary size.
index2word_map = {token_id: token for token, token_id in word2index_map.items()}
vocabulary_size = len(index2word_map)

In [6]:
# Build the skip-gram training set: every interior word of a sentence yields
# two [center_id, context_id] pairs, one for its left neighbour and one for
# its right neighbour (in that order).
skip_gram_pairs = []
for sent in sentences:
    token_ids = [word2index_map[tok] for tok in sent.lower().split()]
    # Three staggered views of the sentence line up (left, center, right).
    for left_id, center_id, right_id in zip(token_ids, token_ids[1:], token_ids[2:]):
        skip_gram_pairs.append([center_id, left_id])
        skip_gram_pairs.append([center_id, right_id])

def get_skipgram_batch(batch_size):
    """Sample a random batch of skip-gram pairs without replacement.

    Shuffles the positions of the module-level ``skip_gram_pairs`` list and
    takes the first ``batch_size`` of them.

    Args:
        batch_size: Number of pairs to draw. If it exceeds the number of
            available pairs, all pairs are returned.

    Returns:
        A tuple ``(x, y)`` where ``x`` is a list of center-word ids and ``y``
        is a list of single-element lists holding the matching context-word
        ids.
    """
    order = list(range(len(skip_gram_pairs)))
    np.random.shuffle(order)
    chosen = [skip_gram_pairs[pos] for pos in order[:batch_size]]
    x = [pair[0] for pair in chosen]
    y = [[pair[1]] for pair in chosen]
    return x, y

In [7]:
# Inspect the first ten [center_id, context_id] pairs.
skip_gram_pairs[0:10]

[[1, 0],
 [1, 1],
 [1, 2],
 [1, 2],
 [2, 2],
 [2, 1],
 [0, 3],
 [0, 3],
 [0, 2],
 [0, 0]]

In [8]:
# Draw a small sample batch of 8 pairs to look at the batch format.
x_batch, y_batch = get_skipgram_batch(8)

In [9]:
# Center-word ids of the sampled batch.
x_batch

[1, 1, 4, 4, 1, 1, 2, 3]

In [10]:
# Context-word ids of the sampled batch, each wrapped in a one-element list.
y_batch

[[0], [2], [4], [3], [2], [2], [0], [3]]

In [11]:
# Translate the center-word ids back into words.
[index2word_map[word] for word in x_batch]

['seven', 'seven', 'one', 'one', 'seven', 'seven', 'five', 'nine']

In [12]:
# Translate the context-word ids back into words (each entry is a one-element list).
[index2word_map[word[0]] for word in y_batch]

['three', 'five', 'one', 'nine', 'five', 'five', 'three', 'nine']

In [13]:
# Graph inputs: a vector of center-word ids and a column of context-word ids,
# both fixed to batch_size rows.
train_inputs = tf.placeholder(dtype=tf.int32, shape=(batch_size,))
train_labels = tf.placeholder(dtype=tf.int32, shape=(batch_size, 1))

In [14]:
with tf.device('/cpu:0'), tf.name_scope("embeddings"):
    # One row per vocabulary word, initialised uniformly in [-1, 1).
    initial_embeddings = tf.random_uniform(
        [vocabulary_size, embedding_dimension], minval=-1.0, maxval=1.0)
    embeddings = tf.Variable(initial_embeddings, name='embedding')
    # Rows of the embedding matrix selected by the ids in train_inputs.
    embed = tf.nn.embedding_lookup(embeddings, train_inputs)
        
        

In [15]:
# The Noise-Contrastive Estimation (NCE) loss

# Output-side parameters of the NCE classifier: one weight row and one bias
# per vocabulary word.
nce_weights = tf.Variable(
    tf.truncated_normal([vocabulary_size, embedding_dimension],
                        stddev=1.0 / math.sqrt(embedding_dimension))
)
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

# Keyword arguments keep labels/inputs from being swapped if the positional
# order differs between TensorFlow releases. The labels are passed as the
# integer class ids they are (no float cast), and embed is already a float
# tensor, so it needs no cast either.
loss = tf.reduce_mean(
    tf.nn.nce_loss(weights=nce_weights,
                   biases=nce_biases,
                   labels=train_labels,
                   inputs=embed,
                   num_sampled=negative_samples,
                   num_classes=vocabulary_size))

# Register the loss so it is included in the merged TensorBoard summaries.
tf.summary.scalar("NCE_loss", loss)

<tf.Tensor 'NCE_loss:0' shape=() dtype=string>

In [16]:
# Learning rate decay

# Step counter consumed by the decay schedule; it is excluded from training.
global_step = tf.Variable(0, trainable=False)
# Start at 0.1 and multiply by 0.95 every 1000 steps (staircase => in discrete jumps).
learningRate = tf.train.exponential_decay(learning_rate=0.1,
                                          global_step=global_step,
                                          decay_steps=1000,
                                          decay_rate=0.95,
                                          staircase=True)
# Passing global_step makes the optimizer increment it on every update;
# without it the counter stays at 0 and the learning rate never decays.
train_step = tf.train.GradientDescentOptimizer(learningRate).minimize(
    loss, global_step=global_step)

In [17]:
# Training and visualizing with TensorBoard

merged = tf.summary.merge_all()

with tf.Session() as sess:
    train_writer = tf.summary.FileWriter(LOG_DIR, graph=tf.get_default_graph())
    saver = tf.train.Saver()

    try:
        # Word labels for the embedding projector: one "word<TAB>id" row per
        # vocabulary entry. The with-statement closes the file on exit.
        # NOTE(review): relies on LOG_DIR already existing at this point
        # (presumably created by the FileWriter above) - confirm.
        with open(os.path.join(LOG_DIR, 'metadata.tsv'), 'w') as metadata:
            metadata.write('Name\tClass\n')
            for k, v in index2word_map.items():
                metadata.write('%s\t%d\n' % (v, k))

        # Tell the projector which tensor holds the embeddings and where
        # their labels are stored.
        config = projector.ProjectorConfig()
        embedding = config.embeddings.add()
        embedding.tensor_name = embeddings.name

        embedding.metadata_path = os.path.join(LOG_DIR, 'metadata.tsv')
        projector.visualize_embeddings(train_writer, config)

        tf.global_variables_initializer().run()

        for step in range(1000):
            x_batch, y_batch = get_skipgram_batch(batch_size)
            summary, _ = sess.run([merged, train_step], feed_dict={train_inputs: x_batch, train_labels: y_batch})
            train_writer.add_summary(summary, step)

            # Every 100 steps: checkpoint the model and report the loss on
            # the batch that was just used for the update.
            if step % 100 == 0:
                saver.save(sess, os.path.join(LOG_DIR, "w2v_model.ckpt"), step)
                loss_value = sess.run(loss, feed_dict={train_inputs: x_batch, train_labels: y_batch})
                print("Loss at %d: %.5f" % (step, loss_value))
    finally:
        # Flush pending events and release the event file even if training
        # is interrupted.
        train_writer.close()

    # Scale every embedding row to unit L2 norm so that dot products between
    # rows are cosine similarities.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    normalized_embeddings_matrix = sess.run(normalized_embeddings)

Loss at 0: 4.69673
Loss at 100: 2.73562
Loss at 200: 2.71576
Loss at 300: 2.73619
Loss at 400: 2.70297
Loss at 500: 2.73865
Loss at 600: 2.70717
Loss at 700: 2.70096
Loss at 800: 2.71830
Loss at 900: 2.73197


In [18]:
# Rank the vocabulary by dot product with the embedding of "one"; the
# embedding rows were scaled to unit length, so this is cosine similarity.
ref_word = normalized_embeddings_matrix[word2index_map["one"]]
cosine_dists = normalized_embeddings_matrix.dot(ref_word)

# Highest score first; position 0 is skipped (expected to be "one" itself)
# and at most nine further words are kept.
ranked_ids = np.argsort(cosine_dists)[::-1]
ff = ranked_ids[1:10]

for word_id, score in zip(ff, cosine_dists[ff]):
    print(index2word_map[word_id])
    print(score)
    

three
0.650077
nine
0.500703
five
0.292319
seven
0.211901
