## Short Intro To Embeddings with Tensorflow

Goals
- Understand what an embedding is
- Perform an embedding lookup using TensorFlow
- Use a pre-trained embedding (GloVe)

In [2]:
import tensorflow as tf
import numpy as np
import os
# Show the installed TensorFlow version; the cells below use the
# graph/session API (tf.placeholder, tf.Session).
print('Tensorflow version : {0}'.format(tf.__version__))

Tensorflow version : 1.5.0


### Sample Data

In [3]:
embedding_size = 5
vocabulary_size = 10

# Build a toy embedding matrix with one row per vocabulary entry and one
# column per embedding dimension: shape (vocabulary_size, embedding_size).
# The values come from a dedicated, seeded RandomState so the matrix is
# identical on every "Restart & Run All" and NumPy's global random state
# is left untouched.
embedding = np.random.RandomState(42).rand(vocabulary_size, embedding_size)
print(embedding)

[[0.8474672  0.63517459 0.20856734 0.62158914 0.1706803 ]
 [0.01427033 0.29586182 0.15395211 0.8114454  0.92883602]
 [0.07006209 0.93486385 0.87660798 0.93495023 0.93246372]
 [0.75529191 0.45670366 0.83832113 0.96170286 0.20207429]
 [0.60661999 0.79176031 0.84172283 0.90355146 0.82368189]
 [0.9636877  0.76228184 0.55074808 0.30381757 0.82599705]
 [0.34921163 0.17196946 0.79534164 0.39571298 0.43468079]
 [0.8603585  0.87752959 0.3065835  0.02131077 0.3528051 ]
 [0.81300056 0.17322652 0.32041377 0.74049448 0.97602482]
 [0.36918957 0.52890363 0.56712384 0.8195898  0.97569215]]


In [4]:
# Create the one-hot encoding for one element of the vocabulary.
# The vector length is read from the embedding matrix (its number of rows)
# instead of being hard-coded, so the two always agree in the dot product.
i = 4
one_hot = np.zeros(embedding.shape[0])
one_hot[i] = 1.0
print(one_hot)

[0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]


In [5]:
# Multiplying the one-hot row vector by the embedding matrix selects exactly
# one row of the matrix: the embedding vector of the encoded element.
embedding_vector = one_hot.dot(embedding)
print(embedding_vector)

[0.60661999 0.79176031 0.84172283 0.90355146 0.82368189]


In [6]:
# Cross-check: indexing row i of the embedding matrix directly should print
# the same vector as the one-hot dot product.
print(embedding[i, :])

[0.60661999 0.79176031 0.84172283 0.90355146 0.82368189]


## Tensorflow Embedding Lookup

In [7]:
g = tf.Graph()
with g.as_default():
    # Placeholder for a 1-D batch of row indices to look up
    x = tf.placeholder(tf.int32, shape=[None], name='x')

    # Non-trainable variable whose initial value is the NumPy sample matrix
    weights_initializer = tf.constant_initializer(embedding)
    embedding_weights = tf.get_variable(
        'embedding_weights',
        shape=(vocabulary_size, embedding_size),
        initializer=weights_initializer,
        trainable=False,
    )

    # Embedding lookup: gather the rows of the matrix selected by x
    embedding_lookup = tf.nn.embedding_lookup(params=embedding_weights, ids=x)

In [8]:
# Fetch a single row (index 4) of the embedding matrix.
# Inside the `with` block the session is the default session, so the
# initializer op and the lookup tensor can be run/evaluated directly.
with tf.Session(graph=g) as sess:
    tf.global_variables_initializer().run()
    print(embedding_lookup.eval(feed_dict={x: [4]}))


[[0.60662    0.7917603  0.84172285 0.90355146 0.8236819 ]]


In [9]:
# Fetch several rows at once: each fed index yields one row of the output
with tf.Session(graph=g) as sess:
    sess.run(tf.global_variables_initializer())
    row_ids = [2, 4, 6]
    print(sess.run(embedding_lookup, {x: row_ids}))



[[0.07006209 0.93486387 0.87660795 0.93495023 0.9324637 ]
 [0.60662    0.7917603  0.84172285 0.90355146 0.8236819 ]
 [0.34921163 0.17196946 0.7953417  0.39571297 0.4346808 ]]


### Using GloVe Pre-Trained Model 

In [10]:
# The GloVe 6B vectors are available in 50, 100, 200 and 300 dimensions
EMBEDDING_DIMENSION = 100

# Path of the weight file that matches the chosen dimension
glove_weights_file_path = os.path.join(
    'processed',
    'glove',
    'glove.6B.{0}d.txt'.format(EMBEDDING_DIMENSION),
)
print('Using the following glove weight file : {0}'.format(glove_weights_file_path))

Using the following glove weight file : processed/glove/glove.6B.100d.txt


In [11]:
# look at some sample rows
!head -3 processed/glove/glove.6B.100d.txt

the -0.038194 -0.24487 0.72812 -0.39961 0.083172 0.043953 -0.39141 0.3344 -0.57545 0.087459 0.28787 -0.06731 0.30906 -0.26384 -0.13231 -0.20757 0.33395 -0.33848 -0.31743 -0.48336 0.1464 -0.37304 0.34577 0.052041 0.44946 -0.46971 0.02628 -0.54155 -0.15518 -0.14107 -0.039722 0.28277 0.14393 0.23464 -0.31021 0.086173 0.20397 0.52624 0.17164 -0.082378 -0.71787 -0.41531 0.20335 -0.12763 0.41367 0.55187 0.57908 -0.33477 -0.36559 -0.54857 -0.062892 0.26584 0.30205 0.99775 -0.80481 -3.0243 0.01254 -0.36942 2.2167 0.72201 -0.24978 0.92136 0.034514 0.46745 1.1079 -0.19358 -0.074575 0.23353 -0.052062 -0.22044 0.057162 -0.15806 -0.30798 -0.41625 0.37972 0.15006 -0.53212 -0.2055 -1.2526 0.071624 0.70565 0.49744 -0.42063 0.26148 -1.538 -0.30223 -0.073438 -0.28312 0.37104 -0.25217 0.016215 -0.017099 -0.38984 0.87424 -0.72569 -0.51058 -0.52028 -0.1459 0.8278 0.27062
, -0.10767 0.11053 0.59812 -0.54361 0.67396 0.10663 0.038867 0.35481 0.06351 -0.094189 0.15786 -0.81665 0.14172 0.21939 0.58505 -0.52158 

In [12]:
glove_weights = []
word2idx = {}
vocabulary_size = 40000 # limit vocab to top 40K terms
vocabulary = []

# Each line holds a word followed by its weights, separated by whitespace.
# The file is decoded explicitly as UTF-8 (assumed encoding of the GloVe
# text files) so loading does not depend on the platform default encoding.
with open(glove_weights_file_path, 'r', encoding='utf-8') as glove_file:
    # Pairing the lines with range() stops after vocabulary_size lines and
    # reads nothing at all when the limit is 0.
    for index, line in zip(range(vocabulary_size), glove_file):
        values = line.split() # Word and weights separated by space
        word = values[0] # Word is first symbol on each line
        vocabulary.append(word)
        word_weights = np.asarray(values[1:], dtype=np.float32) # Remainder of line is weights for word
        word2idx[word] = index # Row index of this word in glove_weights
        glove_weights.append(word_weights)

glove_weights = np.asarray(glove_weights, dtype=np.float32)

# If the file holds fewer lines than the requested limit, shrink the limit so
# vocabulary_size always equals the number of rows actually loaded.
vocabulary_size = len(vocabulary)

In [13]:
# Sanity check: (number of loaded words, embedding dimension)
np.shape(glove_weights)

(40000, 100)

In [14]:
# Query words whose nearest neighbours will be inspected.
# Another set worth trying: ["paris", "london", "rome", "berlin"]
words = ["man", "woman"]

# Translate each query word into its row index in the GloVe matrix
words_indices = [word2idx[token] for token in words]
words_indices

[300, 787]

In [15]:
g = tf.Graph()

with g.as_default():
    # Placeholder for a 1-D batch of word indices
    x = tf.placeholder(tf.int32, shape=[None], name='x')

    # Non-trainable variable initialised from the loaded GloVe matrix
    weights_initializer = tf.constant_initializer(glove_weights)
    embedding_weights = tf.get_variable(
        'embedding_weights',
        shape=(vocabulary_size, EMBEDDING_DIMENSION),
        initializer=weights_initializer,
        trainable=False,
    )

    # Embedding lookup: raw vectors of the requested words
    embedding_lookup = tf.nn.embedding_lookup(params=embedding_weights, ids=x)

    # Cosine similarity: scale every row to unit L2 norm, so the dot product
    # of two rows equals the cosine of the angle between them.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embedding_weights), axis=1, keepdims=True))
    normalized_embeddings = embedding_weights / norm
    valid_embeddings = tf.nn.embedding_lookup(params=normalized_embeddings, ids=x)

    # One row per queried word, one column per vocabulary word
    similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)
    


In [16]:
with tf.Session(graph=g) as sess:
    sess.run(tf.global_variables_initializer())
    # Fetch the raw vectors and the similarity rows in a single run
    result, sim = sess.run([embedding_lookup, similarity],
                           feed_dict={x : words_indices})

top_k = 10 # number of nearest neighbors

print('Shape of Similarity Matrix: {0}'.format(sim.shape))

# `row` is used as the loop variable so the notebook-level `i` set in an
# earlier cell is not overwritten.
for row, word_index in enumerate(words_indices):
    # Vocabulary indices sorted from most to least similar to the query word
    ranked = (-sim[row, :]).argsort()

    # Drop the query word itself by index rather than assuming it is ranked
    # first, then keep at most top_k neighbours.
    nearest = [idx for idx in ranked[:top_k + 1] if idx != word_index][:top_k]

    close_words = ''.join(' {0},'.format(vocabulary[idx]) for idx in nearest)
    print('Nearest to {0} :{1}'.format(vocabulary[word_index], close_words))


Shape of Similarity Matrix: (2, 40000)
Nearest to man : woman, boy, one, person, another, old, life, father, turned, who,
Nearest to woman : girl, man, mother, boy, she, child, wife, her, herself, daughter,
