In [1]:
import numpy as np
np.random.seed(46)

### Steps:

Generate random word vectors

Generate a random lexicon between them

Use the lexicon to update the original word vectors such that the new word vectors are close to the original word vectors and neighbours in the lexicon.

alpha is the parameter which specifies the closeness to the original vector

beta is the parameter which specifies the closeness to the neighbours in the lexicon

papar used an euclidian distance metric.

In [2]:
# random word vectors
word_vectors = np.random.randn(50,300)

In [3]:
# defined words
words = ["word_" + str(x) for x in range(50)]

In [4]:
words2index = {v:k for k, v in enumerate(words)}
index2words = {k:v for k, v in enumerate(words)}

In [5]:
# taking a sample of 5 words to create a lexicon
sampled_words_for_lexicon = np.random.randint(0,10,3)
sampled_words_for_lexicon_2 = np.random.randint(13,45,2)

In [6]:
sampled_words_for_lexicon

array([8, 1, 9])

In [7]:
sampled_words_for_lexicon_2

array([29, 16])

In [8]:
# random lexicon
lexicon = {}
for each_word in sampled_words_for_lexicon:
    # each word is connected all the other words in this sample
    to_connect = list(set(sampled_words_for_lexicon) - set([each_word]))
    lexicon["word_" + str(each_word)] = ["word_" + str(x) for x in to_connect]
    
for each_word in sampled_words_for_lexicon_2:
    # each word is connected all the other words in this sample
    to_connect = list(set(sampled_words_for_lexicon_2) - set([each_word]))
    lexicon["word_" + str(each_word)] = ["word_" + str(x) for x in to_connect]

In [9]:
lexicon

{'word_8': ['word_1', 'word_9'],
 'word_1': ['word_8', 'word_9'],
 'word_9': ['word_8', 'word_1'],
 'word_29': ['word_16'],
 'word_16': ['word_29']}

In [10]:
# converting lexicon into indices
lexicon_i = {}
for k, v in lexicon.items():
    lexicon_i[words2index[k]] = [words2index[x] for x in v]

In [11]:
lexicon_i

{8: [1, 9], 1: [8, 9], 9: [8, 1], 29: [16], 16: [29]}

In [156]:
# using lexicon to update word vectors
def retrofit(word_vectors, lexicon_i, n_iters= 10, alpha=0.95, beta=0.05):
    new_word_vectors = word_vectors.copy()
    words_in_lexicon = lexicon_i.keys()
    # n_iters is the num of steps of iterative update
    # alpha is the closeness to original
    # beta is the closeness to neighbours in the lexicon
    for each_iter in range(n_iters):
        # update the words only in the lexicon
        for each_word in words_in_lexicon:
            # update
            len_conn = len(lexicon_i[each_word])
            if len_conn > 0:
                new_word_vectors[each_word,:] = alpha * len_conn * new_word_vectors[each_word,:]
                for each_connection in lexicon_i[each_word]:
                    new_word_vectors[each_word,:] += beta * new_word_vectors[each_connection,:]
                # normalization
                new_word_vectors[each_word,:] = (new_word_vectors[each_word,:]) / len_conn * (alpha + beta)
    return new_word_vectors

In [157]:
lexicon_i

{8: [1, 9], 1: [8, 9], 9: [8, 1], 29: [16], 16: [29]}

In [158]:
new_word_vectors = retrofit(word_vectors, lexicon_i,  n_iters = 10)

In [159]:
#  did it work
# 8, 1, 9 should be closer
# checking distance between old vectors and comparing with distances between new vectors 
dist = np.linalg.norm(word_vectors[8,:] - word_vectors[1,:])
print("old distance {}".format(dist))

dist = np.linalg.norm(new_word_vectors[8,:] - new_word_vectors[1,:])
print("new distance {}".format(dist))


old distance 25.301451046423388
new distance 11.724400082997878


In [160]:
#  did it work
# 16 and 29 should be closer
# checking distance between old vectors and comparing with distances between new vectors 
dist = np.linalg.norm(word_vectors[16,:] - word_vectors[29,:])
print("old distance {}".format(dist))

dist = np.linalg.norm(new_word_vectors[16,:] - new_word_vectors[29,:])
print("new distance {}".format(dist))

old distance 23.402891658840684
new distance 8.389607203346678


In [161]:
lexicon_i

{8: [1, 9], 1: [8, 9], 9: [8, 1], 29: [16], 16: [29]}

In [162]:
# checking distance between old vector and updated new vector
distances = []
for each_word, _ in lexicon_i.items():
    distances.append(np.linalg.norm(new_word_vectors[each_word,:] - word_vectors[each_word,:]))

In [163]:
np.mean(distances)

7.7739978878082265