In [1]:
import numpy as np
np.random.seed(46)

In [2]:
# got a lexicon from the internet
import csv
with open("framenet.txt",'r') as f:
    lines = f.readlines()

In [3]:
lines = [x.strip() for x in lines]

In [4]:
lines[0]

'source'

In [5]:
# creating lexicon 
lexicon = {}
for each_line in lines:
    words = each_line.split(' ')
    if len(words) > 1:
        lexicon[words[0]] = words[1:]

In [6]:
len(lexicon)

7855

In [7]:
from gensim.models.keyedvectors import KeyedVectors

model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300-SLIM.bin', binary=True)

In [8]:
model.vectors.shape

(299567, 300)

In [9]:
word_vectors = model.vectors

In [10]:
vocab= list(model.vocab.keys())

In [11]:
# match between lexicon and voacb
len(set(vocab).intersection(set(list(lexicon.keys()))))

7501

In [12]:
words2index = {v:k for k, v in enumerate(vocab)}
index2words = {k:v for k, v in enumerate(vocab)}

In [13]:
# converting lexicon into indices
lexicon_i = {}
for k, v in lexicon.items():
    if k in vocab:
        temp = [words2index.get(x,'') for x in v]
        temp = [x for x in temp if x]
        lexicon_i[words2index[k]] = temp

In [14]:
index2words[30168]

'sprinkle'

In [15]:
lexicon[index2words[30168]]

['rainfall',
 'downpour',
 'snow',
 'snowfall',
 'drizzle',
 'sleet',
 'shower',
 'rain',
 'torrential',
 'precipitation',
 'torrent',
 'hail']

In [16]:
lexicon_i[30168]

[10233,
 29030,
 2322,
 15580,
 27278,
 27959,
 9168,
 2209,
 74333,
 15151,
 25468,
 12145]

In [17]:
# using lexicon to update word vectors
def retrofit(word_vectors, lexicon_i, n_iters= 10, alpha=0.95, beta=0.05):
    new_word_vectors = word_vectors.copy()
    words_in_lexicon = lexicon_i.keys()
    # n_iters is the num of steps of iterative update
    # alpha is the closeness to original
    # beta is the closeness to neighbours in the lexicon
    for each_iter in range(n_iters):
        # update the words only in the lexicon
        for each_word in words_in_lexicon:
            # update
            len_conn = len(lexicon_i[each_word])
            if len_conn > 0:
                new_word_vectors[each_word,:] = alpha * len_conn * new_word_vectors[each_word,:]
                for each_connection in lexicon_i[each_word]:
                    new_word_vectors[each_word,:] += beta * new_word_vectors[each_connection,:]
                # normalization
                new_word_vectors[each_word,:] = (new_word_vectors[each_word,:]) / len_conn * (alpha + beta)
    return new_word_vectors

In [19]:
new_word_vectors = retrofit(word_vectors, lexicon_i,  n_iters = 10, alpha =0.95, beta= 0.05)

In [20]:
new_word_vectors.shape

(299567, 300)

In [21]:
import copy

In [22]:
new_model = copy.deepcopy(model)

In [23]:
new_model.vectors = new_word_vectors

In [30]:
print("Word: {}".format(index2words[30168]))
print("Words connected in lexicon: {}".format(lexicon[index2words[30168]]))

Word: sprinkle
Words connected in lexicon: ['rainfall', 'downpour', 'snow', 'snowfall', 'drizzle', 'sleet', 'shower', 'rain', 'torrential', 'precipitation', 'torrent', 'hail']


### are the words in lexicon at a lesser distance in retrofit model

yes

In [25]:
new_model.most_similar("sprinkle",topn=30)

[('drizzle', 0.737594485282898),
 ('sprinkling', 0.7267799973487854),
 ('sprinkles', 0.6991304159164429),
 ('sprinkled', 0.6447292566299438),
 ('rain', 0.618393063545227),
 ('moistened', 0.6180086731910706),
 ('drizzled', 0.6157044172286987),
 ('Sprinkle', 0.6119532585144043),
 ('moisten', 0.6049048900604248),
 ('shower', 0.6036059856414795),
 ('sleet', 0.6020184755325317),
 ('drench', 0.5991118550300598),
 ('tablespoon', 0.597425639629364),
 ('downpour', 0.5962713956832886),
 ('drizzling', 0.5769487619400024),
 ('dab', 0.57546067237854),
 ('Moisten', 0.5751141905784607),
 ('glaze', 0.5705564022064209),
 ('precipitation', 0.5693178176879883),
 ('soak', 0.5669090151786804),
 ('tbsp', 0.5659164190292358),
 ('pour', 0.5633883476257324),
 ('tsp', 0.5618157386779785),
 ('snow', 0.559573769569397),
 ('Drizzle', 0.558053731918335),
 ('spoonful', 0.5555875897407532),
 ('cupfuls', 0.5552812814712524),
 ('teaspoon', 0.5501717329025269),
 ('rainfall', 0.549561083316803),
 ('hail', 0.5488933324813

### are the original embeddings preserved?

yes

sprinkling original cosine is 0.73, in the retrofit model the distance is 0.72

In [26]:
model.most_similar("sprinkle",topn=15)

[('sprinkling', 0.7351713180541992),
 ('sprinkled', 0.6881633400917053),
 ('Sprinkle', 0.6744049191474915),
 ('sprinkles', 0.6474019885063171),
 ('tablespoon', 0.5909141302108765),
 ('tsp', 0.5867205858230591),
 ('tbsp', 0.5865833163261414),
 ('Garnish', 0.5789077281951904),
 ('Moisten', 0.5742171406745911),
 ('dollop', 0.573137104511261),
 ('drizzled', 0.563011646270752),
 ('tablespoons', 0.5598653554916382),
 ('Tbsp', 0.5437169075012207),
 ('spoonful', 0.5397160053253174),
 ('Drizzle', 0.5386276245117188)]

In [31]:
new_model.similarity("sprinkle","sprinkling")

0.72677994

In [32]:
#  did it work
# checking distance between old vectors and comparing with distances between new vectors 
dist = np.linalg.norm(word_vectors[30168,:] - word_vectors[10233,:])
print("old distance {}".format(dist))

dist = np.linalg.norm(new_word_vectors[30168,:] - new_word_vectors[10233,:])
print("new distance {}".format(dist))


old distance 1.2846770286560059
new distance 0.7379277348518372


In [33]:
# checking distance between old vector and updated new vector
distances = []
for each_word, _ in lexicon_i.items():
    distances.append(np.linalg.norm(new_word_vectors[each_word,:] - word_vectors[each_word,:]))

In [34]:
np.mean(distances)

0.3467867