In [88]:
import numpy as np
from tqdm import tqdm
class WordEmbeddings():
    def __init__(self, filename):
        self.filename = filename
        self.word_frequency = dict()
        self.word2id = dict()
        self.id2word = dict()
        self.vocab_size = 0
        self.embeddings = None
        for i, line in enumerate(open(self.filename, encoding="utf8")):
            if i == 0:
                line = line.split()
                self.vocab_size = int(line[0])
                self.embedding_size = int(line[1])
                self.embeddings = np.zeros((self.vocab_size, self.embedding_size))
                continue
            line = line.split(' ', 1)
            word = line[0]
            self.word2id[word] = i - 1
            self.id2word[i - 1] = word
            self.embeddings[i - 1, :] = np.fromstring(line[1], dtype=float, sep=' ')
    
    def embedding_for(self, word):
        ind = self.word2id[word]
        return self.embeddings[ind, :]
    

In [31]:
we_penn = WordEmbeddings("./embeddings/out_enwik8_penn_500dim_5wind.vec")
we_word2vec = WordEmbeddings("./embeddings/out_enwik8_w2v.vec")

In [111]:
from sklearn.metrics.pairwise import cosine_similarity
def vectorize_word(we, word):
    vec = None
    if isinstance(word, np.ndarray):
        vec = word
    if isinstance(word, str):
        vec = we.embedding_for(word)
    return vec


def similarity(vec1, vec2):
    vec = vec.reshape((1, -1))
    return cosine_similarity(vec, vec2)
  
    
def nearest_words(we, word, top_n=10):
    vec = vectorize_word(we, word)
    vec = vec.reshape((1, -1))
    cosines = cosine_similarity(vec, we.embeddings)
    top10_ind = np.argsort(cosines)[0][::-1][1:top_n+1]
    neighbors = [(we.id2word[word_ind], cosines[0][word_ind]) for i, word_ind in enumerate(top10_ind)]
    return neighbors


def nearest_words_to_pairs_addition(we, word_a, word_b, top_n=10):
    vec_a = vectorize_word(we, word_a)
    vec_b = vectorize_word(we, word_b)
    vec = vec_a + vec_b
    vec = vec.reshape((1, -1))
    cosines = cosine_similarity(vec, we.embeddings)
    top10_ind = np.argsort(cosines)[0][::-1][1:top_n+1]
    neighbors = [(we.id2word[word_ind], cosines[0][word_ind]) for i, word_ind in enumerate(top10_ind)]
    return neighbors


def best_cosine(we, top_n=10):
    best_cosins = np.zeros(we.vocab_size, dtype="float")
    best_cos_pair = np.zeros(we.vocab_size, dtype="int64")
    for ind in tqdm(we.word2id.values()):
        #ind = 0
        if ind > 100:
            break
        vec = we.embeddings[ind, :]
        vec = vec.reshape((1, -1))
        cosines = cosine_similarity(vec, we.embeddings)
        word_id = np.argsort(cosines[0])[::-1][1] 
        best_cosins[ind] = cosines[0][word_id]
        best_cos_pair[ind] = word_id
    #print(best_cosins[:110])
    #print(best_cos_pair[:110])
    top_cos_args_id = np.argsort(best_cosins)[::-1][0:top_n]
    best_pairs = [(we.id2word[ind], we.id2word[best_cos_pair[ind]], best_cosins[ind]) 
                  for i, ind in enumerate(top_cos_args_id)]
    return best_pairs
    
    
def compare(we_a, we_b, word_a, word_b):
    print("Words: {} + {}".format(word_a, word_b))
    print("Word2vec addition:\n", nearest_words_to_pairs_addition(we_a, word_b, word_a), "\n")
    print("PENN addition:\n", nearest_words_to_pairs_addition(we_b, word_b, word_a), "\n")

In [112]:
best_cosine(we_word2vec)


  0%|                                                                                        | 0/11731 [00:00<?, ?it/s]
  0%|                                                                                | 2/11731 [00:00<19:12, 10.18it/s]
  0%|                                                                                | 3/11731 [00:00<20:24,  9.58it/s]
  0%|                                                                                | 4/11731 [00:00<21:03,  9.28it/s]
  0%|                                                                                | 5/11731 [00:00<21:59,  8.89it/s]
  0%|                                                                                | 6/11731 [00:00<22:10,  8.81it/s]
  0%|                                                                                | 7/11731 [00:00<23:03,  8.47it/s]
  0%|                                                                                | 8/11731 [00:00<23:20,  8.37it/s]
  0%|                                  

  1%|▌                                                                              | 80/11731 [00:08<20:32,  9.45it/s]
  1%|▌                                                                              | 81/11731 [00:09<20:21,  9.54it/s]
  1%|▌                                                                              | 83/11731 [00:09<20:57,  9.26it/s]
  1%|▌                                                                              | 84/11731 [00:09<21:41,  8.95it/s]
  1%|▌                                                                              | 85/11731 [00:09<22:43,  8.54it/s]
  1%|▌                                                                              | 86/11731 [00:09<22:38,  8.57it/s]
  1%|▌                                                                              | 87/11731 [00:09<22:14,  8.73it/s]
  1%|▌                                                                              | 88/11731 [00:09<21:57,  8.84it/s]
  1%|▌                                  

[0.93298813 0.76309549 0.63372499 0.83800261 0.67083864 0.63562201
 0.43639846 0.71308091 0.57885185 0.75340587 0.7413149  0.79158781
 0.78730249 0.73554745 0.86567987 0.62747356 0.73574825 0.61161891
 0.751909   0.64655957 0.64073225 0.79571915 0.56412411 0.50868156
 0.90302377 0.66214851 0.70006451 0.74045488 0.89959203 0.83687512
 0.7904317  0.80693749 0.86375043 0.75933341 0.87784    0.81496899
 0.76913132 0.80810431 0.80567724 0.83661703 0.77189962 0.86310027
 0.66213987 0.79323768 0.72163339 0.57224336 0.86088719 0.89531706
 0.92219036 0.67076019 0.64085034 0.86694866 0.63633569 0.80904913
 0.78105747 0.61128843 0.6734355  0.79133316 0.7462925  0.60183897
 0.65539938 0.92157905 0.70505401 0.820533   0.66682051 0.84294975
 0.76395816 0.9009661  0.55729284 0.73733714 0.71725512 0.44378967
 0.77532336 0.72631684 0.66861082 0.85185821 0.57343762 0.72027217
 0.70253603 0.82211399 0.7020259  0.65133724 0.71968435 0.6959185
 0.77394089 0.73702804 0.5170715  0.61408094 0.81649785 0.76346

[('anarchism', 'capitalist', 0.9329881297461052),
 ('authoritarian', 'ideological', 0.9221903590010347),
 ('voluntary', 'welfare', 0.9215790503997762),
 ('rothbard', 'anarcho', 0.9121954259992963),
 ('positive', 'negative', 0.9030237677260745),
 ('governance', 'welfare', 0.9009660997313156),
 ('anarchist', 'anarcho', 0.8995920334109205),
 ('elimination', 'additive', 0.8953170597637081),
 ('king', 'castile', 0.8778399954593179),
 ('anarchy', 'criticise', 0.8669486602767631)]

In [73]:
nearest_words(we_word2vec, "anarchism")

[('capitalist', 0.9329881297461052),
 ('capitalism', 0.9026125478341888),
 ('anarchist', 0.8945130504426946),
 ('anarcho', 0.8944931630226096),
 ('libertarian', 0.8798084819004081),
 ('faire', 0.867901212119998),
 ('liberalism', 0.8544044630409507),
 ('laissez', 0.8279008774309387),
 ('rothbard', 0.8255568321545554),
 ('communism', 0.8160838067515472)]

In [74]:
nearest_words(we_penn, "anarchism")

[('individualist', 0.8593052069099795),
 ('anarchist', 0.8471530492275303),
 ('rothbard', 0.8316277232633484),
 ('metaphysical', 0.8207106134074114),
 ('zionism', 0.8126567946987308),
 ('rejection', 0.8126141317278897),
 ('authoritarian', 0.8109840579306388),
 ('contend', 0.8101501365925213),
 ('aclu', 0.8078392413248282),
 ('contradict', 0.8075189621300534)]

In [78]:
nearest_words_to_pairs_addition(we_word2vec, 'soviet', 'union')

[('soviet', 0.8919573507156756),
 ('guerrilla', 0.7668216671653967),
 ('warsaw', 0.7589129466196749),
 ('dissident', 0.7412740590799167),
 ('veteran', 0.7346475360194393),
 ('coup', 0.7303312014641579),
 ('liberate', 0.7273080260705842),
 ('neutrality', 0.7189481155538215),
 ('pact', 0.7172397340185512),
 ('ussr', 0.712835335414549)]

In [76]:
nearest_words_to_pairs_addition(we_penn, 'soviet', 'union')

[('union', 0.8449629208665597),
 ('confederate', 0.6656608921533554),
 ('slave', 0.6369609214171088),
 ('invasion', 0.6344407750103969),
 ('warsaw', 0.6328482311938771),
 ('ally', 0.6253420991806693),
 ('secession', 0.6177441658627295),
 ('occupation', 0.6123611501317596),
 ('seize', 0.598720415174633),
 ('liberty', 0.5952388067064349)]

In [77]:
compare(we_word2vec, we_penn, "boy", "girl")

Words: boy + girl
Word2vec addition:
 [('boy', 0.90739676267495), ('thirteen', 0.729378889200718), ('teenage', 0.7238851521057705), ('beautiful', 0.7080131873160405), ('rap', 0.7046934852354403), ('astro', 0.6931712679654263), ('marple', 0.69063418681593), ('sibling', 0.685468327135792), ('kid', 0.6823671121096673), ('chop', 0.6742604057487516)] 

PENN addition:
 [('girl', 0.8391505005872348), ('aisha', 0.7091762278700058), ('catherine', 0.7079567266385662), ('aphrodite', 0.7056407155880299), ('margaret', 0.6983328860558038), ('wicked', 0.6932060399283371), ('pregnant', 0.6914340390000767), ('gabriel', 0.6898858371339746), ('pretend', 0.6894373559795783), ('pitcher', 0.6885295168140809)] 

