In [21]:
import numpy as np
class WordEmbeddings():
    def __init__(self, filename):
        self.filename = filename
        self.word_frequency = dict()
        self.word2id = dict()
        self.id2word = dict()
        self.vocab_size = 0
        self.embeddings = None
        for i, line in enumerate(open(self.filename, encoding="utf8")):
            if i == 0:
                line = line.split()
                self.vocab_size = int(line[0])
                self.embedding_size = int(line[1])
                self.embeddings = np.zeros((self.vocab_size, self.embedding_size))
                continue
            line = line.split(' ', 1)
            word = line[0]
            self.word2id[word] = i - 1
            self.id2word[i - 1] = word
            self.embeddings[i - 1, :] = np.fromstring(line[1], dtype=float, sep=' ')
    
    def embedding_for(self, word):
        ind = self.word2id[word]
        return self.embeddings[ind, :]
    

In [31]:
we_penn = WordEmbeddings("./embeddings/out_enwik8_penn_500dim_5wind.vec")
we_word2vec = WordEmbeddings("./embeddings/out_enwik8_w2v.vec")

In [71]:
from sklearn.metrics.pairwise import cosine_similarity
def vectorize_word(we, word):
    vec = None
    if isinstance(word, np.ndarray):
        vec = word
    if isinstance(word, str):
        vec = we.embedding_for(word)
    return vec


def similarity(vec1, vec2):
    vec = vec.reshape((1, -1))
    return cosine_similarity(vec, vec2)
  
    
def nearest_words(we, word, top_n=10):
    vec = vectorize_word(we, word)
    vec = vec.reshape((1, -1))
    cosines = cosine_similarity(vec, we.embeddings)
    top10_ind = np.argsort(cosines)[0][::-1][1:top_n+1]
    neighbors = [(we.id2word[word_ind], cosines[0][word_ind]) for i, word_ind in enumerate(top10_ind)]
    return neighbors


def nearest_words_to_pairs_addition(we, word_a, word_b, top_n=10):
    vec_a = vectorize_word(we, word_a)
    vec_b = vectorize_word(we, word_b)
    vec = vec_a + vec_b
    vec = vec.reshape((1, -1))
    cosines = cosine_similarity(vec, we.embeddings)
    top10_ind = np.argsort(cosines)[0][::-1][1:top_n+1]
    neighbors = [(we.id2word[word_ind], cosines[0][top10_ind[i]]) for i, word_ind in enumerate(top10_ind)]
    return neighbors


def best_cosine(we, ton_n=10):
    best_cosins = np.zeros(we.vocab_size, dtype="float")
    best_cos_pair = np.zeros(we.vocab_size, dtype="int64")
    for ind in we.word2id.values():
        #ind = 0
        vec = we.embeddings[ind, :]
        vec = vec.reshape((1, -1))
        cosines = cosine_similarity(vec, we.embeddings)
        #print(cosines)
        top = np.argsort(cosines)[0][::-1][1:top_n+1] 
        #print(top)
        word_id = None
        #if(top[0] == ind):
            #word_id = top[1]
        #else:
        word_id = top[0]
        #print(word_id)
        #print(we.id2word[word_id])
        #print(cos)
        best_cosins[ind] = cosines[0][word_id]
        best_cos_pair[ind] = word_id
    top_cos_args_id = np.argmax(best_cosins)
    best_pairs = [(we.id2word[ind], best_cos_pair[ind], best_cosins[ind]) 
                  for i, ind in enumerate(top_cos_args_id)]

    top_best_cos_pair = best_cos_pair[top_cos_args_id]
    
    
def compare(we_a, we_b, word_a, word_b):
    print("Words: {} + {}".format(word_a, word_b))
    print("Word2vec addition:\n", nearest_words_to_pairs_addition(we_a, word_b, word_a), "\n")
    print("PENN addition:\n", nearest_words_to_pairs_addition(we_b, word_b, word_a), "\n")

IndentationError: unexpected indent (<ipython-input-71-768981401c8f>, line 58)

In [64]:
best_cosine(we_word2vec)

[[ 1.          0.21555466  0.29981455 ... -0.01095824  0.03517678
   0.01968718]]
[   0  773  343 ... 7283 6648 6101]
773
capitalist
0.9329881297461052


In [67]:
nearest_words(we_word2vec, "anarchism")

[('capitalist', 0.9329881297461052),
 ('capitalism', 0.9026125478341888),
 ('anarchist', 0.8945130504426946),
 ('anarcho', 0.8944931630226096),
 ('libertarian', 0.8798084819004081),
 ('faire', 0.867901212119998),
 ('liberalism', 0.8544044630409507),
 ('laissez', 0.8279008774309387),
 ('rothbard', 0.8255568321545554),
 ('communism', 0.8160838067515472)]

In [68]:
nearest_words(we_penn, "anarchism")

[('individualist', 0.8593052069099795),
 ('anarchist', 0.8471530492275303),
 ('rothbard', 0.8316277232633484),
 ('metaphysical', 0.8207106134074114),
 ('zionism', 0.8126567946987308),
 ('rejection', 0.8126141317278897),
 ('authoritarian', 0.8109840579306388),
 ('contend', 0.8101501365925213),
 ('aclu', 0.8078392413248282),
 ('contradict', 0.8075189621300534)]

In [69]:
nearest_words_to_pairs_addition(we_word2vec, 'soviet', 'union')

[('soviet', 0.8919573507156756),
 ('guerrilla', 0.7668216671653967),
 ('warsaw', 0.7589129466196749),
 ('dissident', 0.7412740590799167),
 ('veteran', 0.7346475360194393),
 ('coup', 0.7303312014641579),
 ('liberate', 0.7273080260705842),
 ('neutrality', 0.7189481155538215),
 ('pact', 0.7172397340185512),
 ('ussr', 0.712835335414549)]

In [70]:
nearest_words_to_pairs_addition(we_penn, 'soviet', 'union')

[('union', 0.8449629208665597),
 ('confederate', 0.6656608921533554),
 ('slave', 0.6369609214171088),
 ('invasion', 0.6344407750103969),
 ('warsaw', 0.6328482311938771),
 ('ally', 0.6253420991806693),
 ('secession', 0.6177441658627295),
 ('occupation', 0.6123611501317596),
 ('seize', 0.598720415174633),
 ('liberty', 0.5952388067064349)]

In [48]:
compare(we_word2vec, we_penn, "boy", "girl")

Words: boy + girl
Word2vec addition:
 [('boy', 0.90739676267495), ('thirteen', 0.729378889200718), ('teenage', 0.7238851521057705), ('beautiful', 0.7080131873160405), ('rap', 0.7046934852354403), ('astro', 0.6931712679654263), ('marple', 0.69063418681593), ('sibling', 0.685468327135792), ('kid', 0.6823671121096673), ('chop', 0.6742604057487516)] 

PENN addition:
 [('girl', 0.8391505005872348), ('aisha', 0.7091762278700058), ('catherine', 0.7079567266385662), ('aphrodite', 0.7056407155880299), ('margaret', 0.6983328860558038), ('wicked', 0.6932060399283371), ('pregnant', 0.6914340390000767), ('gabriel', 0.6898858371339746), ('pretend', 0.6894373559795783), ('pitcher', 0.6885295168140809)] 

