In [1]:
import numpy as np
from tqdm import tqdm_notebook as tqdm
class WordEmbeddings():
    def __init__(self, filename):
        self.filename = filename
        self.word_frequency = dict()
        self.word2id = dict()
        self.id2word = dict()
        self.vocab_size = 0
        self.embeddings = None
        for i, line in enumerate(open(self.filename, encoding="utf8")):
            if i == 0:
                line = line.split()
                self.vocab_size = int(line[0])
                self.embedding_size = int(line[1])
                self.embeddings = np.zeros((self.vocab_size, self.embedding_size))
                continue
            line = line.split(' ', 1)
            word = line[0]
            self.word2id[word] = i - 1
            self.id2word[i - 1] = word
            self.embeddings[i - 1, :] = np.fromstring(line[1], dtype=float, sep=' ')
    
    def embedding_for(self, word):
        ind = self.word2id[word]
        return self.embeddings[ind, :]
    

In [12]:
we_penn = WordEmbeddings("./embeddings/out_enwik8_penn_500dim_5wind.vec")
we_word2vec = WordEmbeddings("./embeddings/out_enwik8_w2v.vec")

In [2]:
from sklearn.metrics.pairwise import cosine_similarity
def vectorize_word(we, word):
    vec = None
    if isinstance(word, np.ndarray):
        vec = word
    if isinstance(word, str):
        vec = we.embedding_for(word)
    return vec


def similarity(vec1, vec2):
    vec = vec.reshape((1, -1))
    return cosine_similarity(vec, vec2)
  
    
def nearest_words(we, word, top_n=10):
    vec = vectorize_word(we, word)
    vec = vec.reshape((1, -1))
    cosines = cosine_similarity(vec, we.embeddings)
    top10_ind = np.argsort(cosines)[0][::-1][1:top_n+1]
    neighbors = [(we.id2word[word_ind], cosines[0][word_ind]) for i, word_ind in enumerate(top10_ind)]
    return neighbors


def nearest_words_to_pairs_addition(we, word_a, word_b, top_n=10):
    vec_a = vectorize_word(we, word_a)
    vec_b = vectorize_word(we, word_b)
    vec = vec_a + vec_b
    vec = vec.reshape((1, -1))
    cosines = cosine_similarity(vec, we.embeddings)
    top10_ind = np.argsort(cosines)[0][::-1][1:top_n+1]
    neighbors = [(we.id2word[word_ind], cosines[0][word_ind]) for i, word_ind in enumerate(top10_ind)]
    return neighbors


def best_cosine(we, top_n=10):
    best_cosins = np.zeros(we.vocab_size, dtype="float")
    best_cos_pair = np.zeros(we.vocab_size, dtype="int64")
    for ind in tqdm(we.word2id.values()):
        #ind = 0
        #if ind > 100:
            #break
        vec = we.embeddings[ind, :]
        vec = vec.reshape((1, -1))
        cosines = cosine_similarity(vec, we.embeddings)
        #word_id = np.argsort(cosines[0])[::-1][1] 
        word_id = np.argpartition(cosines[0],-2)[-2:][0]
        #print(np.argpartition(cosines[0],-2)[-2:])
        best_cosins[ind] = cosines[0][word_id]
        best_cos_pair[ind] = word_id
    #print(best_cosins[:110])
    #print(best_cos_pair[:110])
    top_cos_args_id = np.argsort(best_cosins)[::-1][0:top_n]
    best_pairs = [(we.id2word[ind], we.id2word[best_cos_pair[ind]], best_cosins[ind]) 
                  for i, ind in enumerate(top_cos_args_id)]
    return best_pairs
    
    
def compare(we_a, we_b, word_a, word_b):
    print("Words: {} + {}".format(word_a, word_b))
    print("Word2vec addition:\n", nearest_words_to_pairs_addition(we_a, word_b, word_a), "\n")
    print("PENN addition:\n", nearest_words_to_pairs_addition(we_b, word_b, word_a), "\n")

In [133]:
best_cosine(we_word2vec)

[('glycolysis', 'spotter', 0.9876011089363196),
 ('spotter', 'glycolysis', 0.9876011089363196),
 ('archaeoastronomy', 'spotter', 0.9874648536048432),
 ('kapoor', 'archaeoastronomy', 0.987102067265258),
 ('breakdanc', 'spotter', 0.9869388159123954),
 ('heckel', 'spotter', 0.9869177410172493),
 ('anoa', 'glycolysis', 0.9865203028149112),
 ('breakdance', 'spotter', 0.9863924243607858),
 ('abhidharma', 'glycolysis', 0.9862122248412486),
 ('chime', 'spotter', 0.9861884005240806)]

In [134]:
best_cosine(we_penn)

[('shine', 'boas', 0.9972787421258333),
 ('boas', 'shine', 0.9972787421258333),
 ('uke', 'shine', 0.9972310205652469),
 ('statistician', 'shine', 0.9971986678012923),
 ('cin', 'narnia', 0.9971982634743759),
 ('narnia', 'cin', 0.9971982634743759),
 ('belisarius', 'akkad', 0.9971779044314791),
 ('akkad', 'belisarius', 0.9971779044314791),
 ('abram', 'alexandrine', 0.9971627087764258),
 ('alexandrine', 'abram', 0.9971627087764258)]

In [249]:
nearest_words(we_word2vec, "fourth")

[('sixth', 0.6719917776923781),
 ('fifth', 0.6649193634769768),
 ('ninth', 0.6450301806544565),
 ('bah', 0.6432698042673961),
 ('coronation', 0.6260082211067375),
 ('ceremony', 0.6247439766376065),
 ('tenth', 0.600044981236633),
 ('commemorate', 0.5960178614840119),
 ('eighth', 0.5956747627772887),
 ('wait', 0.5913010259671003)]

In [250]:
nearest_words(we_penn, "fourth")

[('scot', 0.628178408225462),
 ('poland', 0.6197486665356933),
 ('milan', 0.6193810941023434),
 ('ratification', 0.6181803721903307),
 ('revolt', 0.6169001725420755),
 ('legion', 0.6150062996986057),
 ('abdicate', 0.6145488671710846),
 ('crusade', 0.6124782108609884),
 ('consul', 0.6094307030571164),
 ('quebec', 0.6082644753267177)]

In [15]:
nearest_words_to_pairs_addition(we_word2vec, 'boy', 'girl')

[('boy', 0.90739676267495),
 ('thirteen', 0.729378889200718),
 ('teenage', 0.7238851521057705),
 ('beautiful', 0.7080131873160405),
 ('rap', 0.7046934852354403),
 ('astro', 0.6931712679654263),
 ('marple', 0.69063418681593),
 ('sibling', 0.685468327135792),
 ('kid', 0.6823671121096673),
 ('chop', 0.6742604057487516)]

In [16]:
nearest_words_to_pairs_addition(we_penn, 'boy', 'girl')

[('girl', 0.8391505005872348),
 ('aisha', 0.7091762278700058),
 ('catherine', 0.7079567266385662),
 ('aphrodite', 0.7056407155880299),
 ('margaret', 0.6983328860558038),
 ('wicked', 0.6932060399283371),
 ('pregnant', 0.6914340390000767),
 ('gabriel', 0.6898858371339746),
 ('pretend', 0.6894373559795783),
 ('pitcher', 0.6885295168140809)]

In [14]:
compare(we_word2vec, we_penn, "boy", "girl")

Words: boy + girl
Word2vec addition:
 [('boy', 0.90739676267495), ('thirteen', 0.729378889200718), ('teenage', 0.7238851521057705), ('beautiful', 0.7080131873160405), ('rap', 0.7046934852354403), ('astro', 0.6931712679654263), ('marple', 0.69063418681593), ('sibling', 0.685468327135792), ('kid', 0.6823671121096673), ('chop', 0.6742604057487516)] 

PENN addition:
 [('girl', 0.8391505005872348), ('aisha', 0.7091762278700058), ('catherine', 0.7079567266385662), ('aphrodite', 0.7056407155880299), ('margaret', 0.6983328860558038), ('wicked', 0.6932060399283371), ('pregnant', 0.6914340390000767), ('gabriel', 0.6898858371339746), ('pretend', 0.6894373559795783), ('pitcher', 0.6885295168140809)] 



In [183]:
we_penn_ga = WordEmbeddings("./embeddings/out_penn_google_analogy_500dim_5w.vec")

In [184]:
nearest_words(we_penn_ga, "woman")

[('Damascus', 0.14716633719577282),
 ('Nicaragua', 0.13604063381083042),
 ('Malawi', 0.12357785844605604),
 ('Rome', 0.12042452563616046),
 ('Tirana', 0.10601653362005484),
 ('Denmark', 0.0984432185515497),
 ('girl', 0.09779731242189263),
 ('Colorado', 0.09714938057236729),
 ('Sweden', 0.09171789368034912),
 ('Alaska', 0.08876820343191807)]

In [189]:
def fff(we):
    vec_a = vectorize_word(we, "king")
    vec_b = vectorize_word(we, "queen")
    vec_c = vectorize_word(we, "woman")
    vec = vec_a - vec_b + vec_c
    vec = vec.reshape((1, -1))
    cosines = cosine_similarity(vec, we.embeddings)
    top10_ind = np.argsort(cosines)[0][::-1][1:10+1]
    neighbors = [(we.id2word[word_ind], cosines[0][word_ind]) for i, word_ind in enumerate(top10_ind)]
    print(neighbors)
fff(we_penn_ga)

[('woman', 0.5801384539889545), ('Zagreb', 0.14332272094655746), ('Malawi', 0.12356903196667185), ('Damascus', 0.10882978565101585), ('Sweden', 0.10253285283701401), ('Paramaribo', 0.10219196281670165), ('Egypt', 0.09706241482279429), ('Rome', 0.09327041103238491), ('hryvnia', 0.0922410525632594), ('Astana', 0.0889859372757664)]


In [186]:
we_w2v_ga = WordEmbeddings("./embeddings/out_w2v_google_analogy_500dim_5w.vec")

In [3]:
from diem.diem import Diem
diem = Diem("./embeddings/out_diem.vec")
diem.learn_from_text("./data/enwik8_shorter_cleaned.txt")

In [10]:
nearest_words(diem, "general")

[('mineral', 0.999719220718427),
 ('cerebral', 0.9995886006419438),
 ('generous', 0.9995176969374526),
 ('liberal', 0.9994882096291209),
 ('renewal', 0.9994664576392052),
 ('generic', 0.9994605577604865),
 ('deliver', 0.9994532622650988),
 ('revival', 0.9994483966530653),
 ('beverly', 0.9994041409082012),
 ('rearden', 0.9993923162858849)]

In [11]:
nearest_words_to_pairs_addition(diem, 'boy', 'girl')

[('foul', 0.9987633315785385),
 ('pour', 0.9985929568619805),
 ('paul', 0.9985834929228972),
 ('gil', 0.9985640825607383),
 ('kirk', 0.9985392757949823),
 ('lyon', 0.9985130765569822),
 ('baby', 0.9984835679902364),
 ('dual', 0.9984803096283539),
 ('loud', 0.9984690391061233),
 ('milk', 0.9984578617722678)]