Getting an approximate answer from the Annoy index is significantly faster than querying the gensim index, which provides exact results. You can use an Annoy index for any high-dimensional, continuous, dense vectors that you need to search, such as LSA document-topic vectors or doc2vec document embeddings (vectors).

In [None]:
from nlpia.loaders import get_data

In [None]:
# Load the "word2vec" dataset through nlpia's data loader.
# NOTE(review): presumably this returns a gensim KeyedVectors object (later cells
# use .vectors, .vocab and .index2word on it) — confirm against nlpia.loaders.
wv = get_data("word2vec")

In [None]:
# Display the length of the loaded model and the size of each vector.
# NOTE(review): len(wv) is presumably the number of words in the vocabulary — confirm.
len(wv), wv.vector_size

In [None]:
# Display the shape of the raw embedding matrix; a later cell unpacks it as
# (num_words, num_dimensions).
wv.vectors.shape

In [None]:
# NOTE: the author reports that this cell does not run on Windows but does run on Linux,
# because the "annoy" library is hard to get working on Windows.
from annoy import AnnoyIndex


# The embedding matrix has one row per word and one column per vector dimension.
num_words, num_dimensions = wv.vectors.shape
# Name the metric explicitly: when none is given Annoy falls back to "angular"
# (and emits a FutureWarning), whereas this index is meant to be the Euclidean
# one (it is saved as "Word2vec_euc_index.ann" further down).
index = AnnoyIndex(num_dimensions, metric="euclidean")

In [None]:
from tqdm import tqdm


# Add every word vector to the Annoy index, using the word's position in
# wv.index2word as its item id. tqdm wraps the word list to show a progress bar.
words_with_progress = tqdm(wv.index2word)
for i, word in enumerate(words_with_progress):
    vector = wv[word]
    index.add_item(i, vector)

In [None]:
import numpy as np
# Use the natural log of the vocabulary size, rounded to the nearest integer,
# as the number of trees for the index.
log_vocab_size = np.log(num_words)
num_trees = int(np.round(log_vocab_size, 0))
print(num_trees)

In [None]:
# Build the forest of `num_trees` trees, then persist the finished index to disk.
index.build(num_trees)
index.save("Word2vec_euc_index.ann")
# Map each word to the integer item id it was added to the index under
# (its position in wv.index2word). The previous zip(range(len(wv)), wv) built
# the mapping in the wrong direction and iterated the model object itself.
w2id = {word: i for i, word in enumerate(wv.index2word)}

## Look up a word from the vocab

In [None]:
# Display the vocabulary index of the word. The entry lives in wv.vocab;
# wv['Harry_Potter'] itself is the embedding vector, which has no `.index`.
wv.vocab['Harry_Potter'].index

In [None]:
# Display the `count` recorded for the word in its vocabulary entry. Read it from
# wv.vocab rather than from the embedding vector returned by wv['Harry_Potter'].
wv.vocab['Harry_Potter'].count

In [None]:
# Word -> item-id lookup table. Ids follow the order of wv.index2word, which is
# the order the vectors were added to the Annoy indexes. The previous
# range(wv.vocab) raised TypeError because wv.vocab is a mapping, not an int.
w2id = {word: i for i, word in enumerate(wv.index2word)}

In [None]:
# Display the integer id stored for the word "Harry_Potter" in the w2id table.
w2id["Harry_Potter"]

In [None]:
# Ask the first Annoy index for the 11 nearest items to "Harry_Potter".
query_id = w2id["Harry_Potter"]
ids = index.get_nns_by_item(query_id, 11)
print(ids)

In [None]:
# Display the embedding vector of every neighbour. `ids` holds integer item ids,
# not words, so translate each id back to its word before looking up the vector
# (wv[...] is keyed by word, as in the loop that filled the index).
[wv[wv.index2word[i]] for i in ids]

In [None]:
# Translate the neighbour ids returned by Annoy back into their words.
list(map(wv.index2word.__getitem__, ids))

In [None]:
# For comparison: the 10 most similar words according to gensim itself,
# keeping only the word from each (word, similarity) pair.
gensim_matches = wv.most_similar('Harry_Potter', topn=10)
[word for word, _ in gensim_matches]

## Improve Annoy's accuracy by using the cosine (instead of Euclidean) metric and more trees

In [None]:
# Create a second index that uses the angular (cosine) distance metric.
# Annoy's documented metrics are "angular", "euclidean", "manhattan", "hamming" and "dot".
index_cos = AnnoyIndex(f=num_dimensions, metric="angular")

vocabulary = wv.index2word
for i, word in enumerate(vocabulary):
    # Print a progress line once every 100,000 words (including the very first).
    if i % 100_000 == 0:
        print(f"{i}: {word}")
    index_cos.add_item(i, wv[word])

In [None]:
# Build the cosine index with 30 trees, then write it to disk.
num_trees_cos = 30
index_cos.build(num_trees_cos)
index_cos.save("word2vec_cos_index.ann")

In [None]:
# Ask the cosine index for the 10 nearest items to "Harry_Potter".
harry_potter_id = w2id['Harry_Potter']
ids_cos = index_cos.get_nns_by_item(harry_potter_id, 10)
print(ids_cos)

In [None]:
# The results will differ slightly from those printed in the book;
# for repeatable results, use AnnoyIndex.set_seed().

[wv.index2word[neighbor_id] for neighbor_id in ids_cos]

In [None]:
import pandas as pd
# `annoy_top10` was never defined, so build it here: pair the words found by the
# first index (`ids`) with the words found by the 30-tree cosine index (`ids_cos`),
# row by row. zip() stops at the shorter list, giving 10 rows.
# NOTE(review): the 'annoy_15trees' label assumes num_trees evaluated to 15 — confirm.
annoy_top10 = list(zip(
    (wv.index2word[i] for i in ids),
    (wv.index2word[i] for i in ids_cos),
))
pd.DataFrame(annoy_top10, columns=['annoy_15trees', 'annoy_30trees'])