In [1]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

In [4]:
import numpy as np
import torch
import sys

sys.path.append("../")

## Loading Model and Vocabulary

In [117]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Directory holding the trained artefacts; it is resolved against the
# parent directory ("../") when the files are opened below.
folder = "weights/cbow_WikiText2_minfreq_20"

# NOTE(review): torch.load unpickles whole Python objects, so only point
# this at checkpoint files you trust.
model = torch.load(f"../{folder}/model.pt", map_location=device)
vocab = torch.load(f"../{folder}/vocab.pt")

## Getting Embeddings

In [118]:
# The first parameter tensor of the model is used as the embedding matrix;
# detach it from autograd and bring it to host memory as a NumPy array.
first_param = list(model.parameters())[0]
embeddings = first_param.detach().cpu().numpy()

# Per-row L2 norms, kept as a column vector so they broadcast across each row.
embed_norms = np.square(embeddings).sum(axis=1, keepdims=True) ** 0.5

# Out-of-place division on purpose: the NumPy array may share memory with
# the model parameter, and an in-place update would rewrite the weights.
embeddings = embeddings / embed_norms
embeddings.shape

(8130, 300)

In [119]:
# tokens from vocabulary
# NOTE(review): `get_itos()` is presumably the index-to-token list, so
# position i would correspond to row i of `embeddings` — confirm against
# the vocab class in use.
tokens = vocab.get_itos()
# Displayed as the cell output: number of tokens in the vocabulary.
len(tokens)

8130

## Find Similar Words

In [120]:
def get_top_similar(word: str, topN: int = 10):
    """Return the `topN` vocabulary words most similar to `word`.

    Similarity is the dot product between rows of the notebook-level
    `embeddings` matrix. It equals cosine similarity provided those rows
    have been scaled to unit length beforehand.

    Args:
        word: Query token, looked up in the notebook-level `vocab`.
        topN: Maximum number of neighbours to return. Values <= 0 yield an
            empty dict.

    Returns:
        A dict mapping each neighbour token to its similarity rounded to
        three decimals, ordered from most to least similar. Returns None
        (after printing a message) when `word` resolves to id 0, which is
        treated as the out-of-vocabulary id.
    """
    word_id = vocab[word]
    if word_id == 0:
        print("Out of vocabulary word")
        return

    # A 1-D right operand makes matmul return a flat vector of similarities.
    dists = np.matmul(embeddings, embeddings[word_id])

    # Remove the query word by id rather than assuming it is ranked first:
    # ties, duplicate vectors or rounding can place another row ahead of it,
    # in which case slicing off position 0 would drop a real neighbour and
    # keep the query word in the result.
    ranked_ids = np.argsort(-dists)
    ranked_ids = ranked_ids[ranked_ids != word_id][: max(topN, 0)]

    # int(...) hands the vocab a plain Python int instead of a NumPy scalar.
    return {
        vocab.lookup_token(int(sim_word_id)): np.round(dists[sim_word_id], 3)
        for sim_word_id in ranked_ids
    }

In [142]:
# Example query using the default topN of 10.
get_top_similar("angle")

{'elevation': 0.561,
 'base': 0.534,
 'load': 0.53,
 'intersection': 0.515,
 'height': 0.513,
 'mouth': 0.509,
 'speed': 0.498,
 'teeth': 0.496,
 'start': 0.491,
 'clearing': 0.482}

## Vector Equations

In [123]:
# Analogy query: which vocabulary vectors lie closest to king - man + woman?
emb1, emb2, emb3 = (embeddings[vocab[w]] for w in ("king", "man", "woman"))

emb4 = emb1 - emb2 + emb3
emb4_norm = np.square(emb4).sum() ** 0.5

# Scale the query to unit length and shape it as a column vector so the
# product below yields one similarity score per vocabulary row.
emb4 = (emb4 / emb4_norm).reshape(-1, 1)
dists = (embeddings @ emb4).flatten()

# The input words are not excluded from the ranking, so one of them may
# appear among the top results.
top5 = np.argsort(-dists)[:5]

for word_id in top5:
    print("{}: {:.3f}".format(vocab.lookup_token(word_id), dists[word_id]))

king: 0.707
leo: 0.552
philip: 0.524
hairan: 0.522
inherited: 0.514
