In [7]:
from operator import itemgetter

import numpy as np


In [4]:
def LoadEmbedding(fname):
    """Read a text file and return its lines with surrounding whitespace removed.

    Args:
        fname: Path of the file to open in text mode.

    Returns:
        list[str]: One entry per line of the file, in file order, each
        passed through ``str.strip``.
    """
    with open(fname, 'r') as handle:
        # Iterating the handle yields the same lines readlines() would.
        return [raw.strip() for raw in handle]


def LoadEdges(fname):
    """Read an edge file and return its lines stripped of surrounding whitespace.

    Args:
        fname: Path of the file to open in text mode.

    Returns:
        list[str]: The file's lines in order, each with leading and
        trailing whitespace removed. No further parsing is done here.
    """
    with open(fname, 'r') as handle:
        stripped = list(map(str.strip, handle))
    return stripped


In [5]:
# Hyperbolic distance between two points

def dist(u, v):
    """Hyperbolic (Poincare-ball) distance between two points.

    Evaluates ``arccosh(1 + 2*||u - v||^2 / ((1 - ||u||^2) * (1 - ||v||^2)))``.

    Args:
        u: NumPy array holding the first point.
        v: NumPy array holding the second point, same shape as ``u``.

    Returns:
        numpy.float64: ``0.0`` when the two points coincide, ``inf`` when
        they differ and at least one of them has norm exactly 1, and the
        value of the formula above otherwise.
    """
    nu = np.linalg.norm(u)
    nv = np.linalg.norm(v)
    z = 2 * np.linalg.norm(u - v) ** 2

    # Coincident points are at distance zero, even on the unit sphere where
    # the formula would otherwise be 0/0.
    if z == 0:
        return np.float64(0.0)

    # A point of norm 1 makes the denominator vanish. The distance to any
    # other point is unbounded there, so report inf rather than 0 (a zero
    # would make boundary points look like nearest neighbours of everything).
    if nu == 1 or nv == 1:
        return np.float64(np.inf)

    return np.arccosh(1. + z / ((1 - nu**2) * (1 - nv**2)))


In [6]:
def ProcessEmbeddings(emb, dim):
    """Parse comma-separated embedding rows into an index -> vector mapping.

    The first element of ``emb`` is discarded. Every remaining row is split
    on ``','``: the first field is read as the integer index, the last field
    as tau, and the fields in between as the coordinates of the vector.

    Args:
        emb: Sequence of text rows; element 0 is skipped.
        dim: Number of coordinates each row must carry (checked by assert).

    Returns:
        tuple: ``(embDict, tau)`` where ``embDict`` maps ``numpy.int64``
        indices to ``float64`` arrays, and ``tau`` is the value from the
        last row parsed (``0.0`` if there were no rows after the first).
    """
    vectors = {}
    tau = 0.0
    for row in emb[1:]:
        fields = row.split(',')
        # Every row carries its own tau; the last one parsed is returned.
        tau = np.float64(fields[-1])
        node = np.int64(fields[0])
        coords = np.asarray(fields[1:-1], dtype=np.float64)
        vectors[node] = coords
        assert coords.shape[0] == dim
    return vectors, tau


def BuildWMatrix(embDict, dim):
    """Pack an index -> vector mapping into a dense matrix.

    Args:
        embDict: Mapping whose keys are used as row indices and whose
            values are the vectors written into those rows.
        dim: Number of columns of the matrix.

    Returns:
        tuple: ``(W, vocabSize)`` where ``W`` is a ``float64`` array of
        shape ``(len(embDict), dim)`` and ``vocabSize`` is ``len(embDict)``.
    """
    n_rows = len(embDict)
    matrix = np.zeros((n_rows, dim), dtype=np.float64)
    for row, vector in embDict.items():
        matrix[row] = vector
    return matrix, n_rows


In [19]:
def build(femb, dim, key):
    """Print and return the five entries closest to ``key`` in an embedding file.

    Loads ``femb`` with LoadEmbedding, parses it with ProcessEmbeddings,
    packs the vectors with BuildWMatrix, then ranks every row whose vector
    differs from the vector of ``key`` by ``dist(...) / tau``.

    Args:
        femb: Path of the embedding file.
        dim: Number of coordinates per embedding vector.
        key: Index of the query entry; must be present in the parsed file.

    Returns:
        list: Row indices of up to five closest entries, nearest first.
    """
    embDict, tau = ProcessEmbeddings(LoadEmbedding(femb), dim)
    print(f"tau={tau}")
    W, _ = BuildWMatrix(embDict, dim)

    word = key
    wvec = embDict[word]
    # Rows whose vector equals the query vector are left out of the ranking.
    d = [
        (idx, dist(wvec, W[idx, :]) / tau)
        for idx in range(W.shape[0])
        if np.any(W[idx, :] != wvec)
    ]
    print()
    print()
    print(f"Relations for {word} in embedding using distance metric")
    ranked = sorted(d, key=itemgetter(1))[:5]
    for k, v in ranked:
        print(f"{k} {v}")
    print()
    print()
    return [k for k, _ in ranked]

In [21]:
# Sanity check on 'data/emb/1000r2.emb' (2 coordinates per vector): for every
# key from 2 to 999, the closest entry reported by build() must be index 1.
# NOTE(review): build() re-reads and re-parses the file on every iteration,
# and prints its ranking each time, so this cell is slow and very verbose.
for j in range(2, 1000):
    bm = build('data/emb/1000r2.emb', 2, j)
    assert(bm[0] == 1)


tau=17.328679513998633


Relations for 2 in embedding using distance metric
1 0.9999999862428434
3 1.3349722032203768
0 1.3349722169562068
4 1.414971584988843
999 1.4149716324963437


tau=17.328679513998633


Relations for 3 in embedding using distance metric
1 0.9999999862641693
4 1.3349721557128762
2 1.3349722032203768
5 1.4149716326337285
0 1.4149716462748296


tau=17.328679513998633


Relations for 4 in embedding using distance metric
1 0.9999999387353377
3 1.3349721557128762
5 1.3349721558076049
6 1.4149709432704947
2 1.414971584988843


tau=17.328679513998633


Relations for 5 in embedding using distance metric
1 0.999999986358898
6 1.3349715615967575
4 1.3349721558076049
7 1.4149715855692004
3 1.4149716326337285


tau=17.328679513998633


Relations for 6 in embedding using distance metric
1 0.999999344524508
7 1.3349715144375007
5 1.3349715615967575
8 1.4149704211045433
4 1.4149709432704947


tau=17.328679513998633


Relations for 7 in embedding using distance metric
1 0.9999999

In [22]:
# Same sanity check on 'data/emb/10000r2.emb' (2 coordinates per vector): for
# every key from 2 to 9999, the closest entry reported by build() must be
# index 1.
# NOTE(review): build() re-reads and re-parses the file on every iteration,
# and prints its ranking each time, so this cell is slow and very verbose.
for j in range(2, 10000):
    bm = build('data/emb/10000r2.emb', 2, j)
    assert(bm[0] == 1)
    

tau=17.328679513998633


Relations for 2 in embedding using distance metric
1 1.0000000002386356
0 1.0691142415094506
3 1.0691142417035457
4 1.149114231125324
9999 1.1491142352725907


tau=17.328679513998633


Relations for 3 in embedding using distance metric
1 1.0000000001940954
4 1.0691142375562792
2 1.0691142417035457
5 1.1491142313004303
0 1.1491142349894152


tau=17.328679513998633


Relations for 4 in embedding using distance metric
1 0.9999999960913688
5 1.0691142336731991
3 1.0691142375562792
2 1.149114231125324
6 1.1491142329411144


tau=17.328679513998633


Relations for 5 in embedding using distance metric
1 0.9999999963110154
4 1.0691142336731991
6 1.0691142396362563
7 1.1491142241664514
3 1.1491142313004303


tau=17.328679513998633


Relations for 6 in embedding using distance metric
1 1.000000002054426
7 1.0691142363853572
5 1.0691142396362563
8 1.1491142279076445
4 1.1491142329411144


tau=17.328679513998633


Relations for 7 in embedding using distance metric
1 0.99999