In [1]:
import numpy as np
import linecache
from pathlib import Path

In [2]:
import sys

p = Path('.').resolve()
sys.path.append(str(p.parent))
sys.path

['/home/user/miniconda/envs/py36/lib/python36.zip',
 '/home/user/miniconda/envs/py36/lib/python3.6',
 '/home/user/miniconda/envs/py36/lib/python3.6/lib-dynload',
 '',
 '/home/user/miniconda/envs/py36/lib/python3.6/site-packages',
 '/home/user/miniconda/envs/py36/lib/python3.6/site-packages/IPython/extensions',
 '/home/user/.ipython',
 '/app/biggraph']

In [3]:
from utils.faiss_utils import train_search
from utils.data_utils import *

Loading faiss with AVX2 support.


In [4]:
def load_XY(basename):
    """
    Load embeddings (X) and possibly the
    labels (Y) of the graph {basename}.
    """
    model_path = Path("/data/models") / basename
    print("Loading data..")
    X, Y = load_data(model_path)
    classes = len(np.unique(Y))
    print("X shape: {}".format(X.shape))
    return X, Y


def precision(Na, Ra_bi, bi):
    """
    Assuming Na and Ra_bi are 2
    arrays of indices.
    """
    set1 = set(Na)
    try:
        index = np.where(Ra_bi == bi)[0][0]
        set2 = set(Ra_bi[:index + 1])
        return len(set1.intersection(set2)) / len(set2)
    except IndexError:
        return 0


def map_score(graph):

    out_nodes, in_nodes, out_degree, in_degree = read_ascii_graph(graph)
    print("Computing map score")
    V = len(out_nodes)
    score = 0
    for node in graph:
        node_score = 0
        Na = len(out_nodes)
        for neighbor in node.neighbours:
            node_score += precision(node, neighbor)
        score += node_score / Na
    return score / V


def out_map_score(basename, ind):
    
    out_nodes = read_ascii(basename)
    print("Computing out nodes map score")
    score = 0
    for node, neighs in enumerate(out_nodes):
        node_score = 0
        Na = len(neighs)
        for neighbor in neighs:
            node_score += precision(node, neighbor)
        score += node_score / Na
    return score / V


def read_ascii(basename, rm_singleton=False):
    """

    """
    ascii_path = Path("/data/graphs") / basename / ("ascii.graph-txt")
    assert ascii_path.exists(), "Graph not found!"
    with ascii_path.open() as f:
        line = f.readline()
        V = int(line.split()[0])
        print("{} vertices".format(V))
        print("reading..")
        out_nodes = [0] * V
        for i in trange(V):
            line = f.readline()
            if line[0] == "\n" and not rm_singleton:
                # don't remove singleton
                # assume node is linked to itself
                out_ = np.array([i])
            else:
                out_ = np.fromstring(line, dtype=np.int32, sep=' ')
            out_nodes[i] = out_
    return out_nodes


def check(nodes, k, emb, ind, f, ent_list):
    """
    nodes    - 2d array of nodes we want to check
    k        - nearest neighbours
    emb      - a 2-d numpy array of embeddings
    ind      - index built with faiss
    f        - file containing urls
    ent_list - list of entities id
    """
    if len(nodes) == 1:
        dist, ind = ind.search(nodes.reshape(1, -1), k)
    else:
        dist, ind = ind.search(nodes, k)
    for row in ind:
        source = int(ent_list[row[0]])
        print('\x1b[0;35;43m' + '{} nearest neighbours of node {}'.format(
            k - 1, source) + '\x1b[0m')
        print('\x1b[0;35;43m' + linecache.getline(f, source + 1) + '\x1b[0m')
        for node in row[1:]:
            neighbor = int(ent_list[node])
            print("  node {}, {}".format(
                node, linecache.getline(f, neighbor + 1)))

In [5]:
basename = "cnr-2000"
embeddings = load_XY(basename)
X = embeddings[0]
X[0]

Loading data..
Labels not defined
X shape: (325557, 64)


array([-0.0001718 ,  0.0087695 , -0.02964852, -0.01956584, -0.01648358,
       -0.00888848,  0.01993223, -0.01380344, -0.01544364, -0.00261269,
       -0.02519534,  0.004574  , -0.01579152,  0.00774693,  0.00024875,
       -0.0116387 ,  0.02761276,  0.00306257, -0.00216838,  0.01078875,
       -0.0264635 , -0.00381334,  0.01533207, -0.01215552, -0.01134059,
        0.01552172, -0.01898626, -0.00296632, -0.01627149,  0.01921202,
       -0.03555609,  0.00344647,  0.01180901,  0.00941313,  0.02471872,
       -0.04890871, -0.02605497,  0.01141761, -0.00338765,  0.0027696 ,
       -0.01621617,  0.03093353,  0.00815312, -0.00166487,  0.00991619,
       -0.02566224,  0.00927695,  0.00837251,  0.01326454,  0.0063187 ,
        0.02874073, -0.01070895, -0.00283655, -0.01973416,  0.00305778,
        0.01326166,  0.04523369,  0.01505135,  0.01444066,  0.01585885,
        0.01686572,  0.0199776 ,  0.00878629,  0.00935664], dtype=float32)

In [6]:
ent_list = get_entities_list(basename)
ent_list[:10]

[179413, 4123, 276766, 203406, 199888, 305090, 301572, 304259, 176797, 76956]

In [7]:
ind = train_search(X)

Index trained: True
Index total: 325557


In [8]:
urls_f = '/data/graphs/cnr-2000/cnr-2000.urls'
check(X[:2], 10, X, ind, urls_f, ent_list)

[0;35;43m9 nearest neighbours of node 179413[0m
[0;35;43mhttp://www.igbe.pv.cnr.it/posizioni/bandi2000/schema3.html
[0m
  node 159937, http://www.igbe.pv.cnr.it/posizioni/bandi2000/schema6.html

  node 289531, http://www.igbe.pv.cnr.it/posizioni/bandi2000/schema1.html

  node 263443, http://www.igbe.pv.cnr.it/posizioni/2001/schema05.html

  node 13645, http://www.igbe.pv.cnr.it/posizioni/2001/schema04.html

  node 306396, http://www.igbe.pv.cnr.it/posizioni/bandi2000/schema2.html

  node 101636, http://www.igbe.pv.cnr.it/posizioni/bandi2000/schema9.html

  node 8968, http://www.igbe.pv.cnr.it/posizioni/2003/bando_01_03.html

  node 305176, http://www.igbe.pv.cnr.it/posizioni/2001/schema02.html

  node 249529, http://www.igbe.pv.cnr.it/posizioni/bandi2000/assegno4.html

[0;35;43m9 nearest neighbours of node 4123[0m
[0;35;43mhttp://www.isti.cnr.it/Intranet/RISeT/
[0m
  node 314250, http://www.isti.cnr.it/Miscellanea/Other/

  node 36761, http://www.isti.cnr.it/Events/Courses/

  

In [9]:
perm = np.argsort(ent_list)
perm[:10]

array([ 41419,  89814, 151940,  77547, 263177, 277660, 308368,  78840,
       155331, 227672])

In [10]:
X = X[perm]
ind = train_search(X)
nodes = [179413, 4123]
check(X[nodes], 10, X, ind, urls_f, [i for i in range(len(X))])

Index trained: True
Index total: 325557
[0;35;43m9 nearest neighbours of node 179413[0m
[0;35;43mhttp://www.igbe.pv.cnr.it/posizioni/bandi2000/schema3.html
[0m
  node 179415, http://www.igbe.pv.cnr.it/posizioni/bandi2000/schema6.html

  node 179361, http://www.igbe.pv.cnr.it/posizioni/bandi2000/schema1.html

  node 179368, http://www.igbe.pv.cnr.it/posizioni/2001/schema05.html

  node 179401, http://www.igbe.pv.cnr.it/posizioni/2001/schema04.html

  node 179411, http://www.igbe.pv.cnr.it/posizioni/bandi2000/schema2.html

  node 179417, http://www.igbe.pv.cnr.it/posizioni/bandi2000/schema9.html

  node 179396, http://www.igbe.pv.cnr.it/posizioni/2003/bando_01_03.html

  node 179343, http://www.igbe.pv.cnr.it/posizioni/2001/schema02.html

  node 179409, http://www.igbe.pv.cnr.it/posizioni/bandi2000/assegno4.html

[0;35;43m9 nearest neighbours of node 4123[0m
[0;35;43mhttp://www.isti.cnr.it/Intranet/RISeT/
[0m
  node 3740, http://www.isti.cnr.it/Miscellanea/Other/

  node 3861, ht

## Mean Average Precision

In [11]:
out_nodes = read_ascii(basename)

  0%|          | 0/325557 [00:00<?, ?it/s]

325557 vertices
reading..


100%|██████████| 325557/325557 [00:01<00:00, 225482.91it/s]


In [12]:
k = 500
xq = X[0].reshape(1, -1)
D, I = ind.search(xq, k)
I

array([[     0,    281,    187,    166,      5,    310,      4,    265,
           124,    207,      6,     87,    184,    142,    189,     20,
           147,    200,    185,    294,    186,     85,    122,     68,
             1,    303,     84,    286,     93,    302,    204,    194,
           243,     19,    298,    253,     90,    134,     14,    191,
           144,    216,     82,    212,    250,    205,     77,     78,
           152,    299,    255,     16,    127,    273,     45,     75,
            86,    129,     33,     21,    287,    238,     24,    300,
           153,    209,    115,     51,     60,    288,     66,     55,
           203,    197,    263,     74,    264,     73,    104,    225,
            17,    161,    267,    295,    163,    100,     48,    116,
           215,    248,    102,     57,     91,     83,     71,    280,
           103,    208,    169,      3,     38,    274,     12,     70,
            96,    236,    128,    290,    118,    188,     22, 

In [13]:
out_nodes[0]

array([  1,   4,   8, 219, 220], dtype=int32)

In [14]:
sum(np.in1d(I, out_nodes[0]))

5

In [15]:
for node in out_nodes[0]:
    print(precision(out_nodes[0], I.flatten(), node))

0.08
0.14285714285714285
0.014563106796116505
0.01818181818181818
0.022026431718061675


In [16]:
from tqdm import tqdm


def out_map_score(basename, k=500):
    
    embeddings = load_XY(basename)
    X = embeddings[0]
    out_nodes = read_ascii(basename)
    assert len(X) == len(out_nodes)
    ent_list = get_entities_list(basename)
    perm = np.argsort(ent_list)
    X = X[perm]
    ind = train_search(X)
    print("Computing out nodes map score")
    score = 0
    for node, neighs in enumerate(tqdm(out_nodes)):
        node_score = 0
        Na = len(neighs)
        xq = X[node].reshape(1, -1)
        D, I = ind.search(xq, k)
        Ra = I.flatten()[1:]
        for neighbor in neighs:
            node_score += precision(neighs, Ra, neighbor)
        score += node_score / Na
    return score / len(X)

In [17]:
map_score = out_map_score(basename, X)
print(map_score)

  5%|▍         | 16187/325557 [00:00<00:01, 161862.82it/s]

Loading data..
Labels not defined
X shape: (325557, 64)
325557 vertices
reading..


100%|██████████| 325557/325557 [00:01<00:00, 224691.31it/s]
  0%|          | 0/325557 [00:00<?, ?it/s]

Index trained: True
Index total: 325557
Computing out nodes map score





TypeError: only integer scalar arrays can be converted to a scalar index

In [18]:
embeddings = load_XY(basename)
X = embeddings[0]
out_nodes = read_ascii(basename)
assert len(X) == len(out_nodes)
ent_list = get_entities_list(basename)
perm = np.argsort(ent_list)
X = X[perm]
ind = train_search(X)

  4%|▍         | 13754/325557 [00:00<00:02, 137535.54it/s]

Loading data..
Labels not defined
X shape: (325557, 64)
325557 vertices
reading..


100%|██████████| 325557/325557 [00:01<00:00, 222220.04it/s]


Index trained: True
Index total: 325557


In [19]:
score = 0
for node, neighs in enumerate(tqdm(out_nodes)):
    node_score = 0
    Na = len(neighs)
    xq = X[node].reshape(1, -1)
    D, I = ind.search(xq, k)
    Ra = I.flatten()[1:]
    for neighbor in neighs:
        node_score += precision(neighs, Ra, neighbor)
    score += node_score / Na

  3%|▎         | 8317/325557 [01:43<1:01:33, 85.88it/s]

KeyboardInterrupt: 

In [20]:
%load_ext line_profiler


ModuleNotFoundError: No module named 'line_profiler'

  3%|▎         | 8317/325557 [02:00<1:01:33, 85.88it/s]