In [1]:
%load_ext line_profiler

import numpy as np
import linecache
from pathlib import Path
from tqdm import tqdm

In [2]:
import sys

p = Path('.').resolve()
sys.path.append(str(p.parent))
sys.path

['/home/user/miniconda/envs/py36/lib/python36.zip',
 '/home/user/miniconda/envs/py36/lib/python3.6',
 '/home/user/miniconda/envs/py36/lib/python3.6/lib-dynload',
 '',
 '/home/user/miniconda/envs/py36/lib/python3.6/site-packages',
 '/home/user/miniconda/envs/py36/lib/python3.6/site-packages/IPython/extensions',
 '/home/user/.ipython',
 '/app/biggraph']

In [3]:
from utils.faiss_utils import train_search
from utils.data_utils import *

Loading faiss with AVX2 support.


In [4]:
def load_XY(basename):
    """
    Load embeddings (X) and possibly the
    labels (Y) of the graph {basename}.
    """
    model_path = Path("/data/models") / basename
    print("Loading data..")
    X, Y = load_data(model_path)
    classes = len(np.unique(Y))
    print("X shape: {}".format(X.shape))
    return X, Y


def check(nodes, k, emb, ind, f, ent_list):
    """
    nodes    - 2d array of nodes we want to check
    k        - nearest neighbours
    emb      - a 2-d numpy array of embeddings
    ind      - index built with faiss
    f        - file containing urls
    ent_list - list of entities id
    """
    if len(nodes) == 1:
        dist, ind = ind.search(nodes.reshape(1, -1), k)
    else:
        dist, ind = ind.search(nodes, k)
    for row in ind:
        source = int(ent_list[row[0]])
        print('\x1b[0;35;43m' + '{} nearest neighbours of node {}'.format(
            k - 1, source) + '\x1b[0m')
        print('\x1b[0;35;43m' + linecache.getline(f, source + 1) + '\x1b[0m')
        for node in row[1:]:
            neighbor = int(ent_list[node])
            print("  node {}, {}".format(
                node, linecache.getline(f, neighbor + 1)))

In [5]:
basename = "cnr-2000"
embeddings = load_XY(basename)
X = embeddings[0]
X[0]

Loading data..
Labels not defined
X shape: (325557, 64)


array([-0.0001718 ,  0.0087695 , -0.02964852, -0.01956584, -0.01648358,
       -0.00888848,  0.01993223, -0.01380344, -0.01544364, -0.00261269,
       -0.02519534,  0.004574  , -0.01579152,  0.00774693,  0.00024875,
       -0.0116387 ,  0.02761276,  0.00306257, -0.00216838,  0.01078875,
       -0.0264635 , -0.00381334,  0.01533207, -0.01215552, -0.01134059,
        0.01552172, -0.01898626, -0.00296632, -0.01627149,  0.01921202,
       -0.03555609,  0.00344647,  0.01180901,  0.00941313,  0.02471872,
       -0.04890871, -0.02605497,  0.01141761, -0.00338765,  0.0027696 ,
       -0.01621617,  0.03093353,  0.00815312, -0.00166487,  0.00991619,
       -0.02566224,  0.00927695,  0.00837251,  0.01326454,  0.0063187 ,
        0.02874073, -0.01070895, -0.00283655, -0.01973416,  0.00305778,
        0.01326166,  0.04523369,  0.01505135,  0.01444066,  0.01585885,
        0.01686572,  0.0199776 ,  0.00878629,  0.00935664], dtype=float32)

In [6]:
ent_list = get_entities_list(basename)
ent_list[:10]

[179413, 4123, 276766, 203406, 199888, 305090, 301572, 304259, 176797, 76956]

In [7]:
ind = train_search(X)

Index trained: True
Index total: 325557


In [8]:
urls_f = '/data/graphs/cnr-2000/cnr-2000.urls'
check(X[:2], 10, X, ind, urls_f, ent_list)

[0;35;43m9 nearest neighbours of node 179413[0m
[0;35;43mhttp://www.igbe.pv.cnr.it/posizioni/bandi2000/schema3.html
[0m
  node 159937, http://www.igbe.pv.cnr.it/posizioni/bandi2000/schema6.html

  node 289531, http://www.igbe.pv.cnr.it/posizioni/bandi2000/schema1.html

  node 263443, http://www.igbe.pv.cnr.it/posizioni/2001/schema05.html

  node 13645, http://www.igbe.pv.cnr.it/posizioni/2001/schema04.html

  node 306396, http://www.igbe.pv.cnr.it/posizioni/bandi2000/schema2.html

  node 101636, http://www.igbe.pv.cnr.it/posizioni/bandi2000/schema9.html

  node 8968, http://www.igbe.pv.cnr.it/posizioni/2003/bando_01_03.html

  node 305176, http://www.igbe.pv.cnr.it/posizioni/2001/schema02.html

  node 249529, http://www.igbe.pv.cnr.it/posizioni/bandi2000/assegno4.html

[0;35;43m9 nearest neighbours of node 4123[0m
[0;35;43mhttp://www.isti.cnr.it/Intranet/RISeT/
[0m
  node 314250, http://www.isti.cnr.it/Miscellanea/Other/

  node 36761, http://www.isti.cnr.it/Events/Courses/

  

In [9]:
perm = np.argsort(ent_list)
perm[:10]

array([ 41419,  89814, 151940,  77547, 263177, 277660, 308368,  78840,
       155331, 227672])

In [10]:
X = X[perm]
ind = train_search(X)
nodes = [179413, 4123]
check(X[nodes], 10, X, ind, urls_f, [i for i in range(len(X))])

Index trained: True
Index total: 325557
[0;35;43m9 nearest neighbours of node 179413[0m
[0;35;43mhttp://www.igbe.pv.cnr.it/posizioni/bandi2000/schema3.html
[0m
  node 179415, http://www.igbe.pv.cnr.it/posizioni/bandi2000/schema6.html

  node 179361, http://www.igbe.pv.cnr.it/posizioni/bandi2000/schema1.html

  node 179368, http://www.igbe.pv.cnr.it/posizioni/2001/schema05.html

  node 179401, http://www.igbe.pv.cnr.it/posizioni/2001/schema04.html

  node 179411, http://www.igbe.pv.cnr.it/posizioni/bandi2000/schema2.html

  node 179417, http://www.igbe.pv.cnr.it/posizioni/bandi2000/schema9.html

  node 179396, http://www.igbe.pv.cnr.it/posizioni/2003/bando_01_03.html

  node 179343, http://www.igbe.pv.cnr.it/posizioni/2001/schema02.html

  node 179409, http://www.igbe.pv.cnr.it/posizioni/bandi2000/assegno4.html

[0;35;43m9 nearest neighbours of node 4123[0m
[0;35;43mhttp://www.isti.cnr.it/Intranet/RISeT/
[0m
  node 3740, http://www.isti.cnr.it/Miscellanea/Other/

  node 3861, ht

In [None]:
def create(n, constructor=list):
    for _ in range(n):
        yield constructor()
        
        
def nodes_from_ascii(basename, in_nodes=False):
    ascii_path = Path("/data/graphs") / basename / ("ascii.graph-txt")
    assert ascii_path.exists(), "Graph not found!"
    with ascii_path.open() as f:
        line = f.readline()
        V = int(line.split()[0])
        print("{} vertices".format(V))
        print("reading..")
        nodes = list(create(V))
        singleton = 0
        for i in trange(V):
            line = f.readline()
            if line[0] == "\n":
                singleton +=1
            else:
                if in_nodes:
                    for node in line.split():
                        nodes[int(node)].append(i)
                else:
                    nodes[i] = [int(j) for j in line.split()]
        print("Found {} singleton nodes".format(singleton))
    return nodes

## Mean Average Precision

In [12]:
embeddings = load_XY(basename)
X = embeddings[0]
out_nodes = nodes_from_ascii(basename)
assert len(X) == len(out_nodes)
ent_list = get_entities_list(basename)
perm = np.argsort(ent_list)
X = X[perm]
ind = train_search(X)

Loading data..
Labels not defined
X shape: (325557, 64)
325557 vertices
reading..


100%|██████████| 325557/325557 [00:01<00:00, 192938.78it/s]


Index trained: True
Index total: 325557


In [17]:
def precision_score(node_ranking, neighs):
    """
    Compute the precision score as explained in
    https://dawn.cs.stanford.edu/2018/03/19/hyperbolics/
    Input:
        - node_ranking (np.array)
        - neighs (list)
    Output:
        - precision score (float)
    """
    # note: positions starting from 1 --> add 1
    neighs_ranks = np.in1d(node_ranking, neighs).nonzero()[0] + 1
    neighs_card = np.arange(len(neighs_ranks)) + 1
    node_score = neighs_card / neighs_ranks
    return node_score.sum() / len(neighs)


def map_score(X, out_nodes, ind, neigh_num=50):
    """
    Compute the map score of the given embedding.
    If the number of neighbours of the current node
    is bigger than the one given as input, returns
    the current node as an outlier.
    Input:
        - X (np.array), embeddings
        - nodes (list[list]), neighbours of each node
        - ind (faiss index), index used to compute L2
                            distances for the embeddings
        - neigh_num (int), number of neighbours considered
    Output:
        - score (float), map score
        - outliers (list)
        - singleton, number of singleton nodes
    """
    outliers = []
    score = 0
    singleton = 0
    _, ranking = ind.search(X, neigh_num)
    for node, neighs in enumerate(out_nodes):
        Na = len(neighs)
        if Na == 0:
            singleton += 1
        elif Na > neigh_num // 2:
            outliers.append(node)
        else:
            # start from index=1 to not consider the node itself
            Ra = ranking[node, 1:]
            score += precision_score(Ra, neighs)
    return score, outliers, singleton


In [15]:
n = 1000

In [18]:
map_score(X[:n], out_nodes[:n], ind)

(56.266225634464824,
 [403,
  562,
  595,
  600,
  650,
  652,
  653,
  669,
  670,
  671,
  677,
  687,
  688,
  689,
  690,
  691,
  697,
  698,
  699,
  700,
  702,
  705,
  706,
  707,
  711,
  712,
  714,
  716,
  717,
  718,
  757,
  796,
  797,
  799,
  802,
  805,
  810,
  812,
  816,
  817,
  818,
  828,
  852,
  853,
  854,
  855,
  856,
  857,
  859,
  860,
  861,
  863],
 306)

In [19]:
%lprun -f map_score map_score(X[:n], out_nodes[:n], ind)

## Map on all the dataset

In [20]:
def dataset_map(X, out_nodes, neighs=50):
    """
    Compute the MAP score on all the embeddings
    given as input.
    """
    ind = train_search(X)
    if len(X) > 10000:
        n = 10000
        iters = len(X) // n
        splits = np.array_split(X, iters)
        out_node_split = np.array_split(out_nodes, iters)
    else:
        iters = 1
        splits = X
        out_node_split = out_nodes
    score = 0
    singleton = 0
    for data, nodes in tqdm(zip(splits, out_node_split), total=iters):
        split_score, outliers, sing = map_score(
            data, nodes, ind, neigh_num=neighs)
        singleton += sing
        score += split_score
        while len(outliers) > 0:
            neighs *= 2
            split_score, outliers, _ = map_score(
                data[outliers], nodes[outliers], ind, neigh_num=neighs)
            score += split_score
    return score / (len(X) - singleton)



In [21]:
s = dataset_map(X, out_nodes)

Index trained: True
Index total: 325557


100%|██████████| 32/32 [02:45<00:00,  5.18s/it]


In [22]:
s

0.10081008928523795

## In nodes map

In [26]:
in_nodes = nodes_from_ascii("cnr-2000", in_nodes=True)

325557 vertices
reading..


100%|██████████| 325557/325557 [00:02<00:00, 158532.32it/s]


In [27]:
dataset_map(X, in_nodes)

Index trained: True
Index total: 325557


100%|██████████| 32/32 [09:39<00:00, 18.10s/it]


0.08374297111694988

## in_neighbours and out_neighbours simultaneously

In [28]:
out_nodes = nodes_from_ascii("cnr-2000")

325557 vertices
reading..


100%|██████████| 325557/325557 [00:01<00:00, 195031.31it/s]


In [29]:
out_nodes[:10]

[[1, 4, 8, 219, 220],
 [0, 7, 8, 219, 220],
 [3, 4, 8, 219, 220],
 [2, 8, 9, 219, 220],
 [0, 2, 8, 219, 220],
 [6, 8, 45, 219, 220],
 [5, 7, 8, 219, 220],
 [1, 6, 8, 219, 220],
 [0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 54, 64, 146, 156],
 [3, 8, 10, 219, 220]]

In [30]:
in_nodes[:10]

[[1, 4, 8],
 [0, 7, 8],
 [3, 4, 8],
 [2, 8, 9],
 [0, 2, 8],
 [6, 8, 45],
 [5, 7, 8],
 [1, 6, 8],
 [0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 54, 64],
 [3, 8, 10]]

In [31]:
n = len(out_nodes)
nodes = list(create(n))
for i in trange(n):
    nodes[i] = list(set(out_nodes[i]) | set(in_nodes[i]))

100%|██████████| 325557/325557 [00:01<00:00, 276473.75it/s]


In [32]:
nodes[0]

[1, 4, 8, 219, 220]

In [33]:
dataset_map(X, nodes)

Index trained: True
Index total: 325557


100%|██████████| 32/32 [10:33<00:00, 19.81s/it]


0.09755102669741487

### sanity check on the edges

In [34]:
edges = 0
for i in out_nodes:
    edges += len(i)
edges

3216152

In [35]:
edges = 0
for i in in_nodes:
    edges += len(i)
edges

3216152

### KarateClub

In [37]:
nodes_from_ascii("KarateClub")

AssertionError: Graph not found!

In [38]:
import re

karate = Path("/data/graphs/KarateClub/KarateClub.tab")
nodes = list(create(34))

with karate.open() as f:
    for line in f:
        two_nodes = re.findall("[0-9]+", line)
        node1, node2 = int(two_nodes[0]), int(two_nodes[1])
        nodes[node1].append(node2)
        
nodes[:5]

[[1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 17, 19, 21, 31],
 [0, 2, 3, 7, 13, 17, 19, 21, 30],
 [0, 1, 3, 7, 8, 9, 13, 27, 28, 32],
 [0, 1, 2, 7, 12, 13],
 [0, 6, 10]]

In [39]:
basename = "KarateClub"

embeddings = load_XY(basename)
X = embeddings[0]
assert len(X) == len(nodes)
ent_list = get_entities_list(basename)
perm = np.argsort(ent_list)
X = X[perm]
ind = train_search(X)

Loading data..
Labels not defined
X shape: (34, 64)
Index trained: True
Index total: 34


In [40]:
_, I = ind.search(X, 34)

In [41]:
nodes[:5]

[[1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 17, 19, 21, 31],
 [0, 2, 3, 7, 13, 17, 19, 21, 30],
 [0, 1, 3, 7, 8, 9, 13, 27, 28, 32],
 [0, 1, 2, 7, 12, 13],
 [0, 6, 10]]

In [42]:
I[:5]

array([[ 0, 21, 17, 19,  5,  3,  7,  4, 16, 10, 13, 12, 24,  1, 31,  6,
        15,  2, 28, 32,  8,  9, 26, 20, 33, 25, 27, 14, 18, 30, 29, 22,
        23, 11],
       [ 1,  7, 13,  3, 21, 19, 17,  0,  2, 12,  8,  9,  5, 30, 28, 16,
         4, 15, 24, 31, 10, 32, 33, 20,  6, 27, 14, 26, 18, 22, 25, 29,
        23, 11],
       [ 2, 13,  9, 28,  7,  3,  8, 19,  1, 31, 21, 24, 33,  0, 30, 32,
        15, 27, 17,  5, 16, 12, 20, 14, 18, 25, 22, 26,  4, 29, 23, 10,
         6, 11],
       [ 3,  7, 13, 21,  0, 12, 19,  1, 17,  2,  5, 16, 28,  4,  9, 24,
        31,  8, 15, 10, 32, 33, 30, 27,  6, 20, 26, 14, 25, 18, 22, 29,
        23, 11],
       [ 4, 10,  5, 16,  6,  0, 17, 21, 19, 24,  3, 31, 15,  7, 12, 13,
        32, 26, 25, 28, 20, 14, 29, 18,  2,  8,  1, 33, 22,  9, 11, 27,
        23, 30]])

In [43]:
for i in range(34):
    print("node: {}, score: {}".format(i, precision_score(I[i, 1:], nodes[i])))

node: 0, score: 0.8953950665347725
node: 1, score: 0.9658119658119657
node: 2, score: 0.8678619909502261
node: 3, score: 0.8218253968253969
node: 4, score: 0.7000000000000001
node: 5, score: 0.7041666666666666
node: 6, score: 0.6791666666666667
node: 7, score: 0.6678571428571429
node: 8, score: 0.7559523809523809
node: 9, score: 0.7
node: 10, score: 0.7555555555555555
node: 11, score: 0.16666666666666666
node: 12, score: 0.8333333333333333
node: 13, score: 0.5176190476190475
node: 14, score: 0.29166666666666663
node: 15, score: 0.7
node: 16, score: 0.5909090909090909
node: 17, score: 0.34090909090909094
node: 18, score: 0.325
node: 19, score: 0.2609649122807018
node: 20, score: 0.29166666666666663
node: 21, score: 0.39285714285714285
node: 22, score: 0.325
node: 23, score: 0.8392857142857142
node: 24, score: 1.0
node: 25, score: 0.9166666666666666
node: 26, score: 0.625
node: 27, score: 0.4333333333333333
node: 28, score: 0.7222222222222222
node: 29, score: 0.8611111111111112
node: 30,

How to check if calculations are correct? 

Check the precision for node 11

In [44]:
nodes[11]

[0]

In [45]:
I[11]

array([11,  6, 10,  4,  5, 16,  0, 17, 12, 21, 24, 19, 31, 25, 15, 26,  3,
        7, 29, 32, 20, 14, 13, 18, 28, 22, 23, 27, 33,  1,  2,  8,  9, 30])

It should be 1/6 and indeed is 0.16

In [46]:
score, _, _ = map_score(X, nodes, ind, neigh_num=34)

In [47]:
score / 34

0.6587343742942676

# Write function

In [49]:
X.shape

(34, 64)

In [48]:
dataset_map(X, nodes, neighs=34)

  0%|          | 0/1 [00:00<?, ?it/s]

Index trained: True
Index total: 34





ValueError: not enough values to unpack (expected 2, got 1)