In [1]:
%load_ext line_profiler

import numpy as np
import linecache
from pathlib import Path

In [2]:
import sys

p = Path('.').resolve()
sys.path.append(str(p.parent))
sys.path

['/home/user/miniconda/envs/py36/lib/python36.zip',
 '/home/user/miniconda/envs/py36/lib/python3.6',
 '/home/user/miniconda/envs/py36/lib/python3.6/lib-dynload',
 '',
 '/home/user/miniconda/envs/py36/lib/python3.6/site-packages',
 '/home/user/miniconda/envs/py36/lib/python3.6/site-packages/IPython/extensions',
 '/home/user/.ipython',
 '/app/biggraph']

In [3]:
from utils.faiss_utils import train_search
from utils.data_utils import *

Loading faiss with AVX2 support.


In [4]:
def load_XY(basename):
    """
    Load embeddings (X) and possibly the
    labels (Y) of the graph {basename}.
    """
    model_path = Path("/data/models") / basename
    print("Loading data..")
    X, Y = load_data(model_path)
    classes = len(np.unique(Y))
    print("X shape: {}".format(X.shape))
    return X, Y


def precision(Na, Ra_bi, bi):
    """
    Assuming Na and Ra_bi are 2
    arrays of indices.
    """
    set1 = set(Na)
    try:
        index = np.where(Ra_bi == bi)[0][0]
        set2 = set(Ra_bi[:index + 1])
        return len(set1.intersection(set2)) / len(set2)
    except IndexError:
        return 0


def map_score(graph):

    out_nodes, in_nodes, out_degree, in_degree = read_ascii_graph(graph)
    print("Computing map score")
    V = len(out_nodes)
    score = 0
    for node in graph:
        node_score = 0
        Na = len(out_nodes)
        for neighbor in node.neighbours:
            node_score += precision(node, neighbor)
        score += node_score / Na
    return score / V


def out_map_score(basename, ind):
    
    out_nodes = read_ascii(basename)
    print("Computing out nodes map score")
    score = 0
    for node, neighs in enumerate(out_nodes):
        node_score = 0
        Na = len(neighs)
        for neighbor in neighs:
            node_score += precision(node, neighbor)
        score += node_score / Na
    return score / V


def read_ascii(basename, rm_singleton=False):
    """

    """
    ascii_path = Path("/data/graphs") / basename / ("ascii.graph-txt")
    assert ascii_path.exists(), "Graph not found!"
    with ascii_path.open() as f:
        line = f.readline()
        V = int(line.split()[0])
        print("{} vertices".format(V))
        print("reading..")
        out_nodes = [0] * V
        for i in trange(V):
            line = f.readline()
            if line[0] == "\n" and not rm_singleton:
                # don't remove singleton
                # assume node is linked to itself
                out_ = np.array([i])
            else:
                out_ = np.fromstring(line, dtype=np.int32, sep=' ')
            out_nodes[i] = out_
    return out_nodes


def check(nodes, k, emb, ind, f, ent_list):
    """
    nodes    - 2d array of nodes we want to check
    k        - nearest neighbours
    emb      - a 2-d numpy array of embeddings
    ind      - index built with faiss
    f        - file containing urls
    ent_list - list of entities id
    """
    if len(nodes) == 1:
        dist, ind = ind.search(nodes.reshape(1, -1), k)
    else:
        dist, ind = ind.search(nodes, k)
    for row in ind:
        source = int(ent_list[row[0]])
        print('\x1b[0;35;43m' + '{} nearest neighbours of node {}'.format(
            k - 1, source) + '\x1b[0m')
        print('\x1b[0;35;43m' + linecache.getline(f, source + 1) + '\x1b[0m')
        for node in row[1:]:
            neighbor = int(ent_list[node])
            print("  node {}, {}".format(
                node, linecache.getline(f, neighbor + 1)))

In [5]:
basename = "cnr-2000"
embeddings = load_XY(basename)
X = embeddings[0]
X[0]

Loading data..
Labels not defined
X shape: (325557, 64)


array([-0.0001718 ,  0.0087695 , -0.02964852, -0.01956584, -0.01648358,
       -0.00888848,  0.01993223, -0.01380344, -0.01544364, -0.00261269,
       -0.02519534,  0.004574  , -0.01579152,  0.00774693,  0.00024875,
       -0.0116387 ,  0.02761276,  0.00306257, -0.00216838,  0.01078875,
       -0.0264635 , -0.00381334,  0.01533207, -0.01215552, -0.01134059,
        0.01552172, -0.01898626, -0.00296632, -0.01627149,  0.01921202,
       -0.03555609,  0.00344647,  0.01180901,  0.00941313,  0.02471872,
       -0.04890871, -0.02605497,  0.01141761, -0.00338765,  0.0027696 ,
       -0.01621617,  0.03093353,  0.00815312, -0.00166487,  0.00991619,
       -0.02566224,  0.00927695,  0.00837251,  0.01326454,  0.0063187 ,
        0.02874073, -0.01070895, -0.00283655, -0.01973416,  0.00305778,
        0.01326166,  0.04523369,  0.01505135,  0.01444066,  0.01585885,
        0.01686572,  0.0199776 ,  0.00878629,  0.00935664], dtype=float32)

In [6]:
ent_list = get_entities_list(basename)
ent_list[:10]

[179413, 4123, 276766, 203406, 199888, 305090, 301572, 304259, 176797, 76956]

In [7]:
ind = train_search(X)

Index trained: True
Index total: 325557


In [8]:
urls_f = '/data/graphs/cnr-2000/cnr-2000.urls'
check(X[:2], 10, X, ind, urls_f, ent_list)

[0;35;43m9 nearest neighbours of node 179413[0m
[0;35;43mhttp://www.igbe.pv.cnr.it/posizioni/bandi2000/schema3.html
[0m
  node 159937, http://www.igbe.pv.cnr.it/posizioni/bandi2000/schema6.html

  node 289531, http://www.igbe.pv.cnr.it/posizioni/bandi2000/schema1.html

  node 263443, http://www.igbe.pv.cnr.it/posizioni/2001/schema05.html

  node 13645, http://www.igbe.pv.cnr.it/posizioni/2001/schema04.html

  node 306396, http://www.igbe.pv.cnr.it/posizioni/bandi2000/schema2.html

  node 101636, http://www.igbe.pv.cnr.it/posizioni/bandi2000/schema9.html

  node 8968, http://www.igbe.pv.cnr.it/posizioni/2003/bando_01_03.html

  node 305176, http://www.igbe.pv.cnr.it/posizioni/2001/schema02.html

  node 249529, http://www.igbe.pv.cnr.it/posizioni/bandi2000/assegno4.html

[0;35;43m9 nearest neighbours of node 4123[0m
[0;35;43mhttp://www.isti.cnr.it/Intranet/RISeT/
[0m
  node 314250, http://www.isti.cnr.it/Miscellanea/Other/

  node 36761, http://www.isti.cnr.it/Events/Courses/

  

In [9]:
perm = np.argsort(ent_list)
perm[:10]

array([ 41419,  89814, 151940,  77547, 263177, 277660, 308368,  78840,
       155331, 227672])

In [10]:
X = X[perm]
ind = train_search(X)
nodes = [179413, 4123]
check(X[nodes], 10, X, ind, urls_f, [i for i in range(len(X))])

Index trained: True
Index total: 325557
[0;35;43m9 nearest neighbours of node 179413[0m
[0;35;43mhttp://www.igbe.pv.cnr.it/posizioni/bandi2000/schema3.html
[0m
  node 179415, http://www.igbe.pv.cnr.it/posizioni/bandi2000/schema6.html

  node 179361, http://www.igbe.pv.cnr.it/posizioni/bandi2000/schema1.html

  node 179368, http://www.igbe.pv.cnr.it/posizioni/2001/schema05.html

  node 179401, http://www.igbe.pv.cnr.it/posizioni/2001/schema04.html

  node 179411, http://www.igbe.pv.cnr.it/posizioni/bandi2000/schema2.html

  node 179417, http://www.igbe.pv.cnr.it/posizioni/bandi2000/schema9.html

  node 179396, http://www.igbe.pv.cnr.it/posizioni/2003/bando_01_03.html

  node 179343, http://www.igbe.pv.cnr.it/posizioni/2001/schema02.html

  node 179409, http://www.igbe.pv.cnr.it/posizioni/bandi2000/assegno4.html

[0;35;43m9 nearest neighbours of node 4123[0m
[0;35;43mhttp://www.isti.cnr.it/Intranet/RISeT/
[0m
  node 3740, http://www.isti.cnr.it/Miscellanea/Other/

  node 3861, ht

## Mean Average Precision

In [11]:
embeddings = load_XY(basename)
X = embeddings[0]
out_nodes = read_ascii(basename)
assert len(X) == len(out_nodes)
ent_list = get_entities_list(basename)
perm = np.argsort(ent_list)
X = X[perm]
ind = train_search(X)

  0%|          | 0/325557 [00:00<?, ?it/s]

Loading data..
Labels not defined
X shape: (325557, 64)
325557 vertices
reading..


100%|██████████| 325557/325557 [00:01<00:00, 224448.03it/s]


Index trained: True
Index total: 325557


In [12]:
from tqdm import tqdm


def np_precision(Na, Ra, bi):
    try:
        index = np.where(Ra == bi)[0][0]
        Ra_bi = Ra[:index + 1]
        intersection = len(np.intersect1d(Na, Ra_bi))
        return intersection / len(Ra_bi)
    except IndexError:
        return 0


def np_temp(X, out_nodes, ind):
    score = 0
    _, I = ind.search(X, 50)
    for node, neighs in enumerate(tqdm(out_nodes)):
        node_score = 0
        Na = len(neighs)
        if Na > 50:
            _, temp_I = ind.search(X[node].reshape(1, -1), Na*5)
            Ra_bi = temp_I.flatten()
        else:
            Ra_bi = I[node]
        for neighbor in neighs:
            node_score += np_precision(neighs, Ra_bi, neighbor)
        score += node_score / Na
    return score / len(out_nodes)


def precision_score(ind, neighs):
    neighs_ranks = np.in1d(ind, neighs).nonzero()[0] + 1
    neighs_card = np.arange(len(neighs_ranks)) + 1
    node_score = neighs_card / neighs_ranks
    return node_score.sum() / len(neighs)


def map_score(X, out_nodes, ind):
    score = 0
    _, I = ind.search(X, 50)
    for node, neighs in enumerate(tqdm(out_nodes)):
        node_score = 0
        Na = len(neighs)
        if Na > 50:
            _, temp_I = ind.search(X[node].reshape(1, -1), Na*5)
            Ra = temp_I.flatten()
        else:
            Ra = I[node]
        score += precision_score(I[node], neighs)
    return score / len(out_nodes)

In [13]:
n = 10000

In [14]:
np_temp(X[:n], out_nodes[:n], ind)

100%|██████████| 10000/10000 [00:05<00:00, 1936.25it/s]


0.4248102933981877

In [15]:
map_score(X[:n], out_nodes[:n], ind)

100%|██████████| 10000/10000 [00:01<00:00, 6216.72it/s]


0.422109098674668

In [16]:
%lprun -f np_temp np_temp(X[:n], out_nodes[:n], ind)

100%|██████████| 10000/10000 [00:08<00:00, 1172.38it/s]


In [17]:
%lprun -f map_score map_score(X[:n], out_nodes[:n], ind)

100%|██████████| 10000/10000 [00:02<00:00, 4604.61it/s]


## Map on all the dataset

In [39]:

def map_score(X, out_nodes, ind, neigh_num=50):
    outliers = []
    score = 0
    _, I = ind.search(X, neigh_num)
    for node, neighs in enumerate(tqdm(out_nodes)):
        node_score = 0
        Na = len(neighs)
        if Na > neigh_num // 2:
            outliers.append(node)
        else:
            Ra = I[node]
        score += precision_score(I[node], neighs)
    V = len(out_nodes) - len(outliers)
    return score / V, outliers


In [30]:
n = 10000
score, outliers = map_score(X[:n], out_nodes[:n], ind)

100%|██████████| 10000/10000 [00:00<00:00, 16683.50it/s]


In [31]:
score

0.43896536883804904

In [32]:
len(outliers)

384

In [33]:
outliers[:10]

[403, 562, 595, 600, 650, 652, 653, 669, 670, 671]

In [44]:
def map_score(X, out_nodes, ind, neigh_num=50):
    outliers = []
    score = 0
    _, I = ind.search(X, neigh_num)
    for node, neighs in enumerate(out_nodes):
        node_score = 0
        Na = len(neighs)
        if Na > neigh_num // 2:
            outliers.append(node)
        else:
            Ra = I[node]
        score += precision_score(I[node], neighs)
    return score, outliers


def dataset_map(X, out_nodes):
    
    ind = train_search(X)
    n = 10000
    iters = len(X) // n
    splits = np.array_split(X, iters)
    out_node_split = np.array_split(out_nodes, iters)
    score = 0
    for data, nodes in tqdm(zip(splits, out_node_split), total=iters):
        neighs = 50
        split_score, outliers = map_score(data, nodes, ind, neigh_num=neighs)
        score += split_score
        while len(outliers) > 0:
            neighs *= 2
            split_score, outliers = map_score(data[outliers], nodes[outliers], ind, neighs)
            score += split_score
    return score / len(X)

In [45]:
n = 100000
s = dataset_map(X[:n], out_nodes[:n])
s

  0%|          | 0/10 [00:00<?, ?it/s]

Index trained: True
Index total: 100000


100%|██████████| 10/10 [00:14<00:00,  1.43s/it]


0.345055162417955

In [46]:
s = dataset_map(X, out_nodes)

  0%|          | 0/32 [00:00<?, ?it/s]

Index trained: True
Index total: 325557


100%|██████████| 32/32 [01:23<00:00,  2.62s/it]


In [47]:
s

0.3631427064146329

## In nodes map

In [82]:
from collections import defaultdict



def read_ascii_in(basename, rm_singleton=False):
    """

    """
    ascii_path = Path("/data/graphs") / basename / ("ascii.graph-txt")
    assert ascii_path.exists(), "Graph not found!"
    with ascii_path.open() as f:
        line = f.readline()
        V = int(line.split()[0])
        print("{} vertices".format(V))
        print("reading..")
        in_nodes = defaultdict(list)
        for i in trange(V):
            line = f.readline()
            if line[0] == "\n" and not rm_singleton:
                # don't remove singleton
                # assume node is linked to itself
                continue
            else:
                for node in line.split():
                    in_nodes[int(node)].append(i)
    return in_nodes

In [83]:
a = read_ascii_in("cnr-2000")

  9%|▉         | 30804/325557 [00:00<00:02, 115175.04it/s]

325557 vertices
reading..


100%|██████████| 325557/325557 [00:02<00:00, 122054.81it/s]


In [85]:
for i in range(10):
    print(a[i])

[1, 4, 8]
[0, 7, 8]
[3, 4, 8]
[2, 8, 9]
[0, 2, 8]
[6, 8, 45]
[5, 7, 8]
[1, 6, 8]
[0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 54, 64]
[3, 8, 10]
