In [1]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import linecache
from pathlib import Path
from tqdm import tqdm
from sklearn import metrics
import sys

# Append the parent of the current working directory to sys.path.
# Presumably this is what makes the project-local `utils` package
# importable in the next cell -- TODO confirm the repository layout.
p = Path('.').resolve()
sys.path.append(str(p.parent))

In [2]:
from utils.faiss_utils import *
from utils.data_utils import *

Loading faiss with AVX2 support.


In [3]:
import faiss


def load_XY(basename):
    """
    Load embeddings (X) and possibly the
    labels (Y) of the graph {basename}.
    Input:
        - basename (str), name of the model directory
                          under /data/models
    Output:
        - X, embeddings as returned by load_data
        - Y, labels as returned by load_data
    Side effects:
        prints a progress message and the shape of X.
    """
    model_path = Path("/data/models") / basename
    print("Loading data..")
    X, Y = load_data(model_path)
    # The number of distinct labels used to be computed here but was never
    # used; dropping it avoids a pointless pass over Y.
    print("X shape: {}".format(X.shape))
    return X, Y


def centroid_neigh(basename, k_means, X, n=15, graph_basename="itwiki-2013"):
    """
    Find the n-nearest neighbours to k-means
    cluster centroids and print their names.
    Input:
        - basename (str), name of the model, used to
                          load the list of entities
        - k_means, object exposing the cluster centres
                          as `centroids`
        - X, embeddings; X.shape[1] is taken as their
                          dimensionality
        - n (int), number of neighbours per centroid
        - graph_basename (str), name of the graph whose
                          .ids file holds the node names;
                          it may differ from the model name
                          (e.g. for a partitioned model), so
                          it is passed separately
    """
    d = X.shape[1]
    index = faiss.IndexFlatL2(d)
    index.add(X)
    # Only the neighbour positions are needed; distances are discarded.
    _, I = index.search(k_means.centroids, n)
    entities = get_entities_list(basename)
    find_neighbours(graph_basename, I, entities)


def find_neighbours(basename, idx, entities):
    """
    Helper function for centroid_neigh.
    Print, cluster by cluster, the names of the given nodes.
    Input:
        - basename (str), name of the graph; names are read from
                          /data/graphs/{basename}/{basename}.ids
        - idx (iterable of iterables), for each cluster the
                          positions (into entities) of the nodes
        - entities (sequence), maps a position to the 0-based
                          line of the node in the .ids file;
                          entries may be ints or numeric strings
    """
    urls_file = Path('/data/graphs/') / basename / (basename + '.ids')
    f = urls_file.as_posix()
    for pos, cluster in enumerate(idx):
        print("\x1b[0;35;43m Cluster {} \x1b[0m".format(pos))
        for node in cluster:
            # Entity ids read from JSON are strings; coerce so the line
            # arithmetic below works for both ints and numeric strings.
            line = int(entities[node])
            # linecache counts lines from 1 and keeps the trailing newline,
            # so every printed name is followed by a blank line.
            print(linecache.getline(f, line + 1))

Here we use the embeddings learnt on the 2013 Italian version of Wikipedia. To learn these embeddings we (randomly) split the data into 10 partitions. We now only consider the first partition.

In [4]:
# Name of the trained model; used to build the model directory path.
basename = "itwiki-2013_partitioned"
# Ids file of the unpartitioned graph. NOTE(review): in the visible cells this
# is only referenced from commented-out code.
f = "/data/graphs/itwiki-2013/itwiki-2013.ids"

In [5]:
model_path = Path("/data/models") / basename
# Entity names of partition 0 (presumably entry i describes row i of the
# embeddings loaded below -- TODO confirm).
with (model_path / "entity_names_link_0.json").open() as tf:
    entities_list = json.load(tf)
# Sort the matches so the chosen file does not depend on directory order.
hf_path = sorted(model_path.glob("embeddings_link_0*.h5"))[0]
# Open read-only and release the HDF5 handle as soon as the embeddings have
# been copied into memory.
with h5py.File(hf_path, "r") as hf:
    x = hf["embeddings"][:]
# Optional sanity check (disabled): nearest neighbours of the first embedding.
# idx = train_search(x)
# _, I = idx.search(x[0].reshape(1, -1), 20)
# for i in I.flatten():
#    line = int(entities_list[i]) + 1
#    print(linecache.getline(f, line))

To measure the quality of the clusters we will use the Silhouette score.

In [6]:
# Show the scikit-learn docstring of the metric for reference.
help(metrics.silhouette_score)

Help on function silhouette_score in module sklearn.metrics.cluster.unsupervised:

silhouette_score(X, labels, metric='euclidean', sample_size=None, random_state=None, **kwds)
    Compute the mean Silhouette Coefficient of all samples.
    
    The Silhouette Coefficient is calculated using the mean intra-cluster
    distance (``a``) and the mean nearest-cluster distance (``b``) for each
    sample.  The Silhouette Coefficient for a sample is ``(b - a) / max(a,
    b)``.  To clarify, ``b`` is the distance between a sample and the nearest
    cluster that the sample is not a part of.
    Note that Silhouette Coefficient is only defined if number of labels
    is 2 <= n_labels <= n_samples - 1.
    
    This function returns the mean Silhouette Coefficient over all samples.
    To obtain the values for each sample, use :func:`silhouette_samples`.
    
    The best value is 1 and the worst value is -1. Values near 0 indicate
    overlapping clusters. Negative values generally indicate tha

# 5 clusters

We use k-means with 5 centroids and then calculate the Silhouette score.

In [7]:
# Cluster the partition's embeddings with 5 centroids.
itwiki_kmeans = kmeans(x, 5, niter=100)
# Searching for the single nearest centroid gives each embedding its label.
D, I = itwiki_kmeans.index.search(x, 1)
cluster_labels = I.flatten()
silhouette = metrics.silhouette_score(x, cluster_labels)
print("Silhouette Coefficient: %0.3f" % silhouette)

Silhouette Coefficient: 0.179


Let's see which nodes are closest to the cluster centroids.

In [8]:
# Print the 10 entities nearest to each of the centroids.
centroid_neigh("itwiki-2013_partitioned", itwiki_kmeans, x, n=10)

[0;35;43m Cluster 0 [0m
Park Sung-Wha

Viktor Vasin

Andrej Panavić

Bartosz Bosacki

Davy Schollen

Marc Schneider

Lee Tae-Ho

Jaroslav Šilhavý

Alex Morgan

César Ibáñez

[0;35;43m Cluster 1 [0m
Fort Wayne Pistons 1951-1952

XRR

Albatrellaceae

LUI

Circuito di Zeltweg

Nikon D2H

Lungotevere degli Altoviti

Baciami adesso (Mietta)

Discografia dei Negramaro

Core 2 Quad

[0;35;43m Cluster 2 [0m
Faglia di Cadillac-Larder Lake

Coal Bed Methane

Narcosi da azoto

Ripple mark

Picnoclino

Pompa petrolifera

Associazione Nazionale Istruttori Subacquei

Costa Hamakua

Haifa Chemicals

Nano (prefisso)

[0;35;43m Cluster 3 [0m
Surfin' Bird (singolo)

Scarecrow

Innocence

Copacabana (singolo)

The Mission (colonna sonora)

Big Time

Betrayed

Adrian Edmondson

Chain Reaction

Fight Club

[0;35;43m Cluster 4 [0m
Luigi Valadier

Collegio di Santa Croce

Cattedrale di Santa Maria Assunta (San Severo)

Emo (famiglia)

Pio Panfili

Paolo Pozzo

Giuseppe Zurlo

Chiesa di Nostra Signo

# 10 clusters

We use k-means with 10 centroids and then calculate the Silhouette score.

In [9]:
# Cluster the partition's embeddings with 10 centroids.
itwiki_kmeans = kmeans(x, 10, niter=100)
# Searching for the single nearest centroid gives each embedding its label.
D, I = itwiki_kmeans.index.search(x, 1)
cluster_labels = I.flatten()
silhouette = metrics.silhouette_score(x, cluster_labels)
print("Silhouette Coefficient: %0.3f" % silhouette)

Silhouette Coefficient: 0.175


Let's see which nodes are closest to the cluster centroids.

In [10]:
# Print the 5 entities nearest to each of the centroids.
centroid_neigh("itwiki-2013_partitioned", itwiki_kmeans, x, n=5)

[0;35;43m Cluster 0 [0m
Adrien Decourcelle

Arthur Arnould

Karl von Abel

Lucien-Anatole Prévost-Paradol

Ernest Picard

[0;35;43m Cluster 1 [0m
A184

Panhard ERC

BTR-40

Force Aérienne Populaire de Benin

BA-10

[0;35;43m Cluster 2 [0m
Pericolosamente insieme

Bolero Extasy

Futureworld - 2000 anni nel futuro

L'esorcista III

La casa dei fantasmi

[0;35;43m Cluster 3 [0m
Carduus acanthoides

Sulpicio Alessandro

Charmahin

Discografia dei Negramaro

Albatrellaceae

[0;35;43m Cluster 4 [0m
Bwejuu

Lingue halmahera-cenderawasih

Bunguran

ISO 3166-2:MG

Limpopo (disambigua)

[0;35;43m Cluster 5 [0m
Trip hop

6 Feet Deep

Echoes, Silence, Patience & Grace

Cannibal Killers Live

It's Nothing

[0;35;43m Cluster 6 [0m
Viktor Vasin

Alex Morgan

Park Sung-Wha

Bartosz Bosacki

Davy Schollen

[0;35;43m Cluster 7 [0m
Collalbrigo

Cantalupo (Imola)

Castello di Buronzo

Monte Poggiolo

Duomo di Sacile

[0;35;43m Cluster 8 [0m
Foresta dell'Alto Palatinato

Ottendorf

Feldki

# 20 clusters

We use k-means with 20 centroids and then calculate the Silhouette score.

In [11]:
# Cluster the partition's embeddings with 20 centroids.
itwiki_kmeans = kmeans(x, 20, niter=100)
# Searching for the single nearest centroid gives each embedding its label.
D, I = itwiki_kmeans.index.search(x, 1)
cluster_labels = I.flatten()
silhouette = metrics.silhouette_score(x, cluster_labels)
print("Silhouette Coefficient: %0.3f" % silhouette)

Silhouette Coefficient: 0.235


Let's see which nodes are closest to the cluster centroids.

In [12]:
# Print the 5 entities nearest to each of the centroids.
centroid_neigh("itwiki-2013_partitioned", itwiki_kmeans, x, n=5)

[0;35;43m Cluster 0 [0m
Campionati europei di atletica leggera 2012 - 5000 metri piani femminili

Wang Jie

Pallavolo ai Giochi della XXVII Olimpiade

Riccardo Lione

Pallavolo ai Giochi della XXVIII Olimpiade

[0;35;43m Cluster 1 [0m
Saint-Sigismond

Saint-Privat-des-Prés

Sarry (Saona e Loira)

Souspierre

Sainte-Colombe-sur-Seine

[0;35;43m Cluster 2 [0m
Midlothian (disambigua)

Avoca

Aberdeen (disambigua)

Northfield

Plymouth (disambigua)

[0;35;43m Cluster 3 [0m
Premi BAFTA 1954

Premi BAFTA 1953

Il lupo dei mari

La matadora

Margherita della notte

[0;35;43m Cluster 4 [0m
Reazione-diffusione

Bioinformatica

Evoluzione chimica

Dominio della frequenza

Cella primitiva

[0;35;43m Cluster 5 [0m
Strada statale 62 della Cisa

Passo del Lagastrello

Dialetto della Lunigiana

Strada statale 445 della Garfagnana

Savena

[0;35;43m Cluster 6 [0m
Levent Topsakal

Petko Lazarov

Chuck Mrazovich

Éric Beugnot

Gonzalo Sagi-Vela

[0;35;43m Cluster 7 [0m
20451 Galeotti

65

# 30 clusters

We use k-means with 30 centroids and then calculate the Silhouette score.

In [13]:
# Cluster the partition's embeddings with 30 centroids.
itwiki_kmeans = kmeans(x, 30, niter=100)
# Searching for the single nearest centroid gives each embedding its label.
D, I = itwiki_kmeans.index.search(x, 1)
cluster_labels = I.flatten()
silhouette = metrics.silhouette_score(x, cluster_labels)
print("Silhouette Coefficient: %0.3f" % silhouette)

Silhouette Coefficient: 0.230


Let's see which nodes are closest to the cluster centroids.

In [14]:
# Print the 5 entities nearest to each of the centroids.
centroid_neigh("itwiki-2013_partitioned", itwiki_kmeans, x, n=5)

[0;35;43m Cluster 0 [0m
RSS Persiana

Operazione Simoom

EUROMARFOR

Special Interrogation Group

Diritti umani in Finlandia

[0;35;43m Cluster 1 [0m
Marysville

Sumner (Washington)

White Oak

Des Moines (disambigua)

Contea di Clermont

[0;35;43m Cluster 2 [0m
Petăr Mihtarski

Convocazioni per il campionato europeo di calcio Under-21 2000

Valenciennes Football Club

Coppa dei Campioni 1987-1988

Supercoppa di Francia 1997

[0;35;43m Cluster 3 [0m
Ottendorf

Foresta dell'Alto Palatinato

Trebnitz

Breitenbach

Bronkow

[0;35;43m Cluster 4 [0m
Composizione della membrana cellulare

Fluoresceina sodica

Codone

Argirofilia

Superscan

[0;35;43m Cluster 5 [0m
Il lupo dei mari

La disperata notte

La matadora

Premi BAFTA 1954

Pinky, la negra bianca

[0;35;43m Cluster 6 [0m
Mildura Grand Tennis International 2011

Surbiton Trophy 2005

Challenger of Santa Clarita

Latrobe City Tennis International 2011

Rising Star Tour 2012

[0;35;43m Cluster 7 [0m
Canberra Challenger 1

# Map score

We will now compute the Mean Average Precision (MAP) score of the embeddings obtained (considering only the out-neighbours of each node).

In [15]:
# NOTE(review): project-local import placed mid-notebook; consider moving it
# to the import cell at the top so that dependencies are visible up front.
from measure_map import map_score

# Show the helper's documented inputs and outputs for reference.
help(map_score)

Help on function map_score in module measure_map:

map_score(X, nodes, ind, neigh_num=50)
    Compute the map score of the given embedding.
    If the number of neighbours of the current node
    is bigger than the one given as input, returns
    the current node as an outlier.
    Input:
        - X (np.array), embeddings
        - nodes (list[list]), neighbours of each node
        - ind (faiss index), index used to compute L2
                            distances for the embeddings
        - neigh_num (int), number of neighbours considered
    Output:
        - score (float), map score
        - outliers (list)
        - singleton, number of singleton nodes



In [16]:
# Show the loader's documented inputs and outputs for reference.
help(nodes_from_ascii)

Help on function nodes_from_ascii in module utils.data_utils:

nodes_from_ascii(basename, in_nodes=False)
    Read nodes from ascii file.
    Input:
        - basename (str), name of the graph
        - in_nodes (bool), if True return in_nodes
    Output:
        nodes (list), list of out_nodes
                    (in_nodes) if in_nodes=True



In [17]:
# Neighbour lists of the full (unpartitioned) graph; with the default
# in_nodes=False the helper's docstring says these are the out-nodes.
out_nodes = nodes_from_ascii("itwiki-2013")

1016867 vertices
reading..


100%|██████████| 1016867/1016867 [00:11<00:00, 91369.71it/s]

Found 8215 singleton nodes





In [18]:
# Convert the entity names loaded from JSON to ints so they can be used as
# indices into out_nodes. Re-running this cell is harmless.
entities_list = [int(i) for i in entities_list]

In [19]:
# Neighbour list of every entity in this partition, still expressed in the
# ids of the full graph.
nodes = [out_nodes[i] for i in entities_list]

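We need to translate the original node ids of each neighbour into positions within this partition (i.e. rows of `x`); neighbours that ended up in other partitions are dropped.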

In [21]:
e_list_array = np.array(entities_list)

# Diagnostic: for the neighbours of the first node, show at which position
# (if any) each of them appears in this partition.
for i in nodes[0]:
    print(i, np.where(e_list_array == i)[0])

# Map each original node id to its position in this partition. Building the
# table once makes every lookup O(1) instead of a scan over the whole id
# array; setdefault keeps the first position of a repeated id, which is what
# np.where(...)[0][0] used to return.
position_of = {}
for pos, entity in enumerate(entities_list):
    position_of.setdefault(entity, pos)

# Re-express every neighbour list as positions in this partition, dropping
# neighbours that are not part of it. Passing the list itself lets tqdm show
# a total.
new_nodes = [
    [position_of[n] for n in neigh if n in position_of]
    for neigh in tqdm(nodes)
]

289638 []
375342 []
375343 []
375347 []
378815 []
388620 []
388941 [70192]
389964 []
396900 []
522319 []
532258 []
760961 []
861289 []
867371 []
867398 []


In [25]:
# Build the search index over this partition's embeddings (map_score's help
# text describes this argument as a faiss index used for L2 distances).
idx = train_search(x)
# Per map_score's help text the three results are: the score, the list of
# outliers, and the number of singleton nodes.
score, a, b = map_score(x, new_nodes, idx)
# First estimate: divide by all nodes, outliers and singletons included.
score / len(x)

Index trained: True
Index total: 101618


0.023499976674571738

This isn't the exact MAP score, since the denominator still includes the outliers (nodes with more than 50 neighbours) and the singletons (nodes with no neighbours), which do not contribute to the score.

How many outliers do we have?

In [26]:
# a is the second value returned by map_score (documented as the outliers).
len(a)

658

How many singletons?

In [27]:
# b is the third value returned by map_score (documented as the number of
# singleton nodes).
b

28900

We have so many singletons because, when the nodes are partitioned, some of a node's neighbours end up in different partitions.



Just out of curiosity, how many nodes in this partition have more than 1 neighbour?

In [28]:
# Number of nodes that keep more than one neighbour inside this partition.
count = sum(1 for neighbours in new_nodes if len(neighbours) > 1)
count

48383

We can compute now the MAP score:

In [29]:
# Normalise by the nodes that remain after removing the outliers (len(a))
# and the singletons (b) from the total.
score / (len(x) - len(a) - b)

0.03313933707627853