In [2]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import linecache
from pathlib import Path
from tqdm import tqdm
import sys

# Put the parent of the current working directory on the import path
# (presumably so the project-local `utils` package used by the following
# cell can be imported -- confirm against the repository layout).
p = Path('.').resolve()
# Guard the append: notebook cells get re-run, and an unconditional append
# would add the same entry to sys.path on every execution.
if str(p.parent) not in sys.path:
    sys.path.append(str(p.parent))

In [3]:
from utils.faiss_utils import *
from utils.data_utils import *

Loading faiss with AVX2 support.


In [17]:
import faiss


def load_XY(basename):
    """
    Load embeddings (X) and possibly the
    labels (Y) of the graph {basename}.

    Parameters
    ----------
    basename : str
        Name of the model directory under ``/data/models``.

    Returns
    -------
    tuple
        The ``(X, Y)`` pair exactly as produced by ``load_data``;
        the labels are passed through untouched.

    Notes
    -----
    Prints a progress message and the shape of ``X``.
    """
    model_path = Path("/data/models") / basename
    print("Loading data..")
    X, Y = load_data(model_path)
    print("X shape: {}".format(X.shape))
    return X, Y


def centroid_neigh(basename, k_means, X, n=15, graph_basename="itwiki-2013"):
    """
    Find the n-nearest neighbours to k-means
    cluster centroids.

    Parameters
    ----------
    basename : str
        Model name passed to ``get_entities_list`` to obtain the
        entity ids that correspond to the rows of ``X``.
    k_means : object
        Fitted clustering object; only its ``centroids`` attribute is read.
    X : array
        Embedding matrix; ``X.shape[1]`` is used as the index dimension.
    n : int, optional
        Number of neighbours retrieved for every centroid.
    graph_basename : str, optional
        Name of the graph whose ``.ids`` file is used to print the
        neighbours. It is separate from ``basename`` because the model
        directory and the graph directory can be named differently
        (e.g. ``"itwiki-2013_partitioned"`` vs ``"itwiki-2013"``).
        Defaults to ``"itwiki-2013"``, the value that was previously
        hard-coded.

    Returns
    -------
    None
        Results are printed by ``find_neighbours``.
    """
    d = X.shape[1]
    index = faiss.IndexFlatL2(d)
    index.add(X)
    # Only the neighbour indices are needed; the distances are discarded.
    _, I = index.search(k_means.centroids, n)
    entities = get_entities_list(basename)
    print(entities[:10])
    find_neighbours(graph_basename, I, entities)


def find_neighbours(basename, idx, entities):
    """
    Helper function for centroid_neigh.

    For every cluster (row of ``idx``) print a coloured header followed by
    one entry per neighbour: the line of the graph's ``.ids`` file that
    belongs to the neighbour's entity id, and the entity id itself.

    Parameters
    ----------
    basename : str
        Graph name; the file read is
        ``/data/graphs/{basename}/{basename}.ids``.
    idx : iterable of iterables of int
        For each cluster, positions into ``entities``.
    entities : sequence
        Entity ids, as integers or as numeric strings.

    Returns
    -------
    None
    """
    urls_file = Path('/data/graphs/') / basename / (basename + '.ids')
    f = urls_file.as_posix()
    for pos, cluster in enumerate(idx):
        print("\x1b[0;35;43m Cluster {} \x1b[0m".format(pos))
        for node in cluster:
            entity = entities[node]
            # Ids may arrive as strings (e.g. loaded straight from JSON), so
            # cast before doing arithmetic. linecache line numbers are
            # 1-based, hence entity id k lives on line k + 1.
            print(linecache.getline(f, int(entity) + 1), entity)

In [11]:
# Name of the model directory; it is joined onto /data/models further down.
basename = "itwiki-2013_partitioned"
# File read through linecache: line (entity id + 1) is printed for each entity.
f = "/data/graphs/itwiki-2013/itwiki-2013.ids"

In [12]:
# Load the entity ids and the embeddings of the model, then print the
# entries of the .ids file for the 20 nearest neighbours of the first
# embedding as a quick sanity check.
model_path = Path("/data/models") / basename
with (model_path / "entity_names_link_0.json").open() as tf:
    entities_list = json.load(tf)
hf_path = list(model_path.glob("embeddings_link_0*.h5"))[0]
# Open read-only and close the handle as soon as the data is in memory:
# `[:]` copies the whole dataset into an array, so `x` stays valid after
# the file is closed.
with h5py.File(hf_path, "r") as hf:
    x = hf["embeddings"][:]
idx = train_search(x)
# Query with the first embedding, reshaped to a single-row 2-D array.
_, I = idx.search(x[0].reshape(1, -1), 20)
for i in I.flatten():
    # Ids are stored as strings in the JSON file; linecache is 1-based.
    line = int(entities_list[i]) + 1
    print(linecache.getline(f, line))

Index trained: True
Index total: 101618
Platinum Dunes

Generazione perfetta

La figlia del mio capo

Movie 43

Big Mama - Tale padre, tale figlio

The Roommate - Il terrore ti dorme accanto

Il gatto... e il cappello matto

Gigolò per sbaglio

After.Life

David Twohy

Ti presento Bill

The Boondock Saints 2 - Il giorno di Ognissanti

Chestnut - Un eroe a quattro zampe

Accerchiato

Best Men - Amici per la pelle

The Weinstein Company

MTV Movie Awards 2002

One for the Money

Cenerentola e gli 007 nani

Ho rapito Sinatra



In [13]:
# Cluster the embeddings with the project's `kmeans` helper.
# NOTE(review): 5 is presumably the number of clusters and `niter` the
# iteration count -- confirm against utils.faiss_utils.
itwiki_kmeans = kmeans(x, 5, niter=100)

In [14]:
# Peek at the first ten entity ids loaded from the JSON file.
entities_list[:10]

['375341',
 '664430',
 '728407',
 '16627',
 '716009',
 '807080',
 '1011428',
 '230978',
 '426862',
 '381918']

In [18]:
# Print, for every k-means centroid, its 10 nearest embeddings.
centroid_neigh("itwiki-2013_partitioned", itwiki_kmeans, x, n=10)

[375341, 664430, 728407, 16627, 716009, 807080, 1011428, 230978, 426862, 381918]
[0;35;43m Cluster 0 [0m
Park Sung-Wha
 313535
Viktor Vasin
 314577
Andrej Panavić
 646953
Bartosz Bosacki
 284388
Davy Schollen
 647082
Marc Schneider
 541678
Lee Tae-Ho
 313487
Jaroslav Šilhavý
 284273
Alex Morgan
 760733
César Ibáñez
 292466
[0;35;43m Cluster 1 [0m
Fort Wayne Pistons 1951-1952
 929683
XRR
 995801
Albatrellaceae
 921440
LUI
 956500
Circuito di Zeltweg
 992024
Nikon D2H
 1001695
Lungotevere degli Altoviti
 957874
Baciami adesso (Mietta)
 951364
Discografia dei Negramaro
 950478
Core 2 Quad
 1009350
[0;35;43m Cluster 2 [0m
Faglia di Cadillac-Larder Lake
 463714
Coal Bed Methane
 732666
Narcosi da azoto
 791955
Ripple mark
 135958
Picnoclino
 139135
Pompa petrolifera
 787977
Associazione Nazionale Istruttori Subacquei
 791988
Costa Hamakua
 649205
Haifa Chemicals
 264541
Nano (prefisso)
 740598
[0;35;43m Cluster 3 [0m
Surfin' Bird (singolo)
 830379
Scarecrow
 719885
Innocence
 48445
