# ClusterNE

In [None]:
import numpy as np
from scipy import sparse
from matplotlib import pyplot as plt

In [None]:
from sknetwork.data import load_netset
from sknetwork.clustering import Louvain, BiLouvain
from sknetwork.linalg import normalize
from sknetwork.utils import membership_matrix, bipartite2undirected
from sknetwork.ranking import PageRank
from sknetwork.clustering.postprocess import reindex_labels

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

## Data

In [None]:
# Load the 'wikivitals' dataset through scikit-network's load_netset.
graph = load_netset('wikivitals')

In [None]:
# Unpack the dataset fields used throughout the notebook.
adjacency = graph.adjacency  # graph passed to the clustering and PageRank calls below
names = graph.names  # indexed by node id when printing top-ranked nodes
labels = graph.labels  # per-node label, compared against label ids when picking samples
names_labels = graph.names_labels  # indexed by label id to print label names

## Embedding

In [None]:
def get_bilouvain_embedding(biadjacency, use_bilouvain=True, tol_aggregation=0.01, min_cluster_size=1):
    """Embed rows and columns of a bipartite graph by their weight on each cluster.

    Rows and columns are first clustered, either with ``BiLouvain`` applied to
    ``biadjacency`` or with ``Louvain`` applied to the graph returned by
    ``bipartite2undirected``.  Clusters containing too few columns are
    discarded (their members get label -1) and the remaining clusters are
    renumbered 0, 1, 2, ... consistently for rows and columns.

    Parameters
    ----------
    biadjacency :
        Biadjacency matrix (rows x columns) accepted by the scikit-network
        calls used here -- presumably a scipy sparse matrix; confirm against
        the caller.
    use_bilouvain : bool
        If True, cluster with ``BiLouvain``; otherwise cluster the undirected
        version of the graph with ``Louvain`` and split the labels into the
        first ``n_row`` entries (rows) and the rest (columns).
    tol_aggregation : float
        Forwarded unchanged to the clustering algorithm.
    min_cluster_size : int
        A cluster is kept only if it contains strictly more than this many
        columns.  The default of 1 therefore removes singleton clusters.

    Returns
    -------
    embedding_row :
        Dense array, ``normalize(biadjacency)`` times the membership matrix of
        the column labels.
    embedding_col :
        Dense array, ``normalize(biadjacency.T)`` times the membership matrix
        of the row labels.
    labels_col :
        Renumbered column labels, with -1 for columns of discarded clusters.
    """
    if use_bilouvain:
        bilouvain = BiLouvain(sort_clusters=False, tol_aggregation=tol_aggregation)
        labels_row = bilouvain.fit_transform(biadjacency)
        labels_col = bilouvain.labels_col_
    else:
        n_row, n_col = biadjacency.shape
        louvain = Louvain(sort_clusters=False, tol_aggregation=tol_aggregation)
        adjacency = bipartite2undirected(biadjacency)
        labels = louvain.fit_transform(adjacency)
        labels_row = labels[:n_row]
        labels_col = labels[n_row:]

    # Clusters to keep are decided from the column counts only.
    labels_unique, counts = np.unique(labels_col, return_counts=True)
    labels_old = labels_unique[counts > min_cluster_size]

    # One old-label -> new-label table shared by rows and columns.  It must
    # cover the largest label on either side: sizing it from the row labels
    # alone raises IndexError when a kept column cluster has a larger id than
    # every row label.
    n_labels = int(max(np.max(labels_row), np.max(labels_col))) + 1
    labels_new = -np.ones(n_labels, dtype='int')
    labels_new[labels_old] = np.arange(len(labels_old))
    labels_col = labels_new[labels_col]
    labels_row = labels_new[labels_row]

    # Embedding: normalized weights aggregated per cluster of the other side.
    # NOTE(review): how membership_matrix treats the -1 labels depends on the
    # installed scikit-network version -- confirm.
    probs = normalize(biadjacency)
    embedding_row = probs.dot(membership_matrix(labels_col)).toarray()
    probs = normalize(biadjacency.T)
    embedding_col = probs.dot(membership_matrix(labels_row)).toarray()
    return embedding_row, embedding_col, labels_col

In [None]:
# Embed the graph, using its adjacency matrix as the biadjacency matrix.
# labels_pred holds the column cluster labels returned by the function
# (-1 marks nodes whose cluster was discarded).
embedding_row, embedding_col, labels_pred = get_bilouvain_embedding(adjacency)

## Interpretation

In [None]:
# Names given to the predicted clusters; position k is the name of cluster k.
names_labels_pred = [
    'Biology',
    'World',
    'History',
    'Society',
    'Arts & Media',
    'Asia',
    'Mathematics',
    'Physics',
    'Geography',
    'Philosophy',
    'Ethnology',
]

In [None]:
# Convert to a NumPy array so it can be indexed with integer arrays
# (e.g. names_labels_pred[components] below).
names_labels_pred = np.array(names_labels_pred)

In [None]:
# PageRank instance with default parameters, reused by the cells below.
pr = PageRank()

In [None]:
# For every kept cluster (label >= 0), print its id, size and name, followed by
# its ten members with the highest PageRank score seeded on the cluster itself.
for label in np.unique(labels_pred):
    if label < 0:
        # -1 marks nodes of discarded clusters
        continue
    index = np.flatnonzero(labels_pred == label)
    seeds = dict.fromkeys(index, 1)
    scores = pr.fit_transform(adjacency, seeds=seeds)
    print(label, len(index), names_labels_pred[label])
    top = np.argsort(-scores[index])[:10]
    print(names[index[top]])

## Visualization

In [None]:
# Directions of the three corners of the triangle used for visualization.
angles = [4 * np.pi / 3, np.pi / 2, -np.pi / 3]
# 2 x 3 matrix: column k is the (x, y) position of corner k on the unit circle.
basis = np.vstack((np.cos(angles), np.sin(angles)))

In [None]:
def visualize_dots(samples_, components, filename=None):
    """Plot groups of nodes inside a triangle spanned by three embedding components.

    Each corner of the triangle stands for one entry of ``components`` and is
    labelled with the matching name from ``names_labels_pred``.  Every node is
    placed at ``basis.dot(embedding_row[node][components])``.

    Relies on the module-level ``basis``, ``embedding_row`` and
    ``names_labels_pred``.

    Parameters
    ----------
    samples_ :
        Sequence of at most three groups of node indices; group ``i`` is drawn
        with the colour of corner ``i``.
    components :
        Three embedding column indices, one per corner.
    filename : str, optional
        If given, the figure is saved to ``filename + '.pdf'`` before being shown.
    """
    COLORS = ['b', 'r', 'g']
    margin = 0.02
    plt.figure(figsize=(5, 5))
    plt.axis('off')

    # Row k of `corners` is the 2-D position of corner k.
    corners = basis.dot(np.eye(3)).T

    # Triangle outline: each edge is drawn exactly once.
    for i, j in ((0, 1), (1, 2), (2, 0)):
        plt.plot([corners[i][0], corners[j][0]], [corners[i][1], corners[j][1]], color='k')

    # Corner markers and their labels (labels are pushed slightly outwards).
    for i, component in enumerate(components):
        x, y = corners[i]
        plt.scatter(x, y, color=COLORS[i], s=200)
        x, y = corners[i] * 1.15
        plt.text(x - 3 * margin, y - margin, names_labels_pred[component], color='k', fontsize=16)

    # One scatter call per group instead of one per node: same picture, far
    # fewer matplotlib artists.
    for i, samples in enumerate(samples_):
        nodes = np.asarray(list(samples), dtype=int)
        if nodes.size == 0:
            continue
        points = embedding_row[nodes][:, components]
        x, y = basis.dot(points.T)
        plt.scatter(x, y, color=COLORS[i])

    if filename is not None:
        plt.savefig(filename + '.pdf', bbox_inches='tight', transparent=True)
    plt.show()

## Society

In [None]:
# Predicted clusters used as triangle corners; per the names_labels_pred list
# defined above these are 'Philosophy', 'History' and 'Arts & Media'.
components = np.array([9, 2, 4])
print(names_labels_pred[components])

In [None]:
# Ground-truth label ids whose nodes will be plotted; print their names as a check.
labels_ = np.array([4, 1, 3])
print(names_labels[labels_])

In [None]:
# For each selected ground-truth label, keep up to 100 of its nodes: those with
# the highest PageRank score when the walk is seeded on the label's own nodes.
samples_ = []
for label in labels_:
    index = np.flatnonzero(labels == label)
    seeds = dict.fromkeys(index, 1)
    scores = pr.fit_transform(adjacency, seeds=seeds)
    order = np.argsort(-scores[index])
    samples = index[order[:100]]
    samples_.append(samples)

In [None]:
# Plot the selected samples in the triangle spanned by the chosen components.
visualize_dots(samples_, components)

## Science

In [None]:
# Predicted clusters used as triangle corners; per the names_labels_pred list
# defined above these are 'Biology', 'Physics' and 'Mathematics'.
components = np.array([0, 7, 6])
print(names_labels_pred[components])

In [None]:
# Ground-truth label ids whose nodes will be plotted; print their names as a check.
labels_ = np.array([7, 8, 10])
print(names_labels[labels_])

In [None]:
# For each selected ground-truth label, keep up to 100 of its nodes: those with
# the highest PageRank score when the walk is seeded on the label's own nodes.
samples_ = []
for label in labels_:
    index = np.flatnonzero(labels == label)
    seeds = dict.fromkeys(index, 1)
    scores = pr.fit_transform(adjacency, seeds=seeds)
    order = np.argsort(-scores[index])
    samples = index[order[:100]]
    samples_.append(samples)

In [None]:
# Plot the selected samples in the triangle spanned by the chosen components.
visualize_dots(samples_, components)