In [1]:
import io
import gzip
import networkx as nx
import numpy as np
import pandas as pd
import requests
from sklearn import cluster, manifold, linear_model, metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import time
import umap
# Silence perf warning
import warnings

import nodevectors as graph2vec
import csrgraph

warnings.simplefilter("ignore")

In [2]:
def make_blogcatalog(edgelist='./data/edges_blogcatalog.csv',
                    labels='./data/group_edges_blogcatalog.csv',
                    dedupe=True):
    """
    Graph with cluster labels from blogcatalog
    
    Dedupe: Whether to deduplicate results (else some nodes have multilabels)
    """
    G = nx.read_edgelist(edgelist, delimiter=',')
    labels = pd.read_csv(labels, header=None)
    labels.columns = ['node', 'label']
    labels = labels.sort_values(by='node').reset_index(drop=True)
    if dedupe:
        labels = labels.loc[~labels.node.duplicated()
                      ].reset_index(drop=True)
    labels.node = labels.node.astype(int)
    labels.label = labels.label.astype(int)
    return G, labels

def make_snap():
    """
    Graph from university emails, clustered by departments
    Data from http://snap.stanford.edu/data/email-Eu-core.html
    Edge list Format
    """
    res = requests.get('http://snap.stanford.edu/data/email-Eu-core.txt.gz', verify=False)
    edges = gzip.GzipFile(fileobj=io.BytesIO(res.content))
    edges = pd.read_csv(io.StringIO(edges.read().decode()), header=None, sep=' ')
    edges.columns = ['src', 'dest']
    # cluster labels per node
    res = requests.get('http://snap.stanford.edu/data/email-Eu-core-department-labels.txt.gz', verify=False)
    labels = gzip.GzipFile(fileobj=io.BytesIO(res.content))
    labels = pd.read_csv(io.StringIO(labels.read().decode()), header=None, sep=' ')
    labels.columns = ['node', 'cluster']
    G = nx.Graph()
    G.add_edges_from([(t.src, t.dest) for t in edges.itertuples()])
    return G, pd.DataFrame({'node': list(G), 'label': labels.cluster})

def cluster_graph(n_nodes, n_clusters, connections=1, drop_pct=0.1):
    """
    Makes distinct complete subgraphs
        connected by random paths
        
    n_nodes (int): number of nodes
    n_clusters (int): number of clusters
        This is also the number of disjoint subgraphs
    connections (int): number of random connections 
        These join the disjoint subgraphs
    """
    div = int(n_nodes / n_clusters)
    subgraph_sizes = [div] * n_clusters
    # last cluster has remainder nodes
    subgraph_sizes[-1] = subgraph_sizes[-1] + (n_nodes % n_clusters)
    # Make G from disjoint subgraphs
    G = nx.complete_graph(subgraph_sizes[0])
    for i in range(1, len(subgraph_sizes)):
        G = nx.disjoint_union(G, nx.complete_graph(subgraph_sizes[i]))
    # connecting paths
    for i in range(connections):
        while True:
            c1, c2 = np.random.randint(n_nodes, size=2)
            if G.has_edge(c1, c2):
                continue
            G.add_edge(c1, c2)
            break
    # Drop random edges
    n_edges = len(G.edges)
    to_remove=random.sample(G.edges(),
                            k=int(n_edges * drop_pct))
    G.remove_edges_from(to_remove)
    # Generate labels
    labels = []
    for i in range(len(subgraph_sizes)):
        labels.append([i] * subgraph_sizes[i])
    labels = sum(labels, [])
    assert len(labels) == n_nodes, f"{labels}"
    assert len(set(labels)) == n_clusters, f"{labels}"
    return G, pd.DataFrame({'node': list(G), 'label': pd.Series(labels)})

def evalClusteringOnLabels(clusters, groupLabels, verbose=True):
    results = []
    results.append(metrics.adjusted_mutual_info_score(clusters, groupLabels))
    results.append(metrics.adjusted_rand_score(clusters, groupLabels))
    results.append(metrics.fowlkes_mallows_score(clusters, groupLabels))
    if verbose:
        print("adj. MI score:   {0:.2f}".format(results[0]))
        print("adj. RAND score: {0:.2f}".format(results[1]))
        print("F-M score:       {0:.2f}".format(results[2]))
    return np.array(results)

def to_X(node_labels, embedder):
    """
    Takes a series of node names and returns matrix of embeddings
    """
    X = pd.DataFrame.from_records(
        node_labels.astype(type(list(G)[0])).apply(embedder.predict).values)
    return X

In [3]:
# G, labels = make_blogcatalog(dedupe=True)
# G, labels = cluster_graph(n_nodes=6000, n_clusters=75, connections=3000, drop_pct=0.80)
G, labels = make_snap()

y = labels.label
n_clusters = y.nunique()

# Gridsearch result table
res = pd.DataFrame(columns=['method', 'params', 'traintime', 
                            'F1', 'F1_test', 'MI', 'RAND', 'F-M'])

X_train, X_test, y_train, y_test = train_test_split(
    labels.node, labels.label, test_size=0.10, 
    random_state=33)

TODO: refactor analysis into functions

add link prediction

In [None]:
for TOL in [0.00001]:
 for LEARNING_RATE in [0.1]:
  for EMBED_SIZE in [128, 256]:
    start_t = time.time()
    embedder = graph2vec.Glove(
        n_components=EMBED_SIZE,
        tol=TOL,
        max_epoch=100_000,
        learning_rate=LEARNING_RATE, 
        max_loss=10.,
    )
    embedder.fit(G)
    train_t = time.time()
    print(f"Fit Embedder: {time.time() - start_t:.2f}")
    logit = linear_model.LogisticRegressionCV(cv=5, scoring='f1_macro',
                                              max_iter=3000,
                                              solver='lbfgs',
                                              multi_class='ovr')
    X_full = to_X(labels.node, embedder=embedder)
    scaler = StandardScaler().fit(X_full)
    logit.fit(scaler.transform(to_X(X_train, embedder=embedder)), y_train)
    score = logit.scores_[1].mean(axis=0).max()
    print(f"Trained: {time.time() - start_t:.2f}")
    print(f"TOL: {TOL}, LEARNING_RATE: {LEARNING_RATE}, embed: {EMBED_SIZE}"
      "\n------------")
    print(f'best CV score: {score :.4f}')
    test_score = metrics.f1_score(
        y_true=y_test,
        y_pred=logit.predict(scaler.transform(to_X(X_test, embedder=embedder))),
        average='macro'
    )
    print(f"test score: {test_score :.4f}")
    
    umpagglo = cluster.AgglomerativeClustering(
        n_clusters=n_clusters, 
        affinity='cosine', 
        linkage='average'
    ).fit(X_full).labels_

    x = evalClusteringOnLabels(umpagglo, labels.label)
    print("-------------------\n\n")

    res = res.append({
        'method': 'GLoVe', 
        'params':{
            'TOL': TOL,
            'LEARNING_RATE': LEARNING_RATE,
        }, 
        'traintime': train_t - start_t, 
        'F1': score, 
        'F1_test': test_score,
        'MI':x[0],
        'RAND':x[1], 
        'F-M':x[2]}, 
        ignore_index=True)

Fit Embedder: 1650.74
Trained: 1655.37
TOL: 1e-05, LEARNING_RATE: 0.1, embed: 16
------------
best CV score: 0.4822
test score: 0.0062
adj. MI score:   0.00
adj. RAND score: -0.00
F-M score:       0.14
-------------------




In [5]:
for MIN_DIST in [0.1, 0.01]:
 for N_NEIGHBORS in [15, 5]:
  for EMBED_SIZE in [64, 128]:
    start_t = time.time()
    embedder = graph2vec.SKLearnEmbedder(
        umap.UMAP,
        n_neighbors=N_NEIGHBORS,
        min_dist=MIN_DIST,
        metric='euclidean',
        n_components=EMBED_SIZE,
    )
    embedder.fit(G)
    train_t = time.time()
    print(f"Fit Embedder: {time.time() - start_t:.2f}")
    logit = linear_model.LogisticRegressionCV(cv=5, scoring='f1_macro',
                                              max_iter=3000,
                                              solver='lbfgs',
                                              multi_class='ovr')
    X_full = to_X(labels.node, embedder=embedder)
    scaler = StandardScaler().fit(X_full)
    logit.fit(scaler.transform(to_X(X_train, embedder=embedder)), y_train)
    score = logit.scores_[1].mean(axis=0).max()
    print(f"Trained: {time.time() - start_t:.2f}")
    print(f"Neighbors: {N_NEIGHBORS}, dist: {MIN_DIST}, embed: {EMBED_SIZE}"
      "\n------------")
    print(f'best CV score: {score :.4f}')
    test_score = metrics.f1_score(
        y_true=y_test,
        y_pred=logit.predict(scaler.transform(to_X(X_test, embedder=embedder))),
        average='macro'
    )
    print(f"test score: {test_score :.4f}")
    
    umpagglo = cluster.AgglomerativeClustering(
        n_clusters=n_clusters, 
        affinity='cosine', 
        linkage='average'
    ).fit(X_full).labels_

    x = evalClusteringOnLabels(umpagglo, labels.label)
    print("-------------------\n\n")

    res = res.append({
        'method': 'UMAP', 
        'params':{
            'n_neigbors': N_NEIGHBORS,
            'min_dist': MIN_DIST,
            'embed_size': EMBED_SIZE,
        }, 
        'traintime': train_t - start_t, 
        'F1': score, 
        'F1_test': test_score,
        'MI':x[0], 
        'RAND':x[1], 
        'F-M':x[2]}, 
        ignore_index=True)

Fit Embedder: 11.23
Trained: 105.63
Neighbors: 15, dist: 0.1, embed: 64
------------
best CV score: 0.8766
test score: 0.5021
adj. MI score:   0.66
adj. RAND score: 0.56
F-M score:       0.58
-------------------


Fit Embedder: 7.63
Trained: 149.04
Neighbors: 15, dist: 0.1, embed: 128
------------
best CV score: 0.8846
test score: 0.5056
adj. MI score:   0.66
adj. RAND score: 0.55
F-M score:       0.58
-------------------


Fit Embedder: 3.06
Trained: 106.41
Neighbors: 5, dist: 0.1, embed: 64
------------
best CV score: 0.8919
test score: 0.5038
adj. MI score:   0.65
adj. RAND score: 0.57
F-M score:       0.59
-------------------


Fit Embedder: 4.64
Trained: 143.88
Neighbors: 5, dist: 0.1, embed: 128
------------
best CV score: 0.8937
test score: 0.4210
adj. MI score:   0.65
adj. RAND score: 0.53
F-M score:       0.55
-------------------


Fit Embedder: 5.11
Trained: 98.08
Neighbors: 15, dist: 0.01, embed: 64
------------
best CV score: 0.8734
test score: 0.4352
adj. MI score:   0.65


In [9]:
for WALKLEN in [80]: # l in paper
 for EPOCH in [10]: # r in paper
  for N_WEIGHT in [0.3, 1., 3.]:
   for R_WEIGHT in [0.3, 1., 3.]:
    for WINDOW in [10]: # k in paper
     for EMBED_SIZE in [128]: # d in paper
      for NS_EXP in [0.75]: # default, not in paper
       for NEGATIVE in [5]: # default, not in paper
        start_t = time.time()
        embedder = graph2vec.Node2Vec(
            walklen=WALKLEN,
            epochs=EPOCH,
            return_weight=R_WEIGHT,
            neighbor_weight=N_WEIGHT,
            n_components=EMBED_SIZE,
            w2vparams={'window': WINDOW,
                       'negative': NEGATIVE, 
                       'iter': 5,
                       'ns_exponent': NS_EXP,
                       'batch_words': 128}
        )
        embedder.fit(G)
        train_t = time.time()
        print(f"Fit Embedder: {time.time() - start_t:.2f}")
        logit = linear_model.LogisticRegressionCV(cv=5, scoring='f1_macro',
                                                  max_iter=3000,
                                                  solver='lbfgs',
                                                  multi_class='ovr')
        X_full = to_X(labels.node, embedder=embedder)
        scaler = StandardScaler().fit(X_full)
        logit.fit(scaler.transform(to_X(X_train, embedder=embedder)), y_train)
        score = logit.scores_[1].mean(axis=0).max()
        print(f"Trained: {time.time() - start_t:.2f}")
        print(f'best CV score: {score :.4f}')
        test_score = metrics.f1_score(
            y_true=y_test,
            y_pred=logit.predict(scaler.transform(to_X(X_test, embedder=embedder))),
            average='macro'
        )
        print(f"test score: {test_score :.4f}")
        
        umpagglo = cluster.AgglomerativeClustering(
            n_clusters=n_clusters, 
            affinity='cosine', 
            linkage='average'
        ).fit(X_full).labels_

        x = evalClusteringOnLabels(umpagglo, labels.label)
        print("-------------------\n\n")

        res = res.append({
            'method': 'Node2Vec', 
            'params':{
                'walklen': WALKLEN,
                'epochs': EPOCH,
                'return_weight': R_WEIGHT,
                'neighbor_weight': N_WEIGHT,
                'window': WINDOW,
                'size': EMBED_SIZE, 
                'negative': NEGATIVE, 
                'iter': EPOCH,
                'ns_exponent': NS_EXP,
                'batch_words': 128,
            }, 
            'traintime': train_t - start_t, 
            'F1': score, 
            'F1_test': test_score,
            'MI':x[0], 
            'RAND':x[1], 
            'F-M':x[2]}, 
            ignore_index=True)

Making walks... Done, T=31.57
Mapping Walk Names... Done, T=0.95
Training W2V... Done, T=5.57
Fit Embedder: 38.27
Trained: 88.66
best CV score: 0.8746
test score: 0.5313
adj. MI score:   0.63
adj. RAND score: 0.51
F-M score:       0.54
-------------------


Making walks... Done, T=30.92
Mapping Walk Names... Done, T=0.92
Training W2V... Done, T=5.45
Fit Embedder: 37.32
Trained: 90.13
best CV score: 0.8477
test score: 0.4928
adj. MI score:   0.63
adj. RAND score: 0.53
F-M score:       0.55
-------------------


Making walks... Done, T=30.90
Mapping Walk Names... Done, T=0.88
Training W2V... Done, T=5.48
Fit Embedder: 37.38
Trained: 86.50
best CV score: 0.8523
test score: 0.4640
adj. MI score:   0.64
adj. RAND score: 0.55
F-M score:       0.57
-------------------


Making walks... Done, T=32.91
Mapping Walk Names... Done, T=0.90
Training W2V... Done, T=5.56
Fit Embedder: 39.42
Trained: 101.89
best CV score: 0.8551
test score: 0.5693
adj. MI score:   0.65
adj. RAND score: 0.55
F-M score: 

In [10]:
res.to_csv('umap_')

Unnamed: 0,method,params,traintime,F1,F1_test,MI,RAND,F-M
0,UMAP,"{'n_neigbors': 15, 'min_dist': 0.1, 'embed_siz...",11.227792,0.876551,0.502088,0.659598,0.563595,0.58405
1,UMAP,"{'n_neigbors': 15, 'min_dist': 0.1, 'embed_siz...",7.625471,0.884603,0.505623,0.660412,0.554303,0.575888
2,UMAP,"{'n_neigbors': 5, 'min_dist': 0.1, 'embed_size...",3.058904,0.891856,0.503792,0.64976,0.56857,0.588506
3,UMAP,"{'n_neigbors': 5, 'min_dist': 0.1, 'embed_size...",4.643819,0.89372,0.421005,0.649087,0.531919,0.554372
4,UMAP,"{'n_neigbors': 15, 'min_dist': 0.01, 'embed_si...",5.106231,0.873426,0.435216,0.649726,0.579084,0.598494
5,UMAP,"{'n_neigbors': 15, 'min_dist': 0.01, 'embed_si...",7.677146,0.878013,0.516135,0.658134,0.57922,0.598613
6,UMAP,"{'n_neigbors': 5, 'min_dist': 0.01, 'embed_siz...",3.071304,0.897846,0.440691,0.643065,0.576615,0.596205
7,UMAP,"{'n_neigbors': 5, 'min_dist': 0.01, 'embed_siz...",4.586399,0.874679,0.547567,0.652783,0.560834,0.581127
8,Node2Vec,"{'walklen': 80, 'epochs': 10, 'return_weight':...",38.265318,,,0.629246,0.51386,0.5448
9,Node2Vec,"{'walklen': 80, 'epochs': 10, 'return_weight':...",37.319187,,,0.631756,0.526646,0.553741
