In [1]:
import networkx as nx
import numpy as np
import pandas as pd
from sklearn import linear_model, metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import time
import umap
# Silence perf warning
import warnings
from numba.errors import NumbaPerformanceWarning
warnings.filterwarnings("ignore", category=NumbaPerformanceWarning)
warnings.filterwarnings("ignore")

import graph2vec
import csrgraph

In [2]:
def make_blogcatalog(edgelist='./data/edges_blogcatalog.csv',
                    labels='./data/group_edges_blogcatalog.csv'):
    """
    Graph with cluster labels from blogcatalog
    """
    G = nx.read_edgelist(edgelist, delimiter=',')
    labels = pd.read_csv(labels, header=None)
    labels.columns = ['node', 'label']
    labels = labels.sort_values(by='node').reset_index(drop=True)
    return G, labels

def to_X(node_labels, embedder):
    """
    Takes a series of node names and returns matrix of embeddings
    """
    X = pd.DataFrame.from_records(
        node_labels.astype(str).apply(embedder.predict).values)
    return X

G, labels = make_blogcatalog()
y = labels.label

# pick up n_clusters automatically
n_clusters = labels.label.nunique()

In [3]:
X_train, X_test, y_train, y_test = train_test_split(
    labels.node, labels.label, test_size=0.10, 
    random_state=33)

In [4]:
for WALKLEN in [80]: # l in paper
 for EPOCH in [10]: # r in paper
  for N_WEIGHT in [1.]:
   for R_WEIGHT in [1.]:
    for WINDOW in [10]: # k in paper
     for EMBED_SIZE in [128]: # d in paper
      for NS_EXP in [0.75]: # default, not in paper
       for NEGATIVE in [5]: # default, not in paper
        start_t = time.time()
        embedder = graph2vec.Node2Vec(
            walklen=WALKLEN,
            epochs=EPOCH,
            return_weight=R_WEIGHT,
            neighbor_weight=N_WEIGHT,
            w2vparams={'window': WINDOW,
                       'size': EMBED_SIZE, 
                       'negative': NEGATIVE, 
                       'iter': 5,
                       'ns_exponent': NS_EXP,
                       'batch_words': 128}
        )
        embedder.fit(G)
        print(f"Fit Embedder: {time.time() - start_t:.2f}")
        logit = linear_model.LogisticRegressionCV(cv=10, scoring='f1_macro',
                                                  max_iter=3000,
                                                  solver='lbfgs',
                                                  multi_class='ovr')
        X_full = to_X(labels.node, embedder=embedder)
        scaler = StandardScaler().fit(X_full)
        logit.fit(scaler.transform(to_X(X_train, embedder=embedder)), y_train)
        score = logit.scores_[1].mean(axis=0).max()
        print(f"Trained: {time.time() - start_t:.2f}")
        print(f'best CV score: {score :.4f}')
        test_score = metrics.f1_score(
            y_true=y_test,
            y_pred=logit.predict(scaler.transform(to_X(X_test, embedder=embedder))),
            average='macro'
        )
        print(f"test score: {test_score :.4f}")

Making walks... Done, T=3.71
Mapping Walk Names... Done, T=4.43
Training W2V... Done, T=83.14
Fit Embedder: 92.23




Trained: 277.58
best CV score: 0.4971
test score: 0.1654


In [None]:
for N_NEIGHBORS in [5, 15]:
 for MIN_DIST in [0.001, 0.01]:
  for EMBED_SIZE in [64, 128]:
    start_t = time.time()
    embedder = graph2vec.SKLearnEmbedder(
        umap.UMAP,
        n_neighbors=N_NEIGHBORS,
        min_dist=MIN_DIST,
        metric='euclidean',
        n_components=EMBED_SIZE,
    )
    embedder.fit(G)
    print(f"Fit Embedder: {time.time() - start_t:.2f}")
    logit = linear_model.LogisticRegressionCV(cv=10, scoring='f1_macro',
                                              max_iter=3000,
                                              solver='lbfgs',
                                              multi_class='ovr')
    X_full = to_X(labels.node, embedder=embedder)
    scaler = StandardScaler().fit(X_full)
    logit.fit(scaler.transform(to_X(X_train, embedder=embedder)), y_train)
    score = logit.scores_[1].mean(axis=0).max()
    print(f"Trained: {time.time() - start_t:.2f}")
    print(f'best CV score: {score :.4f}')
    test_score = metrics.f1_score(
        y_true=y_test,
        y_pred=logit.predict(scaler.transform(to_X(X_test, embedder=embedder))),
        average='macro'
    )
    print(f"test score: {test_score :.4f}")
    print(f"Neighbors: {N_NEIGHBORS}, dist: {MIN_DIST}, embed: {EMBED_SIZE}"
          "\n------------\n\n")

The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.
[1m
File "../../../anaconda3/lib/python3.7/site-packages/umap/utils.py", line 409:[0m
[1m@numba.njit(parallel=True)
[1mdef build_candidates(current_graph, n_vertices, n_neighbors, max_candidates, rng_state):
[0m[1m^[0m[0m
[0m[0m
  current_graph, n_vertices, n_neighbors, max_candidates, rng_state
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.
[1m
File "../../../anaconda3/lib/python3.7/site-packages/umap/sparse.py", line 176:[0m
[1m    @numba.njit(parallel=True)
[1m    def nn_descent(
[0m    [1m^[0m[0m
[0m
  state.func_ir.

Fit Embedder: 137.29




Trained: 177.28
best CV score: 0.4971
test score: 0.0338
Neighbors: 5, dist: 0.001, embed: 16
------------




The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.
[1m
File "../../../anaconda3/lib/python3.7/site-packages/umap/sparse.py", line 176:[0m
[1m    @numba.njit(parallel=True)
[1m    def nn_descent(
[0m    [1m^[0m[0m
[0m
  state.func_ir.loc))
  n_components


Fit Embedder: 128.48


