In [3]:
import io
import gzip
import networkx as nx
import numpy as np
import os
import pandas as pd
import requests
import scipy as sc
from scipy import sparse
import scipy.io
from sklearn import cluster, manifold, linear_model, metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import sympy.sandbox.tests.test_indexed_integrals
import sys
import time
import umap
# Silence perf warning
import warnings

sys.path.append(os.path.realpath('..'))

import nodevectors as graph2vec
import csrgraph
from csrgraph import CSRGraph
import graph_eval

warnings.simplefilter("ignore")

In [4]:
OUT_FILE = 'weighed_edges.csv'

# G, labels = graph_eval.make_blogcatalog(dedupe=True)
# G, labels = graph_eval.make_cluster_graph(n_nodes=320, n_clusters=18, connections=150, drop_pct=0.2)
G, labels = graph_eval.make_weighed_cluster_graph(n_nodes=500, n_clusters=6, connections=1500, drop_pct=0.2, max_edge_weight=15)
# G, labels = graph_eval.make_snap()

y = labels.label
n_clusters = y.nunique()

X_train, X_test, y_train, y_test = train_test_split(
    labels.node, labels.label, test_size=0.10, 
    random_state=33)

In [5]:
for MAX_LOSS in [5.]:
 for LEARNING_RATE in [0.01]:
  for EMBED_SIZE in [1, 2, 4, 8, 16, 64, 128, 256]:
    try:
      embedder = graph2vec.Glove(
          n_components=EMBED_SIZE,
          tol="auto",
          tol_samples=50,
          max_epoch=5_000,
          learning_rate=LEARNING_RATE, 
          max_loss=MAX_LOSS,
          threads=0,
          verbose=True
      )
      res = graph_eval.evaluate_embedding(
          embedder, G, labels, n_clusters,
          X_train, X_test, y_train, y_test)
      print(res)
      res = pd.DataFrame([pd.Series(res)])
      if os.path.isfile(OUT_FILE):
        res.to_csv(OUT_FILE, mode='a', header=False, float_format='%.3f')
      else:
          res.to_csv(OUT_FILE, float_format='%.3f')
    except:
      continue

Loss: 0.1509	:   2%|▏         | 78/5000 [00:04<05:05, 16.13it/s]  


Converged! Losses : [0.15391595832898125, 0.15364663684769495, 0.15343936337337644, 0.15326127941205625, 0.1531446127784525, 0.15298920598347018, 0.1527203970289132, 0.15257500223890472, 0.15239553745830178, 0.15229825950217668, 0.15219854047512926, 0.15173250042860398, 0.15172220139566542, 0.15162469299280687, 0.15154228154843294, 0.1514490355766228, 0.15146159610103183, 0.15118833804765355, 0.15107605630551735, 0.1508620079283213]
{'method': 'Glove', 'traintime': 7.131664276123047, 'F1': 0.5551904193151465, 'F1_test': 0.09120339045483596, 'MI': 0.06545311026065181, 'RAND': 0.0136059013135021, 'F-M': 0.0934741913771054, 'params': {'learning_rate': 0.01, 'max_epoch': 5000, 'max_loss': 5.0, 'n_components': 16, 'threads': 0, 'tol': 'auto', 'tol_samples': 20, 'verbose': True}}


Loss: 0.1052	:   3%|▎         | 142/5000 [00:06<03:58, 20.37it/s]


Converged! Losses : [0.10694350106728524, 0.10701467207938574, 0.10678887420131183, 0.10685289805729727, 0.10695476756045105, 0.10655253283763852, 0.10660989421006974, 0.10672569554347186, 0.10599066767457122, 0.10596935946652708, 0.10578352914687005, 0.1058005224638035, 0.10566149283362569, 0.1054318709002933, 0.10525685227961559, 0.1054101452387799, 0.10533233637760549, 0.10540745642518531, 0.10517232854945723, 0.10515497496403746]
{'method': 'Glove', 'traintime': 9.485701084136963, 'F1': 0.6168379244679179, 'F1_test': 0.15055132302125013, 'MI': 0.08217034920717539, 'RAND': 0.025809705823553575, 'F-M': 0.09865787644111992, 'params': {'learning_rate': 0.01, 'max_epoch': 5000, 'max_loss': 5.0, 'n_components': 64, 'threads': 0, 'tol': 'auto', 'tol_samples': 20, 'verbose': True}}


Loss: 0.0660	:   4%|▍         | 208/5000 [00:21<08:06,  9.86it/s]


Converged! Losses : [0.06704955272499245, 0.066966796641209, 0.06698303970873072, 0.06680212792142846, 0.06697731394749334, 0.06687521351717843, 0.06675485145433893, 0.06655856418907036, 0.06654258789101845, 0.06658007900190722, 0.06637646250006243, 0.06642054713192493, 0.06623648682449566, 0.06617869792418651, 0.06615147377893746, 0.0662957745809613, 0.06630445250771033, 0.06605848191525214, 0.06597171414237035, 0.06595576974998951]
{'method': 'Glove', 'traintime': 23.27071189880371, 'F1': 0.6371087539077456, 'F1_test': 0.1708726878697015, 'MI': 0.08274997702185148, 'RAND': 0.03184838171826299, 'F-M': 0.10376544522609653, 'params': {'learning_rate': 0.01, 'max_epoch': 5000, 'max_loss': 5.0, 'n_components': 128, 'threads': 0, 'tol': 'auto', 'tol_samples': 20, 'verbose': True}}


Loss: 0.0428	:   4%|▍         | 214/5000 [00:49<18:23,  4.34it/s]

Converged! Losses : [0.043616562858115805, 0.04331158963628281, 0.043456713448479546, 0.043504712986878316, 0.04342055663647084, 0.04355026024845251, 0.043305297489409124, 0.04354248794051058, 0.043106554945595496, 0.04308461050955013, 0.04311939566592181, 0.04312586597974209, 0.043149600170322974, 0.043180159882454223, 0.04281075265781635, 0.04280992766915706, 0.04282458066761011, 0.042810681865745846, 0.04281489835059099, 0.04277660072916536]





{'method': 'Glove', 'traintime': 51.584237813949585, 'F1': 0.6613060055533676, 'F1_test': 0.1726232428655665, 'MI': 0.08091455034313026, 'RAND': 0.025202729222595952, 'F-M': 0.1002814195837408, 'params': {'learning_rate': 0.01, 'max_epoch': 5000, 'max_loss': 5.0, 'n_components': 256, 'threads': 0, 'tol': 'auto', 'tol_samples': 20, 'verbose': True}}


In [16]:
for MIN_DIST in [0.1, 0.01]:
 for N_NEIGHBORS in [15, 5]:
  for METRIC in ['euclidean', 'cosine']:
    for EMBED_SIZE in [1, 2, 4, 8, 16, 64, 128, 256]:
        embedder = graph2vec.SKLearnEmbedder(
            umap.UMAP,
            n_neighbors=N_NEIGHBORS,
            min_dist=MIN_DIST,
            metric='euclidean',
            n_components=EMBED_SIZE,
        )
        res = graph_eval.evaluate_embedding(
            embedder, G, labels, n_clusters,
            X_train, X_test, y_train, y_test)
        print(res)
        res = pd.DataFrame([pd.Series(res)])
        res.to_csv(OUT_FILE, mode='a', header=False)

{'method': 'SKLearnEmbedder', 'params': {'embedder__a': None, 'embedder__angular_rp_forest': False, 'embedder__b': None, 'embedder__init': 'spectral', 'embedder__learning_rate': 1.0, 'embedder__local_connectivity': 1.0, 'embedder__metric': 'euclidean', 'embedder__metric_kwds': None, 'embedder__min_dist': 0.1, 'embedder__n_components': 64, 'embedder__n_epochs': None, 'embedder__n_neighbors': 15, 'embedder__negative_sample_rate': 5, 'embedder__random_state': None, 'embedder__repulsion_strength': 1.0, 'embedder__set_op_mix_ratio': 1.0, 'embedder__spread': 1.0, 'embedder__target_metric': 'categorical', 'embedder__target_metric_kwds': None, 'embedder__target_n_neighbors': -1, 'embedder__target_weight': 0.5, 'embedder__transform_queue_size': 4.0, 'embedder__transform_seed': 42, 'embedder__verbose': False, 'embedder': UMAP(a=None, angular_rp_forest=False, b=None, init='spectral',
     learning_rate=1.0, local_connectivity=1.0, metric='euclidean',
     metric_kwds=None, min_dist=0.1, n_compone

KeyboardInterrupt: 

In [8]:
for WALKLEN in [20, 40]: # l in paper
 for EPOCH in [20]: # r in paper
  for N_WEIGHT in [0.3, 1., 3.]:
   for R_WEIGHT in [0.3, 1., 3.]:
    for WINDOW in [10]: # k in paper
     for EMBED_SIZE in [1, 2, 4, 8, 16, 64, 128, 256]: # d in paper
      for NS_EXP in [0.75]: # default, not in paper
       for NEGATIVE in [5]: # default, not in paper
        embedder = graph2vec.Node2Vec(
            walklen=WALKLEN,
            epochs=EPOCH,
            return_weight=R_WEIGHT,
            neighbor_weight=N_WEIGHT,
            n_components=EMBED_SIZE,
            w2vparams={'window': WINDOW,
                       'negative': NEGATIVE, 
                       'iter': 5,
                       'ns_exponent': NS_EXP,
                       'batch_words': 128}
        )
        res = graph_eval.evaluate_embedding(
            embedder, G, labels, n_clusters,
            X_train, X_test, y_train, y_test)
        res = pd.DataFrame([pd.Series(res)])
        res.to_csv(OUT_FILE, mode='a', header=False, float_format='%.3f')

Making walks...Done, T=3.93
Mapping Walk Names... Done, T=0.12
Training W2V...Done, T=0.42
{'method': 'Node2Vec', 'params': {'epochs': 10, 'keep_walks': False, 'n_components': None, 'neighbor_weight': 0.3, 'return_weight': 0.3, 'threads': 8, 'w2vparams': {'window': 10, 'negative': 5, 'iter': 5, 'ns_exponent': 0.75, 'batch_words': 128, 'workers': 8}, 'walklen': 80}, 'traintime': 4.47423791885376, 'F1': 1.0, 'F1_test': 1.0, 'MI': 1.0, 'RAND': 1.0, 'F-M': 1.0}
Making walks...Done, T=0.26
Mapping Walk Names... Done, T=0.13
Training W2V...Done, T=0.40
{'method': 'Node2Vec', 'params': {'epochs': 10, 'keep_walks': False, 'n_components': None, 'neighbor_weight': 0.3, 'return_weight': 1.0, 'threads': 8, 'w2vparams': {'window': 10, 'negative': 5, 'iter': 5, 'ns_exponent': 0.75, 'batch_words': 128, 'workers': 8}, 'walklen': 80}, 'traintime': 0.7869248390197754, 'F1': 1.0, 'F1_test': 1.0, 'MI': 1.0, 'RAND': 1.0, 'F-M': 1.0}
Making walks...Done, T=0.26
Mapping Walk Names... Done, T=0.11
Training W2