In [98]:
import pandas as pd
import numpy as np
import cveig
import networkx as nx
from scipy.sparse import dok_array

In [99]:
# Load the e-mail edge list: space-separated pairs, column names supplied here
# because the first line of the file is read as data rather than as a header.
emails = pd.read_csv(
    'data/email-Eu-core.txt',
    sep=' ',
    header=None,
    names=['source', 'target'],
)

In [100]:
# Preview the raw edge list (long frames are shown truncated).
emails

Unnamed: 0,source,target
0,0,1
1,2,3
2,2,4
3,5,6
4,5,7
...,...,...
25566,420,143
25567,174,859
25568,440,460
25569,52,786


In [101]:
# Load the node -> department label table: space-separated pairs, column names
# supplied here because the first line of the file is read as data.
departments = pd.read_csv(
    'data/email-Eu-core-department-labels.txt',
    sep=' ',
    header=None,
    names=['node', 'department'],
)

In [102]:
# Preview the node -> department label table.
departments

Unnamed: 0,node,department
0,0,1
1,1,1
2,2,21
3,3,21
4,4,21
...,...,...
1000,1000,4
1001,1001,21
1002,1002,1
1003,1003,6


In [103]:
# Number of labelled nodes in each department (Series indexed by department id).
dept_sizes = departments.groupby(by='department').size()

In [104]:
# Departments with fewer than 10 members; these are excluded from the analysis below.
small_depts = dept_sizes.loc[dept_sizes.lt(10)]

In [105]:
# Rows of the label table whose department is one of the small departments.
nodes_to_ignore = departments.loc[
    departments['department'].isin(small_depts.index)
]

In [106]:
# Inspect the nodes that belong to the small departments.
nodes_to_ignore

Unnamed: 0,node,department
5,5,25
6,6,25
13,13,26
64,64,25
80,80,29
...,...,...
941,941,41
966,966,29
971,971,32
980,980,12


In [107]:
# Keep only the e-mails where neither the sender nor the recipient is an ignored node.
emails_filtered = emails.loc[
    ~(
        emails['source'].isin(nodes_to_ignore['node'])
        | emails['target'].isin(nodes_to_ignore['node'])
    )
]

In [108]:
# (rows, columns) of the edge list after filtering.
emails_filtered.shape

(22028, 2)

In [109]:
# Directed graph whose edges are the (source, target) pairs of the filtered edge list.
email_graph = nx.from_pandas_edgelist(
    emails_filtered,
    source='source',
    target='target',
    create_using=nx.DiGraph,
)

In [110]:
# Dense adjacency matrix of the directed graph; rows and columns follow the
# graph's node order.
# `nx.to_numpy_matrix` was deprecated and then removed in NetworkX 3.0, so this
# uses its replacement `nx.to_numpy_array`, which returns a plain ndarray with
# the same shape and entries.
adjacency_matrix = nx.to_numpy_array(email_graph)

In [111]:
# Sanity check: the adjacency matrix is square, one row/column per graph node.
adjacency_matrix.shape

(929, 929)

In [112]:
# Number of departments that survive the size filter: every department minus
# the small ones that were dropped. Presumably this is the reference value the
# estimates computed later are compared against.
# Derived from the data rather than hard-coded (the notebook originally used
# the literal 28) so it stays in sync if the size threshold changes.
ground_truth = len(dept_sizes) - len(small_depts)

In [113]:
# Department ids that were classed as small (fewer than 10 members).
small_depts.keys()

Int64Index([12, 18, 24, 25, 26, 28, 29, 30, 31, 32, 33, 39, 40, 41], dtype='int64', name='department')

In [117]:
# Convert the dense adjacency matrix to an int32 SciPy DOK sparse array; this
# is the input passed to every cveig call below.
sparse_matrix = dok_array(adjacency_matrix.astype(np.int32))
# Four cveig routines run on the same graph. Each returned a single integer
# when this notebook was executed — presumably an estimate of the number of
# communities; confirm against the cveig documentation.
# NOTE(review): 35 and 0.5 are unexplained literals (35 looks like an upper
# bound on the number of communities, 0.5 like a split/threshold parameter —
# confirm) and would be clearer as named constants.
# NOTE(review): no random seed is set in this notebook; if these routines are
# randomized, results will differ between runs — confirm and seed if so.
nb = cveig.non_backtracking(sparse_matrix, 35)
bh = cveig.bethe_hessian(sparse_matrix)
cv = cveig.eig_cv(sparse_matrix, 35, 0.5)
cv_mod = cveig.eig_cv_mod(sparse_matrix, 35, 0.5)

In [118]:
# Results in order: non_backtracking, bethe_hessian, eig_cv, eig_cv_mod.
print(nb, bh, cv, cv_mod, sep=' ')

6 17 6 1


In [121]:
# Re-run eig_cv with different arguments (50, 0.05, 0.01, folds=10 instead of
# 35, 0.5).
# NOTE: this rebinds `cv`, replacing the value computed by the earlier eig_cv
# call, so the value of `cv` depends on which cell ran last.
# NOTE(review): the meaning of the positional arguments is not visible here —
# confirm against the cveig documentation.
cv = cveig.eig_cv(sparse_matrix, 50, 0.05, 0.01, folds=10)

In [123]:
# Show the result of the most recent eig_cv call.
cv

5