In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import networkx as nx
import json
import torch
from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score, f1_score
from tqdm.notebook import tqdm
import os

from train_ import train

Note: to be able to use all crisp methods, you need to install some additional packages:  {'graph_tool', 'wurlitzer', 'infomap', 'leidenalg', 'bayanpy'}
Note: to be able to use all crisp methods, you need to install some additional packages:  {'ASLPAw', 'pyclustering'}
Note: to be able to use all crisp methods, you need to install some additional packages:  {'wurlitzer', 'infomap', 'leidenalg'}


In [2]:
torch.cuda.is_available()

True

In [3]:
def load_datasets():
    ROOT = '../data/gml_graphs'
    EMPTY_FAMILY = ''

    paths = []
    for fname in sorted(os.listdir(ROOT)):
        path = f'{ROOT}/{fname}'
        if fname.endswith('.gml'):
            paths.append((EMPTY_FAMILY, path))
        elif os.path.isdir(path):  # this is a family
            family = fname
            paths_family = [(family, f'{path}/{fname}') for fname in sorted(os.listdir(path)) if fname.endswith('.gml')]
            paths.extend(paths_family)

    graphs = []
    for family, path in tqdm(paths):
        G = nx.read_gml(path)

        # reset node ids to 1, 2, 3, ...
        G = nx.convert_node_labels_to_integers(G, first_label=1, ordering="default", label_attribute=None)
        if 'gt' not in G.nodes[1]:
            # print('No gt attribute in the first node')
            continue

        # reset gt to according to original gt but with numerical values
        cluster_labels = {}
        numerical_label = 0
        # check if first node doesn't have the 'gt' attribute
        for node, data in G.nodes(data=True):
            gt_label = data['gt']
            if gt_label not in cluster_labels:
                cluster_labels[gt_label] = numerical_label
                numerical_label += 1

        # Assign numerical labels to nodes
        for node, data in G.nodes(data=True):
            gt_label = data['gt']
            numerical_label = cluster_labels[gt_label]
            data['gt'] = numerical_label
        
        attrs = nx.get_node_attributes(G, 'gt')
        print(path.split('/')[-1][:-4])
        graphs.append({'name':path.split('/')[-1][:-4], 'G': G, 'y': attrs, 'n_clusters': len(set(attrs)), 'path':path})
    return graphs

In [4]:
datasets = {}
with open('output/example/args.json') as f:
    args = json.load(f)
tests_df = pd.DataFrame(columns=['graph', 'y_true', 'y_result','nmi','args'])
datasets = load_datasets()

  0%|          | 0/59 [00:00<?, ?it/s]

as
cora
cora_full
Artificial_Intelligence
Artificial_Intelligence__Machine_Learning
Data_Structures__Algorithms_and_Theory
Databases
Encryption_and_Compression
Hardware_and_Architecture
Human_Computer_Interaction
Information_Retrieval
Networking
Operating_Systems
Programming
dolphins
eu-core
eurosis
football
karate
news_2cl_1
news_2cl_2
news_2cl_3
news_3cl_1
news_3cl_2
news_3cl_3
news_5cl_1
news_5cl_2
news_5cl_3
news_2cl1_0.1
news_2cl2_0.1
news_2cl3_0.1
news_3cl1_0.1
news_3cl2_0.1
news_3cl3_0.1
news_5cl1_0.1
news_5cl2_0.1
news_5cl3_0.1
polblogs
polbooks
sp_school_day_1
sp_school_day_2


Football dataset

In [70]:
# G = nx.read_gml('../data/football.gml')
# G = nx.convert_node_labels_to_integers(G, first_label=1, ordering="default", label_attribute=None)
# # Get the ground truth for communities
# y = [G.nodes[node]['value'] for node in G.nodes()]
# datasets['football'] = {'name':'football', 'G': G, 'y': y, 'n_clusters': len(np.unique(y))}

polbooks dataset

In [71]:
# G = nx.read_gml('../data/polbooks.gml')
# G = nx.convert_node_labels_to_integers(G, first_label=1, ordering="default", label_attribute=None)
# # Get the ground truth for communities
# y = [G.nodes[node]['value'] for node in G.nodes()]
# datasets['football'] = {'name':'polbooks', 'G': G, 'y': y, 'n_clusters': len(np.unique(y))}

dolphins dataset

In [None]:
# G = nx.read_gml('../data/dolphins.gml')
# G = nx.convert_node_labels_to_integers(G, first_label=1, ordering="default", label_attribute=None)
# # Get the ground truth for communities
# y = [G.nodes[node]['label'] for node in G.nodes()]
# datasets['dolphins'] = {'name':'dolphins', 'G': G, 'y': y, 'n_clusters': len(np.unique(y))}

karate dataset

In [26]:
# A_nx = nx.karate_club_graph()
# mapping = {node: node + 1 for node in A_nx.nodes()}
# A_nx = nx.relabel_nodes(A_nx, mapping)
# datasets['karate'] = {'name':'karate', 'G': A_nx, 'y': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'n_clusters': 2}

In [None]:
# # Nodes should start from number '1'
# # embedding_path = train(lrc=True, A_nx=A_nx, dictionary=args)

# embedding_path = 'output\example\embed_afterLSTM_50.txt'
# # Loading embeddings
# with open(embedding_path) as f:
#     embeddings = f.readlines()
#     embeddings = [list(map(float, x.strip().split()[1:])) for x in embeddings]

# # apply k-means clustering on embeddings
# kmeans = KMeans(n_clusters=2, random_state=0).fit(embeddings)
# # nmi = normalized_mutual_info_score(y_true, kmeans.labels_)
# print(kmeans.labels_)
# # print(nmi)

In [5]:
# template for testing a dataset (a graph after loading) and storing the result(appending) the nmi in a dataframe and saving it
def test(dataset, args, lrc=True):
    embedding_path = train(lrc=lrc, A_nx=dataset['G'], dictionary=args)
    # embedding_path = 'output\example\embed_afterLSTM_50.txt'
    with open(embedding_path) as f:
        embeddings = f.readlines()
        embeddings = [list(map(float, x.strip().split()[1:])) for x in embeddings]
    # Store the results of embedding to a file with its name
    embedding_file = 'output/embeddings/' + dataset['name'] + f'_embedding_{args["nhidden"]}.txt'
    with open(embedding_file, 'w') as f:
        for embedding in embeddings:
            embedding_str = ' '.join(str(value) for value in embedding)
            f.write(embedding_str + '\n')

    # Clustering
    kmeans = KMeans(n_clusters=dataset['n_clusters'], random_state=0).fit(embeddings)

    nmi = normalized_mutual_info_score(dataset['y'], kmeans.labels_)
    # Other metrics
    
    return pd.DataFrame({'graph': str(dataset['G'].edges()), 'y_true': str(dataset['y']), 'y_result': str(list(kmeans.labels_)), 'nmi': str(nmi), 'args': args}, index=[0])

In [6]:
datasets

[{'name': 'as',
  'G': <networkx.classes.graph.Graph at 0x207e8767970>,
  'y': {1: 0,
   2: 0,
   3: 1,
   4: 2,
   5: 0,
   6: 1,
   7: 0,
   8: 0,
   9: 0,
   10: 0,
   11: 0,
   12: 0,
   13: 1,
   14: 0,
   15: 1,
   16: 1,
   17: 0,
   18: 3,
   19: 0,
   20: 0,
   21: 0,
   22: 0,
   23: 0,
   24: 1,
   25: 1,
   26: 1,
   27: 1,
   28: 1,
   29: 4,
   30: 0,
   31: 1,
   32: 0,
   33: 5,
   34: 1,
   35: 1,
   36: 6,
   37: 0,
   38: 4,
   39: 7,
   40: 8,
   41: 1,
   42: 2,
   43: 1,
   44: 9,
   45: 1,
   46: 1,
   47: 1,
   48: 10,
   49: 11,
   50: 6,
   51: 12,
   52: 13,
   53: 0,
   54: 0,
   55: 0,
   56: 0,
   57: 0,
   58: 0,
   59: 1,
   60: 0,
   61: 0,
   62: 0,
   63: 1,
   64: 0,
   65: 1,
   66: 1,
   67: 0,
   68: 0,
   69: 0,
   70: 1,
   71: 1,
   72: 1,
   73: 0,
   74: 0,
   75: 0,
   76: 0,
   77: 0,
   78: 0,
   79: 0,
   80: 14,
   81: 0,
   82: 1,
   83: 0,
   84: 1,
   85: 0,
   86: 11,
   87: 0,
   88: 0,
   89: 1,
   90: 0,
   91: 0,
   92: 0,
   93:

In [10]:
for dataset in datasets:
    print('testing on ', dataset['name'])
    if dataset['name'] == 'karate':
        args['nhidden'] = 4
    else:
        args['nhidden'] = 10
    row = test(dataset, args, lrc=True)
    tests_df = pd.concat([tests_df, row])
    # update the file of output/tests.csv
    tests_df.to_csv('output/tests.csv', index=False)
# tests_df.to_csv('output/tests.csv', mode='a', index=False)

testing on  as
{'data_path': '../data/karate.adjlist', 'outf': 'example', 'maxlen': 100, 'nhidden': 10, 'emsize': 30, 'nlayers': 1, 'noise_radius': 0.2, 'noise_anneal': 0.995, 'hidden_init': False, 'arch_g': '300-300', 'arch_d': '300-300', 'z_size': 100, 'temp': 1, 'enc_grad_norm': True, 'gan_toenc': -0.01, 'dropout': 0.0, 'epochs': 50, 'walk_length': 20, 'numWalks_per_node': 30, 'batch_size': 64, 'niters_ae': 1, 'niters_gan_d': 5, 'niters_gan_g': 1, 'niters_gan_schedule': '2-4-6-10-20-30-40', 'min_epochs': 6, 'no_earlystopping': False, 'lr_ae': 1, 'lr_gan_g': 5e-05, 'lr_gan_d': 1e-05, 'beta1': 0.9, 'clip': 1, 'gan_clamp': 0.01, 'sample': False, 'log_interval': 200, 'seed': 1111, 'cuda': 'store_true', 'ntokens': 34}
e:\CommunityDetectionModel\LRC-Q-NetRA\src
cuda is True
store_true


In [64]:
tests_df

Unnamed: 0,graph,y_true,y_result,nmi
0,"[(1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (1, 7...","[7, 0, 2, 3, 7, 3, 2, 8, 8, 7, 3, 10, 6, 2, 6,...","[1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, ...",0.3630770937606055
0,"[(1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (1, 7...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, ...","[1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, ...",0.3630770937606055
0,"[(1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (1, 7...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, ...","[1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, ...",0.8364981174679549
