In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cm
import networkx as nx
from tqdm import tqdm
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import adjusted_rand_score

import utils

%config InlineBackend.figure_format = 'retina'

# Clustering parameters.
# The "lower" pair is what the visible cells pass to utils.enhance_with_clusterings
# as `eps` / `min_samples`; the "default" pair is not referenced in the visible cells.
# NOTE(review): presumably these map onto DBSCAN's eps / min_samples (DBSCAN is
# imported above) and eps is in units of the min-max-scaled coordinates — confirm in utils.
default_cluster_eps = 0.021
default_cluster_samples = 10

lower_cluster_eps = 0.012
lower_min_samples = 10

In [2]:
# Every 2-D embedding table uses the same layout: node id plus two coordinates.
emb_columns = ['ids', 'X', 'Y']


def _read_2d_csv(path):
    """Read a header-less, comma-separated 2-D embedding file into an ids/X/Y frame."""
    return pd.read_csv(path, header=None, names=emb_columns)


# Per-node graph statistics (cc, gamma and degree), indexed by the first CSV column.
graph_stats = pd.read_csv("gammas.csv", index_col=0, header=0)

# struc2vec: the 2-dim embedding is read straight from the .emb file (its first
# line is skipped); the higher-dimensional ones are read from the tsne_*dim.csv files.
s2v_2 = pd.read_csv('../struc2vec/emb/bitcoin-2dims.emb', skiprows=[0], sep=" ", names=emb_columns)
s2v_10, s2v_25, s2v_50, s2v_100, s2v_300 = map(_read_2d_csv, [
    'emb/2d/tsne_10dim.csv',
    'emb/2d/tsne_25dim.csv',
    'emb/2d/tsne_50dim.csv',
    'emb/2d/tsne_100dim.csv',
    'emb/2d/tsne_300dim.csv',
])

# node2vec embeddings
n2v_2 = pd.read_csv('../node2vec/emb/n2v_2dim_btc.emb', skiprows=[0], sep=" ", header=None, names=emb_columns)
n2v_50 = _read_2d_csv('emb/2d/n2v_50_btc.csv')

# Baseline embedding
baseline_df = pd.read_csv('emb/2d/baseline_btc.csv', sep=' ', skiprows=[0], header=None, names=emb_columns)

# Original graph, built from the undirected edge list.
# NOTE(review): header=0 together with names= makes pandas treat the first line
# of the file as a header and discard it — confirm the edgelist really has a
# header row, otherwise the first edge is silently dropped.
edge_list = pd.read_csv('../struc2vec/graph/bitcoin-undirected.edgelist', header=0, sep=' ', names=['from', 'to'])
G = nx.from_pandas_edgelist(edge_list, 'from', 'to')

#cc_gamma_test = pd.read_csv('../struc2vec/emb/cc_gamma_test.emb', skiprows=[0], header=None, sep=' ', names=['ids','X', 'Y'])
#display(cc_gamma_test)
#utils.show_embedding_plot(cc_gamma_test)

In [51]:
# Build one enriched frame per embedding: min-max-scaled coordinates, cluster
# labels, per-node graph statistics and neighborhood averages. The resulting
# frames are collected in `embeddings` for the plotting and Rand-index cells.

# The average neighbor degree depends only on G, so compute it once instead of
# once per embedding (and via the public networkx entry point).
avg_neighbor_degrees = nx.average_neighbor_degree(G)

embeddings = []
for emb in [s2v_50, s2v_25]:
    emb_copy = emb.copy(deep=True).set_index("ids")

    # Rescale both coordinates to [0, 1] before clustering.
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_emb = scaler.fit_transform(emb_copy)
    emb_copy['X'] = scaled_emb[:, 0]
    emb_copy['Y'] = scaled_emb[:, 1]

    # Add clusters and graph data
    emb_copy = utils.enhance_with_clusterings(emb_copy, eps=lower_cluster_eps, min_samples=lower_min_samples)
    emb_copy = emb_copy.join(graph_stats)

    node_ids = emb_copy.index.values

    # Mean degree of each node's neighbors
    mean_neighborhood_degrees = pd.DataFrame(
        {'Neighbor Degrees': [avg_neighbor_degrees[node] for node in node_ids]},
        index=node_ids,
    )

    # Mean gamma and clustering coefficient over each node's neighbors.
    # A neighbor missing from graph_stats raises KeyError here.
    node_gammas = []
    node_ccs = []
    for node in tqdm(node_ids):
        neighbor_ids = list(nx.neighbors(G, node))
        node_gammas.append(np.mean([graph_stats.loc[neighbor, 'Gamma'] for neighbor in neighbor_ids]))
        node_ccs.append(np.mean([graph_stats.loc[neighbor, 'Cluster Coefficient'] for neighbor in neighbor_ids]))
    mean_neighborhood_gamma = pd.DataFrame({'Neighbor Gamma': node_gammas}, index=node_ids)
    mean_neighborhood_cc = pd.DataFrame({'Neighbor CC': node_ccs}, index=node_ids)
    emb_copy = emb_copy.join([mean_neighborhood_degrees, mean_neighborhood_cc, mean_neighborhood_gamma])

    # Filter to show only certain clusters
    #emb_copy = emb_copy[emb_copy['Cluster'].apply(lambda cluster: cluster in [36, 37, 41, 2])]
    #emb_copy = emb_copy[emb_copy['Cluster'].apply(lambda cluster: cluster in [1, 40 ,39])]
    #emb_copy = emb_copy[emb_copy['Cluster'].apply(lambda cluster: cluster in [28])]

    # Filter to show only certain degrees
    #emb_copy = emb_copy[emb_copy['Node Degree'] <= 10]

    # For calculation of Rand index
    embeddings.append(emb_copy)

100%|██████████| 5881/5881 [00:01<00:00, 4642.17it/s]
100%|██████████| 5881/5881 [00:01<00:00, 4787.93it/s]


In [52]:
# Plot settings
cmap = 'gnuplot'
# NOTE(review): the prefix says "10dim" while the visible pipeline builds the
# 50- and 25-dim frames — confirm the prefix before enabling `save`.
save_prefix = 'img/tsne_10dim'
save = False
print_plots = False

if print_plots:
    # A dedicated loop variable keeps this cell from clobbering names used by other cells.
    for emb_idx, embedding in enumerate(embeddings):
        # With several embeddings, suffix the prefix with the embedding's position
        # so saved figures do not overwrite each other; a single embedding keeps
        # the plain prefix.
        emb_prefix = save_prefix if len(embeddings) == 1 else '{}_{}'.format(save_prefix, emb_idx)

        # Logarithmic color norm spanning 1 .. max node degree of this embedding.
        log_norm = utils.get_norm(num_colors=embedding['Node Degree'].max(), logarithmic=True, vmin=1.0)

        ## Show clusters
        #print("Clusters")
        #manual_annotate = {0: (0.3, 0.65), 1: (0.3, 0.47), 2: (0.66, 0.4), 33: (0.8, 0.15)}
        #utils.show_clusters(embedding, cluster_labels=False, cmap=cmap, save=save, save_prefix='{}_cluster.png'.format(emb_prefix))

        # Show node degree (the printed heading now matches the plot that is drawn)
        print("Node Degree")
        utils.show_embedding_plot(embedding, color_col='Node Degree', cmap=cmap, norm=log_norm, save=save, savefile='{}_degree.png'.format(emb_prefix))
#
        ## Show neighbor degrees
        #print("Neighbor Degrees")
        #utils.show_embedding_plot(embedding, color_col='Neighbor Degrees', cmap=cmap, save=save, savefile='{}_neighbormean'.format(emb_prefix))
#
        ## Show neighbor gammas
        #print("Neighbor Gamma")
        #utils.show_embedding_plot(embedding, color_col='Neighbor Gamma', cmap=cmap, save=save, savefile='{}_neighborgammas'.format(emb_prefix))
#
        ## Show neighbor ccs
        #print("Neighbor CC")
        #utils.show_embedding_plot(embedding, color_col='Neighbor CC', cmap=cmap, save=save, savefile='{}_neighborcc'.format(emb_prefix))
#
        ## Show cluster coefficient
        #print("Cluster Coefficient")
        #utils.show_embedding_plot(embedding, color_col='Cluster Coefficient', cmap=cmap, save=save, savefile='{}_cc.png'.format(emb_prefix))
#
        ## Show gamma
        #print("Gamma")
        #utils.show_embedding_plot(embedding, color_col='Gamma', cmap=cmap, save=save, savefile='{}_gamma.png'.format(emb_prefix))

In [53]:
# Compare the cluster assignments of the first two embeddings with the
# adjusted Rand index, restricted to nodes whose degree is greater than 2.
if len(embeddings) >= 2:
    baseline_labels = embeddings[0].loc[embeddings[0]['Node Degree'] > 2, 'Cluster']
    other_labels = embeddings[1].loc[embeddings[1]['Node Degree'] > 2, 'Cluster']

    # The score compares labels position by position, so both label vectors
    # must describe the same nodes in the same order. Align them on the shared
    # node ids instead of assuming both frames hold identical node sets.
    shared_nodes = baseline_labels.index.intersection(other_labels.index).sort_values()
    baseline_clusters = baseline_labels.loc[shared_nodes].values
    other_clusters = other_labels.loc[shared_nodes].values

    # NOTE(review): if the cluster labels come from DBSCAN, all noise points
    # (label -1) are scored as one common cluster — confirm that is intended.
    adj_rand = adjusted_rand_score(baseline_clusters, other_clusters)

    print("Adjusted Rand index: {}".format(adj_rand))

Adjusted Rand index: 0.7894814181770631


In [None]:
# Disabled exploratory cell: for clusters 36, 37, 41 and 2 it takes the other
# endpoint of each member's first edge and prints mean/median degree, clustering
# coefficient and gamma of those neighbors (each measured on the neighbor's ego graph).
# NOTE(review): it references `clustered_embeddings`, which is not defined in the
# visible cells — point it at a frame from `embeddings` before re-enabling.
#clusters = [utils.get_cluster_members_index(clustered_embeddings, i) for i in [36, 37, 41, 2]]
#cluster_neighbors = []
#for cluster in clusters:
#    neighbors = []
#    for node in cluster:
#        source, dest = list(G.edges(node))[0]
#        neighbor = source if source != node else dest
#        neighbors.append(neighbor)
#    cluster_neighbors.append(neighbors)
#
#for i, neighbors in enumerate(cluster_neighbors):
#    degrees, ccs, gammas = [], [], []
#    for node in neighbors:
#        egograph = nx.ego_graph(G, node)
#        degrees.append(utils.get_degree(egograph, node))
#        ccs.append(nx.clustering(egograph, node))
#        gammas.append(utils.gamma(egograph, node))
#    print("Cluster {}".format(i))
#    print("Mean degree {}".format(np.mean(degrees)))
#    print("Mean cc {}".format(np.mean(ccs)))
#    print("Mean gamma {}".format(np.mean(gammas)))
#    
#    print("Median degree {}".format(np.median(degrees)))
#    print("Median cc {}".format(np.median(ccs)))
#    print("Median gamma {}".format(np.median(gammas)))

In [None]:
# Inspect selected clusters: list their members, the first neighbor of every
# member, and save an ego-graph comparison for the first member of each cluster.
inspect_cluster_ids = [28]

# Use the last frame built by the enrichment cell explicitly, rather than the
# loop variable that happens to be left over from an earlier cell.
clustered_frame = embeddings[-1]
clusters = [utils.get_cluster_members_index(clustered_frame, cluster_id) for cluster_id in inspect_cluster_ids]
# Optional: restrict every cluster to a 10-member window.
#clusters = [cluster[10:20] for cluster in clusters]

for nodes in clusters:
    print(nodes)
    neighbors = []
    for node in nodes:
        # Other endpoint of the first edge reported for this node.
        source, dest = next(iter(G.edges(node)))
        neighbors.append(source if source != node else dest)
    unique_neighbors = set(neighbors)
    print(unique_neighbors)
    print(len(unique_neighbors))
    print(len(neighbors))
    print()

# Only the first tuple (the first member of every cluster) is compared; the
# original loop broke out after its first iteration with index 0.
first_nodes = next(zip(*clusters), None)
if first_nodes is not None:
    utils.create_egograph_comparison(G, first_nodes, 0, radius=2, save=True, savepath='img/egograph_comparison_zoomed')