# Count $<i,j>$ pairs as inputs to learn Node Embedding

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
from scipy.io import loadmat
from random import choices
import copy
from scipy.sparse import issparse
from statistics import mean,stdev

## Find percolation threshold. Average size of largest cluster $\langle s_{max} \rangle$ vs. $\beta$

### 1. Transform edgelist to Graph

In [2]:
def load_matfile(file_, variable_name="network", unDirected=True, unWeighted = True):
    """Build a networkx graph from an adjacency matrix stored in a .mat file.

    Parameters
    ----------
    file_ : passed straight to ``scipy.io.loadmat``.
    variable_name : name of the variable inside the .mat file that holds the
        adjacency matrix.
    unDirected : build an ``nx.Graph`` when True, an ``nx.DiGraph`` otherwise.
    unWeighted : when True only the (row, col) positions of the stored entries
        become edges; when False the stored value is attached as the edge
        ``weight``.

    Returns
    -------
    The constructed graph.  Only nodes that appear in at least one stored
    entry are added; rows/columns without entries produce no node.

    Raises
    ------
    KeyError
        If ``variable_name`` is not present in the file.
    """
    # Local import: the file-level imports only bring in `issparse`.
    from scipy.sparse import coo_matrix

    mat_variables = loadmat(file_)
    mat_matrix = mat_variables[variable_name]

    # Previously a dense matrix fell through without ever creating G and the
    # function died with UnboundLocalError; convert it to COO instead (only
    # the non-zero entries are kept).
    if issparse(mat_matrix):
        cx = mat_matrix.tocoo()
    else:
        cx = coo_matrix(mat_matrix)

    G = nx.Graph() if unDirected else nx.DiGraph()

    # cx.row -> source nodes, cx.col -> target nodes, cx.data -> link weights.
    # tolist() yields native Python numbers, so node labels stay integers even
    # on the weighted path (stacking row/col/data into one array would upcast
    # the labels to float).
    from_nodes = cx.row.tolist()
    to_nodes = cx.col.tolist()
    if unWeighted:
        G.add_edges_from(zip(from_nodes, to_nodes))
    else:
        G.add_weighted_edges_from(zip(from_nodes, to_nodes, cx.data.tolist()))
    return G

In [3]:
def percolate(G, beta, unWeighted=True):
    """Return a bond-percolated copy of ``G`` keeping a fraction ``beta`` of edges.

    The result is always an undirected ``nx.Graph`` containing every node of
    ``G``; directed or weighted inputs are not treated specially.

    Parameters
    ----------
    G : graph whose nodes are copied and whose edges are sampled.
    beta : fraction of edges to keep, in [0, 1].  ``int(beta * |E|)`` distinct
        edges are drawn uniformly at random.
    unWeighted : when False no edge is added (weight-dependent percolation is
        not implemented) and the returned graph holds only the nodes.

    Raises
    ------
    ValueError
        If ``beta`` lies outside [0, 1].
    """
    # Local import: the file-level import only provides `choices`.
    from random import sample

    if not 0 <= beta <= 1:
        raise ValueError("beta must be in [0, 1], got {}".format(beta))

    G_percolated = nx.Graph()
    G_percolated.add_nodes_from(G.nodes)
    if unWeighted:
        numPercolatedEdges = int(beta * G.number_of_edges())
        # Sample WITHOUT replacement.  `choices` draws with replacement, so
        # repeated picks collapsed into one edge and the percolated graph
        # ended up with fewer occupied bonds than requested.
        edges_percolated = sample(list(G.edges), k=numPercolatedEdges)
        G_percolated.add_edges_from(edges_percolated)
    # else: bond percolation probability would depend on the edge's weight
    return G_percolated

In [4]:
def count_clusters(G, beta, numIter, threshold):
    """Percolate ``G`` ``numIter`` times and collect cluster sizes.

    Parameters
    ----------
    G : graph handed to ``percolate``.
    beta : edge fraction handed to ``percolate``.
    numIter : number of independent percolation runs.
    threshold : only clusters with strictly more than ``threshold`` nodes are
        counted.

    Returns
    -------
    (Gc_sizes, Clusters_sizes)
        ``Gc_sizes`` has one entry per run: the size of the largest counted
        cluster, or 0 when no cluster exceeds ``threshold``.
        ``Clusters_sizes`` is the flat list of all counted cluster sizes over
        all runs.
    """
    Gc_sizes = []
    Clusters_sizes = []
    for _ in range(numIter):
        G_percolated = percolate(G, beta)
        # nx.connected_components yields one node set per component; it
        # replaces connected_component_subgraphs, which newer networkx
        # releases no longer provide, and avoids building subgraph objects
        # just to count their nodes.
        clusters = [
            len(component)
            for component in nx.connected_components(G_percolated)
            if len(component) > threshold
        ]
        Clusters_sizes.extend(clusters)
        # default=0 keeps a run without any qualifying cluster from raising
        # ValueError on an empty max().
        Gc_sizes.append(max(clusters, default=0))

    return Gc_sizes, Clusters_sizes

In [5]:
def do_ClustersStatistics(G, betas, numIter, threshold, logging):
    """Run ``count_clusters`` once per beta and gather the results.

    Returns two lists aligned with ``betas``: the first holds the first value
    returned by ``count_clusters`` for each beta, the second holds the second
    value.  When ``logging`` is truthy a progress line is printed after each
    beta has been processed.
    """
    largest_by_beta, all_sizes_by_beta = [], []
    for b in betas:
        largest, all_sizes = count_clusters(
            G=G,
            beta=b,
            numIter=numIter,
            threshold=threshold,
        )
        largest_by_beta.append(largest)
        all_sizes_by_beta.append(all_sizes)
        if not logging:
            continue
        print("beta = {0:.4f} calculated".format(b))
    return largest_by_beta, all_sizes_by_beta

In [6]:
def plot_averGcSizes(GcSizes_betas, betas):
    """Draw figure 1: mean of each entry of ``GcSizes_betas`` against beta.

    Error bars show the sample standard deviation of the same entry.
    """
    plt.figure(1)
    # One (mean, stdev) row per beta.
    summary = np.array(
        [(mean(run_sizes), stdev(run_sizes)) for run_sizes in GcSizes_betas]
    )
    avg_sizes = summary[:, 0]
    spread = summary[:, 1]
    plt.errorbar(x=betas, y=avg_sizes, yerr=spread, fmt='o')
    plt.xlabel('beta')
    plt.ylabel('Gc_size')

def plot_GcSizes(GcSizes_betas, betas):
    """Draw figure 2: every value in ``GcSizes_betas[i]`` plotted at ``betas[i]``.

    Points use the ',' marker and the x-range is padded by 0.0001 on both
    sides of the beta range.
    """
    plt.figure(2)
    for idx, b in enumerate(betas):
        run_sizes = GcSizes_betas[idx]
        xs = [b] * len(run_sizes)
        plt.plot(xs, run_sizes, ',')
    plt.xlabel('beta')
    plt.ylabel('Gc_size')
    x_low = min(betas) - 0.0001
    x_high = max(betas) + 0.0001
    plt.xlim(x_low, x_high)
def plot_averClustersSizes(ClustersSizes_betas, betas):
    """Draw figure 3: mean of each entry of ``ClustersSizes_betas`` against beta.

    Error bars show the sample standard deviation of the same entry.
    """
    plt.figure(3)
    # One (mean, stdev) row per beta.
    summary = np.array(
        [(mean(sizes_at_b), stdev(sizes_at_b)) for sizes_at_b in ClustersSizes_betas]
    )
    avg_sizes = summary[:, 0]
    spread = summary[:, 1]
    plt.errorbar(x=betas, y=avg_sizes, yerr=spread, fmt='o')

def plot_ClustersSizes(ClustersSizes_betas, betas):
    """Draw figure 4: every value in ``ClustersSizes_betas[i]`` plotted at ``betas[i]``.

    Points use the ',' marker and the x-range is padded by 0.0001 on both
    sides of the beta range.  The y-range is left to matplotlib.
    """
    plt.figure(4)
    for idx, b in enumerate(betas):
        sizes_at_b = ClustersSizes_betas[idx]
        xs = [b] * len(sizes_at_b)
        plt.plot(xs, sizes_at_b, ',')
    x_low = min(betas) - 0.0001
    x_high = max(betas) + 0.0001
    plt.xlim(x_low, x_high)

In [7]:
def cal_numClusters(G, betas, thresholds, numIter):
    """Average, over ``numIter`` runs, the node mass outside the largest cluster.

    For every beta the graph is percolated once per run.  For every threshold
    ``t`` the clusters with strictly more than ``t`` nodes are selected and the
    recorded value is ``sum(sizes) - max(sizes)``, i.e. the total number of
    nodes in the selected clusters except the largest one.  (Despite the
    function name this is a node count, not a number of clusters.)

    Parameters
    ----------
    G : graph handed to ``percolate``.
    betas : sequence of edge fractions.
    thresholds : sequence of minimum cluster sizes (exclusive).
    numIter : number of percolation runs to average over.

    Returns
    -------
    numpy array of shape ``(len(betas), len(thresholds))`` with the per-run
    values averaged over ``numIter``.
    """
    aver_numClusters = np.zeros([len(betas), len(thresholds)])
    for n in range(numIter):
        for i, beta in enumerate(betas):
            G_percolated = percolate(G, beta)
            # Component sizes do not depend on the threshold, so compute them
            # once per percolated graph.  nx.connected_components replaces
            # connected_component_subgraphs, which newer networkx releases no
            # longer provide.
            sizes = [len(c) for c in nx.connected_components(G_percolated)]
            for j, t in enumerate(thresholds):
                kept = [s for s in sizes if s > t]
                # No cluster above the threshold contributes 0 rather than
                # crashing on max() of an empty list.
                if kept:
                    aver_numClusters[i, j] += sum(kept) - max(kept)
        print("num_iter = ", n)
    return aver_numClusters/numIter

In [8]:
def plot_numClusters(numClusters, betas=None, thresholds=None):
    """Plot one curve per threshold of ``numClusters`` against beta.

    Parameters
    ----------
    numClusters : 2-D array-like indexed as ``[beta, threshold]``.
    betas : x values, one per row of ``numClusters``.  When omitted, the
        module-level variable ``betas`` is used (the original behaviour).
    thresholds : one label per column of ``numClusters``.  When omitted, the
        module-level variable ``thresholds`` is used (the original behaviour).

    Each threshold gets its own figure, numbered from 5 upward, with a fixed
    y-range of 0..1300.

    Raises
    ------
    NameError
        If an argument is omitted and no module-level variable of that name
        exists.
    """
    def _from_module_scope(name):
        # Fallback keeps old single-argument calls working.
        try:
            return globals()[name]
        except KeyError:
            raise NameError(
                "name '{}' is not defined; pass it explicitly".format(name)
            ) from None

    if betas is None:
        betas = _from_module_scope("betas")
    if thresholds is None:
        thresholds = _from_module_scope("thresholds")

    # Transpose so that each row holds the curve for one threshold.
    data_numClusters = np.array(numClusters).T
    for i, t in enumerate(thresholds):
        plt.figure(i+5)
        plt.plot(betas, data_numClusters[i, :])
        plt.xlim(min(betas)-0.0001, max(betas)+0.0001)
        plt.ylim(0, 1300)
        plt.title("threshold = "+str(t))
In [None]:
if __name__ == "__main__":
    # "blogcatalog.mat" is an undirected, unweighted graph
    G_blogcatalog = load_matfile(file_ = "blogcatalog.mat")
    # Sanity-check the loaded graph against the expected dataset size.
    assert(G_blogcatalog.number_of_nodes() == 10312)
    assert(G_blogcatalog.number_of_edges() == 333983)
    
    # percolate
    betas = np.linspace(0.002,0.01,21)
    GcSizes_betas,ClustersSizes_betas = do_ClustersStatistics(G = G_blogcatalog, 
                                                              betas = betas, 
                                                              numIter = 20, 
                                                              threshold = 10, 
                                                              logging=False)
    plot_averGcSizes(GcSizes_betas, betas)
    plot_GcSizes(GcSizes_betas, betas)
    plot_averClustersSizes(ClustersSizes_betas, betas)
    # Call the function; the former `plot_ClustersSizes=(...)` rebound the
    # name to a tuple, so figure 4 was never drawn and the function was lost.
    plot_ClustersSizes(ClustersSizes_betas, betas)

In [None]:
# plot_numClusters reads `betas` and `thresholds` from module scope, so bind
# them here to exactly the values used for the computation; otherwise
# `thresholds` is undefined and `betas` still holds a different grid.
betas = np.linspace(0.003, 0.008, 21)
thresholds = [1, 10, 20]
numClusters = cal_numClusters(G=G_blogcatalog,
                              betas=betas,
                              thresholds=thresholds,
                              numIter=2)
plot_numClusters(numClusters)