Regulome Clustering with SKNN and Infomap

In [1]:
import pandas as pd
import igraph as ig
import seaborn as sns
from math import log10
from statistics import mean, median
import os

# Directory holding the SKNN network files. File names are appended with plain
# string concatenation (path + file), so the trailing "/" is required.
path = "./files/sknn_networks/"

In [2]:
# Load the k=10 network as a space-separated, header-less edge list.
sknn = pd.read_csv(
    filepath_or_buffer=path + "FC6.0_H.sapiens_REGULOME-S10NN.gz",
    sep=' ',
    header=None,
    names=['ProteinAid', 'ProteinBid', 'weight'],
    compression=None,
)
# Keep only edges with a strictly positive weight.
sknn = sknn[sknn['weight'].gt(0)]
# Build a directed graph whose vertices are named by the protein identifiers.
sknn_graph = ig.Graph.DataFrame(
    sknn,
    directed=True,
    use_vids=False,
)

In [12]:
def sknn_infomap_info(file, filter_0=True, weighted=True, directed=False, n_trials=10, return_clusters=False):
    """Cluster one SKNN network with Infomap and summarise the outcome.

    The edge list is read from ``path + file`` (``path`` is the module-level
    directory prefix) as a space-separated, header-less table with the columns
    ``ProteinAid``, ``ProteinBid`` and ``weight``.

    Parameters
    ----------
    file : str
        File name inside ``path``. For the summary, the text between the last
        ``'S'`` and the final five characters (e.g. ``NN.gz``) is parsed as the
        integer ``k``, as in ``..._REGULOME-S250NN.gz`` -> 250.
    filter_0 : bool
        If True, keep only edges whose weight is strictly greater than 0.
    weighted : bool
        If True, ``'weight'`` is passed as ``edge_weights`` to Infomap;
        otherwise ``None`` is passed.
    directed : bool
        Forwarded to ``ig.Graph.DataFrame`` when building the graph.
    n_trials : int
        Forwarded to ``community_infomap`` as ``trials``.
    return_clusters : bool
        If True, return the clustering object itself instead of the summary.

    Returns
    -------
    The object returned by ``community_infomap`` when ``return_clusters`` is
    True; otherwise a dict with the keys ``k``, ``n_edges``, ``n_nodes``,
    ``cluster_coeff``, ``n_clusters``, ``mean_size``, ``median_size`` and
    ``max_size`` (in that order).

    Raises
    ------
    ValueError
        If the summary is requested and ``k`` cannot be parsed from ``file``.
    """
    # Parse k up front so that a malformed file name fails before the expensive
    # load/cluster steps. Only the summary needs k, so skip it otherwise.
    k = None
    if not return_clusters:
        k = int(file[:-5].rpartition('S')[2])

    sknn = pd.read_csv(filepath_or_buffer=path+file, sep=' ', header=None, names=['ProteinAid', 'ProteinBid', 'weight'], compression=None)
    if filter_0:
        sknn = sknn[sknn['weight'] > 0]

    edge_weights = 'weight' if weighted else None

    sknn_graph = ig.Graph.DataFrame(sknn, directed=directed, use_vids=False)
    infomap_clusters = sknn_graph.community_infomap(edge_weights=edge_weights, trials=n_trials)

    if return_clusters:
        return infomap_clusters

    cluster_sizes = infomap_clusters.sizes()
    # Key order is preserved; it determines the column order of the summary table.
    return {
        'k': k,
        'n_edges': sknn_graph.ecount(),
        'n_nodes': sknn_graph.vcount(),
        'cluster_coeff': sknn_graph.transitivity_avglocal_undirected(),
        'n_clusters': len(infomap_clusters),
        'mean_size': mean(cluster_sizes),
        'median_size': median(cluster_sizes),
        'max_size': max(cluster_sizes),
    }

# Summarise every SKNN network found in `path`.
# os.listdir() returns entries in arbitrary order and may include stray entries
# (hidden files, checkpoint folders), so iterate in sorted order and skip
# anything that does not follow the "...S<k>NN.gz" naming used by the networks.
sknn_properties = list()
for file in sorted(os.listdir(path)):
    if not file.endswith('NN.gz'):
        continue
    sknn_properties.append(sknn_infomap_info(file, filter_0=True, weighted=True, n_trials=20))
sknn_properties = pd.DataFrame.from_records(sknn_properties)
sknn_properties.sort_values('k')

Unnamed: 0,k,n_edges,n_nodes,cluster_coeff,n_clusters,mean_size,median_size,max_size
1,10,169821,11851,0.350985,31,382.290323,3.0,11740
3,25,386371,12416,0.524328,14,886.857143,3.0,12368
5,50,467238,12776,0.66487,9,1419.555556,2.0,12756
4,75,482162,12959,0.696896,7,1851.285714,2.0,12945
6,100,486526,13063,0.702693,9,1451.444444,3.0,13042
7,250,494592,13360,0.703843,10,1336.0,2.5,13320
8,500,496999,13481,0.702534,9,1497.888889,2.0,13456
0,750,497825,13557,0.703774,7,1936.714286,2.0,13538
2,1000,498217,13580,0.705739,6,2263.333333,2.5,13563


In [68]:
# Record how many Infomap clusters are found on the k=250 network for
# trial counts 1, 11, 21, ..., 91.
clusters_vs_trials = {}
for i in range(1, 100, 10):
    clusters = sknn_infomap_info(
        'FC6.0_H.sapiens_REGULOME-S250NN.gz',
        filter_0=True,
        weighted=True,
        n_trials=i,
        return_clusters=True,
    )
    clusters_vs_trials[i] = len(clusters)

In [72]:
# Number of Infomap clusters obtained for each tested trial count.
clusters_vs_trials.values()

dict_values([10, 10, 10, 10, 10, 10, 10, 10, 10, 10])