Regulome Clustering with SKNN and Infomap

In [1]:
import pandas as pd
import igraph as ig
import seaborn as sns
from math import log10
from statistics import mean, median
import os

path = "./files/sknn_networks/"

In [103]:
sknn = pd.read_csv(filepath_or_buffer=path+"FC6.0_H.sapiens_REGULOME-S10NN.gz", sep=' ', header=None, names=['ProteinAid', 'ProteinBid', 'weight'], compression=None)
sknn = sknn[sknn['weight'] > 0]
sknn_graph = ig.Graph.DataFrame(sknn, directed=True, use_vids=False)

In [75]:
def sknn_infomap_info(file, filter_0=True, weighted=True, directed=False, n_trials=10, return_clusters=False):
    cluster_properties = dict()
    
    sknn = pd.read_csv(filepath_or_buffer=path+file, sep=' ', header=None, names=['ProteinAid', 'ProteinBid', 'weight'], compression=None)
    if filter_0:
        sknn = sknn[sknn['weight'] > 0]

    if weighted:
        weight = 'weight'
    else:
        weight = None
    
    sknn_graph = ig.Graph.DataFrame(sknn, directed=directed, use_vids=False)

    infomap_clusters = sknn_graph.community_infomap(edge_weights=weight, trials=n_trials)    
    
    if return_clusters:
        return infomap_clusters

    k = str()
    for letter in file[-6::-1]:
        if letter == 'S':
            break
        k += letter

    cluster_sizes = infomap_clusters.sizes()
    cluster_properties['k'] = int(k[::-1])
    cluster_properties['n_edges'] = sknn_graph.ecount()
    cluster_properties['n_nodes'] = sknn_graph.vcount()
    cluster_properties['cluster_coeff'] = sknn_graph.transitivity_avglocal_undirected()
    cluster_properties['n_clusters'] = infomap_clusters.__len__()
    cluster_properties['mean_size'] = mean(cluster_sizes)
    cluster_properties['median_size'] = median(cluster_sizes)
    cluster_properties['max_size'] = max(cluster_sizes)
    return cluster_properties


In [81]:
sknn_properties = list()
for file in os.listdir(path):
    if len(sknn_properties) == 4:
        break
    sknn_properties.append(sknn_infomap_info(file, filter_0=True, weighted=True, n_trials=20))
sknn_properties = pd.DataFrame.from_records(sknn_properties)
sknn_properties.sort_values('k')

Unnamed: 0,k,n_edges,n_nodes,cluster_coeff,n_clusters,mean_size,median_size,max_size
1,10,169821,11851,0.350985,29,408.655172,4.0,11746
3,25,386371,12416,0.524328,14,886.857143,3.0,12368
0,750,497825,13557,0.703774,7,1936.714286,2.0,13538
2,1000,498217,13580,0.705739,6,2263.333333,2.5,13563


Unnamed: 0,k,n_edges,n_nodes,cluster_coeff,n_clusters,mean_size,median_size,max_size
1,10,501720,15041,0.678399,15041,4.66243,1.0,11704
3,25,501720,15041,0.678399,15041,5.693036,1.0,12340
5,50,501720,15041,0.678399,15041,6.608524,1.0,12716
4,75,501720,15041,0.678399,15041,7.186335,1.0,12919
6,100,501720,15041,0.678399,15041,7.565895,1.0,13040
7,250,501720,15041,0.678399,15041,8.88948,1.0,13315
8,500,501720,15041,0.678399,15041,9.580255,1.0,13453
0,750,501720,15041,0.678399,15041,10.08786,1.0,13537
2,1000,501720,15041,0.678399,15041,10.245913,1.0,13560


In [68]:
clusters_vs_trials = dict()
for i in range(1,100, 10):
    clusters = sknn_infomap_info('FC6.0_H.sapiens_REGULOME-S250NN.gz', filter_0=True, weighted=True, return_clusters=True, n_trials=i)
    clusters_vs_trials[i] = clusters.__len__()

In [72]:
clusters_vs_trials.values()

dict_values([10, 10, 10, 10, 10, 10, 10, 10, 10, 10])