Regulome Clustering with SKNN and Infomap

In [3]:
import pandas as pd
import igraph as ig
import seaborn as sns
from math import log10
from statistics import mean, median
import os
from compress_pickle import dump, load

sknn_path = "./files/sknn_networks/"

regulome_network_edges = pd.read_csv(filepath_or_buffer='./files/human_regulome_pd.gz', compression='infer')
regulome_network_edges = regulome_network_edges.astype({'ProteinAid': 'str', 'ProteinBid':'str'})

regulome_network_graph = ig.Graph.DataFrame(regulome_network_edges, directed=True, use_vids = False)
regulome_network_graph_undirected = ig.Graph.DataFrame(regulome_network_edges, directed=False, use_vids = False)

path = './files/'

def open_pickle(file):
    with open(path+file, 'rb') as pickle_file:
        return load(path=pickle_file, compression='infer')

In [4]:
# infomap_clusters_undirected = regulome_network_graph_undirected.community_infomap(edge_weights='PPV', trials=500)
# with open('./files/infomap_undirected.gz', 'wb') as result_pkl:
#         dump(infomap_clusters_undirected, result_pkl, compression='infer')

# infomap_clusters_directed = regulome_network_graph.community_infomap(edge_weights='PPV', trials=500)
# with open('./files/infomap_directed.gz', 'wb') as result_pkl:
#         dump(infomap_clusters_directed, result_pkl, compression='infer')

In [6]:
infomap_clusters_directed = open_pickle('infomap_directed.gz')
infomap_clusters_undirected = open_pickle('infomap_undirected.gz')

In [11]:
def infomap_info(clusters):
    cluster_sizes = clusters.sizes()
    print(f"Number of clusters: {len(cluster_sizes)}")
    print(f"Mean cluster size: {mean(cluster_sizes)}")
    print(f"Median cluster size: {median(cluster_sizes)}")
    print(f"Max cluster size: {max(cluster_sizes)}")

In [13]:
infomap_info(infomap_clusters_directed)

Number of clusters: 39
Mean cluster size: 385.6666666666667
Median cluster size: 6
Max cluster size: 14657


In [17]:
def sknn_infomap_info(file, filter_0=True, weighted=True, directed=False, n_trials=10, return_clusters=False):
    cluster_properties = dict()
    
    sknn = pd.read_csv(filepath_or_buffer=sknn_path+file, sep=' ', header=None, names=['ProteinAid', 'ProteinBid', 'weight'], compression=None)
    if filter_0:
        sknn = sknn[sknn['weight'] > 0]

    if weighted:
        weight = 'weight'
    else:
        weight = None
    
    sknn_graph = ig.Graph.DataFrame(sknn, directed=directed, use_vids=False)
    infomap_clusters = sknn_graph.community_infomap(edge_weights=weight, trials=n_trials)    
    
    if return_clusters:
        return infomap_clusters

    k = str()
    for letter in file[-6::-1]:
        if letter == 'S':
            break
        k += letter

    cluster_sizes = infomap_clusters.sizes()
    cluster_properties['k'] = int(k[::-1])
    cluster_properties['n_edges'] = sknn_graph.ecount()
    cluster_properties['n_nodes'] = sknn_graph.vcount()
    cluster_properties['cluster_coeff'] = sknn_graph.transitivity_avglocal_undirected()
    cluster_properties['n_clusters'] = infomap_clusters.__len__()
    cluster_properties['mean_size'] = mean(cluster_sizes)
    cluster_properties['median_size'] = median(cluster_sizes)
    cluster_properties['max_size'] = max(cluster_sizes)
    print(k)
    return cluster_properties

sknn_properties = list()
for file in os.listdir(sknn_path):

    sknn_properties.append(sknn_infomap_info(file, filter_0=True, directed=False, weighted=True, n_trials=500))
sknn_properties = pd.DataFrame.from_records(sknn_properties)
sknn_properties.sort_values('k')

001
052
05
57
005
0001
52
057
01


Unnamed: 0,k,n_edges,n_nodes,cluster_coeff,n_clusters,mean_size,median_size,max_size
8,10,169821,11851,0.350985,31,382.290323,3.0,11739
6,25,386371,12416,0.524328,14,886.857143,3.0,12368
2,50,467238,12776,0.66487,9,1419.555556,2.0,12756
3,75,482162,12959,0.696896,8,1619.875,2.5,12942
0,100,486526,13063,0.702693,9,1451.444444,3.0,13042
1,250,494592,13360,0.703843,10,1336.0,2.5,13320
4,500,496999,13481,0.702534,9,1497.888889,2.0,13456
7,750,497825,13557,0.703774,7,1936.714286,2.0,13538
5,1000,498217,13580,0.705739,6,2263.333333,2.5,13563


In [16]:
with open('./files/infomap_sknn_results_directed.gz', 'wb') as result_pkl:
        dump(sknn_properties, result_pkl, compression='infer')

In [68]:
clusters_vs_trials = dict()
for i in range(1,100, 10):
    clusters = sknn_infomap_info('FC6.0_H.sapiens_REGULOME-S250NN.gz', filter_0=True, weighted=True, return_clusters=True, n_trials=i)
    clusters_vs_trials[i] = clusters.__len__()

In [None]:
clusters_vs_trials.values()