In [1]:
import pandas as pd
import igraph as ig
from ease import EASE
import seaborn as sns
import numpy as np
from compress_pickle import load, dump
import multiprocessing
import itertools as it
import math

# Base directory for the notebook's input and output files, relative to the working directory.
# It is joined to file names by plain string concatenation, so the trailing slash is required.
path = './files/'

In [2]:
def open_pickle(file):
    """Load one serialized object stored under the notebook's ``path`` directory.

    Parameters
    ----------
    file : str
        File name relative to ``path`` (``path`` is prepended by string concatenation).

    Returns
    -------
    The object returned by ``compress_pickle.load`` for that file.

    Notes
    -----
    Unpickling executes code embedded in the file, so only load files you trust.
    NOTE(review): ``compression='infer'`` is used together with an already-open
    file object rather than a path -- confirm the installed compress_pickle
    version resolves the codec as intended in that case.
    """
    source_name = path + file
    with open(source_name, 'rb') as pickle_file:
        loaded_object = load(path=pickle_file, compression='infer')
    return loaded_object


In [3]:
# Read the regulome edge list and force both protein-id columns to strings.
regulome_network_edges = pd.read_csv(
    filepath_or_buffer='./files/human_regulome_pd.gz',
    compression='infer',
).astype({'ProteinAid': 'str', 'ProteinBid': 'str'})

# Build a directed and an undirected igraph view from the same edge table.
# use_vids=False: per the igraph docs the first two columns are read as vertex
# names rather than integer vertex ids (TODO confirm ProteinAid/ProteinBid are
# the first two columns of the file).
regulome_network_graph = ig.Graph.DataFrame(
    regulome_network_edges,
    directed=True,
    use_vids=False,
)
regulome_network_graph_undirected = ig.Graph.DataFrame(
    regulome_network_edges,
    directed=False,
    use_vids=False,
)


In [33]:
# MSigDB gene-set dictionaries (collections C6, C2-CGP and C3-TFT), each read
# from its pickle under `path` via open_pickle, in that order.
msigdb_c6_dict, msigdb_c2_cgp_dict, msigdb_c3_tft_dict = (
    open_pickle(pickle_name)
    for pickle_name in (
        'msigdb_c6_all_dict.pkl',
        'msigdb_c2_cgp_dict.pkl',
        'msigdb_c3_tft_dict.pkl',
    )
)

In [1]:
# Investigate Leiden resolution
# `cov` is the population total handed to EASE (as PT=cov) inside set_enrichment;
# it is currently the vertex count of the directed regulome graph.
# TODO: confirm this is the right background size for the enrichment test.
# Requires the graph-loading cell to have been run first: the NameError saved in
# this cell's output is what happens when it is executed on a fresh kernel.
cov = regulome_network_graph.vcount()

def cluster_leiden(graph, res, b=0.01, n_iter=3):
    """Run Leiden community detection on ``graph`` with a modularity objective.

    Parameters
    ----------
    graph
        Object exposing ``community_leiden`` (an igraph ``Graph`` in this notebook).
    res
        Forwarded as ``resolution``.
    b
        Forwarded as ``beta``. Defaults to 0.01.
    n_iter
        Forwarded as ``n_iterations``. Defaults to 3.

    Returns
    -------
    Whatever ``graph.community_leiden`` returns.
    """
    # Author's observation from earlier runs: only few small clusters and 1 large one.
    leiden_options = {
        'objective_function': 'modularity',
        'weights': 'PPV',  # weights are selected by the name 'PPV'
        'resolution': res,
        'beta': b,
        'n_iterations': n_iter,
    }
    return graph.community_leiden(**leiden_options)

def cluster_louvain(graph, res):
    """Run multilevel (Louvain) community detection on ``graph``.

    Parameters
    ----------
    graph
        Object exposing ``community_multilevel`` (an igraph ``Graph`` in this notebook).
    res
        Forwarded as ``resolution``.

    Returns
    -------
    Whatever ``graph.community_multilevel`` returns; ``return_levels`` is passed
    as False.
    """
    louvain_options = {
        'weights': 'PPV',  # weights are selected by the name 'PPV'
        'return_levels': False,
        'resolution': res,
    }
    return graph.community_multilevel(**louvain_options)

def set_enrichment(res, beta, gene_set):
    """Cluster the undirected regulome graph and run EASE on every cluster.

    Parameters
    ----------
    res
        Leiden resolution, passed to ``cluster_leiden``.
    beta
        Leiden beta, passed to ``cluster_leiden`` as ``b``.
    gene_set
        Gene sets handed to ``EASE`` as ``genesets``.

    Returns
    -------
    tuple
        ``(res, beta, enrichment_dict, n_clusters)`` where ``enrichment_dict``
        maps the cluster index to ``(EASE result, cluster size, member names)``
        and ``n_clusters`` is ``len(enrichment_dict)``.

    Notes
    -----
    Reads the module-level ``regulome_network_graph_undirected`` and ``cov``.
    """
    print(f"Resolution: {res}, beta {beta}")
    partition = cluster_leiden(regulome_network_graph_undirected, res=res, b=beta)

    # A size filter (skip clusters with more than 2000 or fewer than 3 members)
    # was tried at this point and is currently disabled: every cluster is tested.
    member_names = (community.vs['name'] for community in partition.subgraphs())

    # Possible refinement noted by the author: store each entry as a namedtuple
    # holding the enrichment and the subgraph.
    enrichment_dict = {
        cluster_n: (EASE(query_set=names, genesets=gene_set, PT=cov), len(names), names)
        for cluster_n, names in enumerate(member_names)
    }
    return res, beta, enrichment_dict, len(enrichment_dict)

def collect_enrichment(enrichment):
    """Store one ``set_enrichment`` result in the global ``cluster_enrichment_dict``.

    Used as the success callback of ``pool.apply_async`` in ``cluster_EASE``.

    Parameters
    ----------
    enrichment
        Sequence whose first two items are the resolution and the beta value;
        everything from index 2 onwards is stored as the payload, i.e.
        ``cluster_enrichment_dict[resolution][beta] = enrichment[2:]``.
    """
    resolution_key = enrichment[0]
    beta_key = enrichment[1]
    # One inner dict per resolution, created on first use.
    results_for_resolution = cluster_enrichment_dict.setdefault(resolution_key, {})
    results_for_resolution[beta_key] = enrichment[2:]
    
def error_function(e=None):
    """Report a failed worker task; intended as a ``Pool.apply_async`` error callback.

    ``apply_async`` calls its ``error_callback`` with exactly one argument, the
    exception raised by the task. The previous zero-argument signature made that
    call fail with ``TypeError`` (and the body referenced an undefined ``e``),
    so the real error was never shown.

    Parameters
    ----------
    e : BaseException, optional
        The exception raised by the task. Defaults to ``None`` so that a call
        without arguments still works.
    """
    print(f"Error, could not compute {e}")


def cluster_EASE(gene_set, filename, beta_range=[0.001, 0.005, 0.01, 0.05, 0.1], res_range = range(2, 16, 1), num_processes=6):
    """Run ``set_enrichment`` for every (resolution, beta) pair and save the results.

    Each combination from ``res_range`` x ``beta_range`` is submitted to a
    process pool. Finished results are gathered by ``collect_enrichment`` into
    the global ``cluster_enrichment_dict``, which is then written to
    ``path + 'clustering_results/' + filename``.

    Parameters
    ----------
    gene_set
        Gene sets forwarded to ``set_enrichment``.
    filename : str
        Output file name inside ``path + 'clustering_results/'``.
    beta_range : iterable
        Beta values to sweep. The default list is never mutated.
    res_range : iterable
        Resolution values to sweep.
    num_processes : int
        Number of worker processes.

    Notes
    -----
    Saved structure: ``{resolution: {beta: (enrichment_dict, n_clusters)}}``,
    where ``enrichment_dict`` maps the cluster index to
    ``(EASE result, cluster size, member names)``.

    On ``KeyboardInterrupt`` the workers are terminated and the results
    collected so far are still written. If writing the file fails, the global
    ``cluster_enrichment_dict`` is left in place so the results can be recovered.
    """
    import os  # local import: only needed to prepare the output directory

    global cluster_enrichment_dict
    cluster_enrichment_dict = dict()

    # Create the output directory up front, so a missing or unwritable location
    # fails now instead of after the whole sweep has been computed.
    output_dir = path + 'clustering_results/'
    os.makedirs(output_dir, exist_ok=True)

    def _make_error_reporter(res, beta):
        # Build a one-argument callback that names the failing parameter pair.
        # It must not raise: an exception inside a pool callback kills the
        # pool's result-handler thread and join() would then never return.
        def _report(exc):
            print(f"Error, could not compute res={res}, beta={beta}: {exc!r}")
        return _report

    try:
        with multiprocessing.Pool(num_processes) as pool: # Have a look at joblib, dask
            for res, beta in it.product(res_range, beta_range):
                pool.apply_async(func=set_enrichment,
                                 args=(res, beta, gene_set),
                                 callback=collect_enrichment,
                                 error_callback=_make_error_reporter(res, beta))
            # close() and join() must run INSIDE the with-block. Leaving the
            # block calls pool.terminate(), which would kill the queued jobs
            # before they had a chance to run.
            pool.close()
            pool.join()
    except KeyboardInterrupt:
        # Leaving the with-block has already terminated the workers.
        print("Interrupted: workers terminated, saving the results collected so far")

    # NOTE(review): compression='infer' is applied to an open file object --
    # confirm compress_pickle resolves the codec as intended in that case.
    with open(output_dir + filename, 'wb') as result_pkl:
        dump(cluster_enrichment_dict, result_pkl, compression='infer')

    # Only reached after a successful write; see Notes.
    del cluster_enrichment_dict

NameError: name 'regulome_network_graph' is not defined

In [None]:
# Run the clustering in this cell

# to_cluster = {
    # 'leiden_2_15_0001_05_msigdb_c2_cgp.gz': msigdb_c2_cgp_dict,
    # 'leiden_2_15_0001_05_msigdb_c3_tft.gz': msigdb_c3_tft_dict,
    # 'leiden_2_15_0001_05_msigdb_c6_all.gz': msigdb_c6_dict
    # }
# 
# for file, gene_set in to_cluster.items():
    # print(file)
    # cluster_EASE(gene_set=gene_set, filename=file)