Variables to be updated/configured:

In [None]:
# Network selector:
#   True  -> WES multiplex network
#   False -> larger epilepsy-autism multiplex network
WES = True

# Directory containing information on the communities of the selected network.
COMS_DIR = "./../communities_wes" if WES else "./../communities"

# Remaining input locations (shared by both network variants).
GRAPH_DIR = "./../gexf_files"              # where the .gexf files are located
GENE_SETS_DIR = "./../gene_sets"           # .csv files with gene sets
PHENOTYPES_DIR = "./../phenotypes"         # .csv files with epilepsy and autism phenotypes
NET_GEN_DIR = "./../network_generation"    # network generation directory


# Setup

In [None]:
# network packages
import networkx as nx
from networkx.readwrite.gexf import read_gexf

# other packages
import os, random
import numpy as np
import pandas as pd
from scipy.stats import hypergeom

In [None]:
# File names of the phenotype layer, the PPI layer and the union graph
# for the network variant selected by WES.
gene_phenotype_filename, gene_ppi_filename, gene_union_filename = (
    ('gene-phenotype-wes-1-500-update.gexf', "gene-ppi-wes-700-update.gexf", 'gene-union-wes.gexf')
    if WES else
    ('gene-phenotype-1-1000-update.gexf', 'gene-ppi-700-update.gexf', 'gene-union.gexf')
)

# Read the three graphs from GRAPH_DIR, in the same order as the file names above.
gene_phenotype, gene_ppi, gene_union = (
    read_gexf(os.path.join(GRAPH_DIR, gexf_name))
    for gexf_name in (gene_phenotype_filename, gene_ppi_filename, gene_union_filename)
)

In [None]:
class Coms:
    """Wrapper around a list of communities (each community is a list of nodes)."""

    def __init__(self, communities):
        self.overlap = None
        self.communities = communities


def get_coms_from_graph(G):
    """Build a ``Coms`` object from the 'module' attribute of every node in ``G``.

    Module numbers are treated as 1-based: module ``k`` is stored at position
    ``k - 1`` of the returned partition. The partition has one (possibly empty)
    list per module number up to the largest one found in the graph.
    """
    module_of = {node: G.nodes[node]['module'] for node in G.nodes}
    partition = [[] for _ in range(max(module_of.values()))]
    for node, module in module_of.items():
        partition[module - 1].append(node)
    return Coms(partition)

In [None]:
# Communities of the PPI layer, the phenotype layer and the multiplex (union) graph.
coms_ppi, coms_phenotype, coms_multiplex = map(
    get_coms_from_graph, (gene_ppi, gene_phenotype, gene_union)
)

In [None]:
# standardize gene names
def update_genes(df, gene_col):
    """Return a copy of ``df`` with the gene names in ``gene_col`` standardized.

    Each row whose gene appears in the replacement table is rewritten with the
    mapped name. When the table maps a name to a list (gene clusters such as
    "HOXD"), the row is emitted once per member of the list, so the result can
    have more rows than the input. Rows keep their original index label, which
    means expanded rows share a label. A "Replaced ..." line is printed for
    every substitution.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.
    gene_col : str
        Name of the column holding the gene names.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns as ``df`` (also when ``df`` is empty).
    """
    # Built once per call rather than once per row.
    replacements = {
        "ND1": "MT-ND1",
        "ND4": "MT-ND4",
        "TRNR1": "GFRA1",
        "CCM1": "KRIT1",
        "C19orf61": "SMG9",
        "EIF2C4": "AGO4",
        "HOXD": ["HOXD1", "HOXD3", "HOXD4", "HOXD8", "HOXD9", "HOXD10", "HOXD11", "HOXD12", "HOXD13"],
        "ATP6": "MT-ATP6",
        "APOE4": "APOE",
        "ENSG00000173575": "CHD2",
        "SCA2": "ATXN2",
        "B3GNT1": "B4GAT1",
        "COX3": "MT-CO3",
        "ENSG00000086848": "ALG9",
        "ATP8": "MT-ATP8",
        "ND5": "MT-ND5",
        "C2orf25": "MMADHC",
        "PIG6": "PRODH",
        "ENSG00000258947": "TUBB3",
        "ADCK3": "COQ8A",
        "COX1": "MT-CO1",
        "DXS423E": "SMC1A",
        "PCDHG": ["PCDHGA1", "PCDHGA2", "PCDHGA3", "PCDHGA4", "PCDHGA5", "PCDHGA6", "PCDHGA7", "PCDHGA8", "PCDHGA9", "PCDHGA10", "PCDHGA11", "PCDHGA12",
                  "PCDHGB1", "PCDHGB2", "PCDHGB3", "PCDHGB4", "PCDHGB5", "PCDHGB6", "PCDHGB7", "PCDHGC3", "PCDHGC4", "PCDHGC5"],
        "KIAA0226": "RUBCN",
        "CYTB": "MT-CYB",
        "KIAA0442": "AUTS2",
        "KAL1": "ANOS1",
        "BRP44L": "MPC1",
        "KIAA1715": "LNPK",
        "JMJD2C": "KDM4C",
        "CCDC64": "BICDL1",
        "KIAA2022": "NEXMIF",
        "INADL": "PATJ",
        "PARK2": "PRKN",
        "NDNL2": "NSMCE3",
        "BZRAP1": "TSPOAP1",
        "ERBB2IP": "ERBIN",
        "HIST1H2BJ": "H2BC11",
        "ADSS": "ADSS2",
        "C15orf43": "TERB2",
        "C16orf13": "METTL26",
        "C11orf30": "EMSY",
        "SUV420H1": "KMT5B",
        "MKL2": "MRTFB",
        "ENSG00000259159": "MFRP",
        "MARCA2": "SMARCA2",
        "C11orf82": "DDIAS",
        "CSNK2B-LY6G5B-1181": "CSNK2B",
        'TCAF1': 'FAM115A',
        'KCNMB2': 'ENSG00000275163',
        'KIAA1009': 'CEP162',
        'AGMO(alsoknownasTMEM195)': 'AGMO',
        'PPIEL': 'PPIEL',
        'GGTA1P': 'GGTA1',
        'KIAA1239': 'NWD2',
        'LINC01370': 'LINC01370',
        'PCDHA@': ['PCDHA10', 'PCDHA9', 'PCDHA5', 'PCDHA11', 'PCDHA7', 'PCDHA3', 'PCDHA8', 'PCDHA2', 'PCDHA1', 'PCDHA13', 'PCDHA4', 'PCDHA6', 'PCDHA12'],
        'MsrA': 'MSRA',
        'DGCR6': 'ENSG00000183628',
        'ZNF259': 'ZPR1',
        'ADGRA2': 'GPR124',
        'KIAA1430': 'CFAP97',
        'RNASE4': 'ENSG00000258818',
        'C14orf166B': 'LRRC74A',
        "RP11-1055B8.7": "BAHCC1",
        "ENSG00000272414": "FAM47E-STBD1",
        "C5orf20": "DCANP1",
        "SOGA2": "MTCL1",
        "FAM194A": "ERICH6"
    }

    # Collect the output rows and build the frame once at the end:
    # DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
    rows = []
    for _, row in df.iterrows():
        gene = row[gene_col]
        replacement = replacements.get(gene)
        if replacement is None:
            rows.append(row)
            continue

        new_names = [replacement] if isinstance(replacement, str) else replacement
        for new_name in new_names:
            print("Replaced", gene, "with", new_name)
            # Copy so that neither the caller's frame nor a previously
            # collected row is altered by the assignment.
            new_row = row.copy()
            new_row[gene_col] = new_name
            rows.append(new_row)

    return pd.DataFrame(rows, columns=df.columns)

In [None]:
# Protein info table read from NET_GEN_DIR, with its 'preferred_name' column standardized.
info_df = update_genes(
    pd.read_csv(os.path.join(NET_GEN_DIR, "9606.protein.info.v11.0.txt"), sep='\t'),
    "preferred_name",
)
PPI_all_genes = set(info_df['preferred_name'])

# Import gene lists

In [None]:
# Epilepsy gene list with standardized names in the 'gene' column.
epilepsy_genes_df = pd.read_csv(
    os.path.join(GENE_SETS_DIR, "epilepsy_genes_wang_2017_formatted.csv")
).pipe(update_genes, 'gene')
epilepsy_genes = set(epilepsy_genes_df['gene'])

In [None]:
# SFARI autism gene list with standardized names in the 'gene-symbol' column.
autism_genes_df = pd.read_csv(
    os.path.join(GENE_SETS_DIR, "SFARI-Gene_genes_01-03-2020release_01-05-2020export.csv")
).pipe(update_genes, 'gene-symbol')
autism_genes = set(autism_genes_df['gene-symbol'])

In [None]:
# Autism WES gene list: only the first 102 data rows of the file are read.
autism_wes_df = pd.read_csv(
    os.path.join(GENE_SETS_DIR, "WES_autism_Satterstrom_2020.csv"),
    nrows=102,
).pipe(update_genes, "gene")
autism_wes_genes = set(autism_wes_df['gene'])

In [None]:
# Epilepsy WES gene list: the first 2 lines of the file are skipped, then 200 rows are read.
epilepsy_wes_df = pd.read_csv(
    os.path.join(GENE_SETS_DIR, "WES_EPI_gene_burden_AC_1_Epi25_Collaborative_2019.csv"),
    nrows=200,
    skiprows=2,
).pipe(update_genes, 'Gene')
epilepsy_wes_genes = set(epilepsy_wes_df['Gene'])

In [None]:
# Epilepsy genes split by the value of their 'score' column (1 to 4).
e1, e2, e3, e4 = (
    set(epilepsy_genes_df.loc[epilepsy_genes_df['score'] == score, 'gene'])
    for score in (1, 2, 3, 4)
)

# Autism genes split by 'gene-score' (1 to 3), plus those flagged syndromic.
a1, a2, a3 = (
    set(autism_genes_df.loc[autism_genes_df['gene-score'] == score, 'gene-symbol'])
    for score in (1, 2, 3)
)
a_s = set(autism_genes_df.loc[autism_genes_df['syndromic'] == 1, 'gene-symbol'])

# Genes shared between the epilepsy and autism lists.
common_genes = e1 & a1
common_all_genes = epilepsy_genes & autism_genes
common_wes_genes = autism_wes_genes & epilepsy_wes_genes

In [None]:
# Each row of disease_genes.csv has a "genes" cell holding a comma-separated gene list.
# Rows are addressed by position below: 0-1 schizophrenia, 2 bipolar disorder, 3 intellectual disability.
disease_genes_df = pd.read_csv(os.path.join(GENE_SETS_DIR, "disease_genes.csv"))

# get schizophrenia genes
# NOTE(review): unlike the bipolar/ID lists below, this list is intersected with
# PPI_all_genes BEFORE update_genes runs, so a symbol that is only present in
# PPI_all_genes under its standardized name is dropped before it can be renamed.
# Confirm that this is intended.
schiz_genes = set()
for row in disease_genes_df.iloc[0:2]["genes"]:
    genes = set(row.split(","))
    schiz_genes = schiz_genes.union(genes.intersection(PPI_all_genes))
    
schiz_df = pd.DataFrame({"gene": list(schiz_genes)})
schiz_df = update_genes(schiz_df, "gene")
schiz_genes = set(schiz_df["gene"])
# only consider genes in the STRING database since there are many pseudogenes/RNA genes
schiz_genes = schiz_genes.intersection(PPI_all_genes) 

# get bipolar disorder genes
# Names are standardized first, then restricted to PPI_all_genes.
row = disease_genes_df.iloc[2]["genes"]
bp_df = pd.DataFrame({"gene": row.split(",")})
bp_df = update_genes(bp_df, "gene")
bipolar_genes = set(bp_df["gene"])
bipolar_genes = bipolar_genes.intersection(PPI_all_genes)

# get intellectual disability genes
# Same processing as the bipolar disorder list.
row = disease_genes_df.iloc[3]["genes"]
id_df = pd.DataFrame({"gene": row.split(",")})
id_df = update_genes(id_df, "gene")
id_genes = set(id_df["gene"])
id_genes = id_genes.intersection(PPI_all_genes)

In [None]:
# Differentially expressed genes. The two input files differ in how the
# unnamed first column is handled on read.
if not WES:
    degs_df = pd.read_csv(os.path.join(GENE_SETS_DIR, "BE_genes.csv"), index_col=0)
else:
    degs_df = pd.read_csv(
        os.path.join(GENE_SETS_DIR, "BE_genes_WES.csv")
    ).rename(columns={'Unnamed: 0': 'Row.names'})

# Keep only positive differential expression.
degs_df = degs_df.loc[degs_df["logFC"] > 0]
degs = set(degs_df["Row.names"])

In [None]:
# Gene table of the selected network (symbols and Entrez ids).
genes_df = pd.read_csv(
    os.path.join(NET_GEN_DIR, 'genes_wes.txt' if WES else 'genes.txt')
)

gene_symbols = set(genes_df['gene_symbol'])
gene_ids = set(genes_df['entrez_id'])

In [None]:
# get phenotype associations
# NOTE(review): this file is read from the current working directory rather than
# from one of the configured *_DIR locations — confirm where it is expected to live.
phenotype_to_gene_df = pd.read_csv('phenotype_to_genes.txt', sep='\t', skiprows=1, header=None) # gene-phenotype associations from JAX
phenotype_to_gene_df.columns = ['phenotype', 'name', 'entrezid', 'gene', 'additional_info', 'source', 'disease_id']

# Temporary restriction to genes of the network (by Entrez id); only used to collect
# phenotypes_in_network. gene_phenotype_df is reassigned at the end of this cell.
gene_phenotype_df = phenotype_to_gene_df[phenotype_to_gene_df['entrezid'].isin(gene_ids)]
phenotypes_in_network = set(gene_phenotype_df['phenotype']) # aggregation of all phenotypes associated with genes in the network

epilepsy_phenotypes_df = pd.read_csv(os.path.join(PHENOTYPES_DIR, "epilepsy_phenotypes.csv")) # epilepsy phenotypes (HPO subtree with root Seizure HP:0001250)
autism_phenotypes_df = pd.read_csv(os.path.join(PHENOTYPES_DIR, "autism_phenotypes.csv")) # autism phenotypes (HPO subtree with root Autistic behavior HP:0000729)
epilepsy_phenotypes = set(epilepsy_phenotypes_df['HPO'])
autism_phenotypes = set(autism_phenotypes_df['HPO'])
all_phenotypes = phenotypes_in_network.union(autism_phenotypes).union(epilepsy_phenotypes) # phenotypes in network + epilepsy and autism phenotypes
ea_phenotypes = sorted(list(epilepsy_phenotypes)) + sorted(list(autism_phenotypes)) # epilepsy ids first, then autism ids, each group sorted

# Final table: associations for every phenotype of interest, for ALL genes
# (no longer restricted to genes of the network).
gene_phenotype_df = phenotype_to_gene_df[phenotype_to_gene_df['phenotype'].isin(all_phenotypes)]

# Enrichment tests

In [None]:
def save_coms_to_csv(coms, filename):
    """Write the module membership of every gene to a CSV file.

    Parameters
    ----------
    coms : iterable of iterables
        Communities; the k-th community (1-based) is written as module ``k``.
    filename : str
        Destination path. The file has the columns ``module`` and ``gene``
        and no index column.
    """
    # Build all rows first: DataFrame.append was removed in pandas 2.0 and
    # growing a frame row by row is quadratic.
    records = [
        (mod_num, gene)
        for mod_num, com in enumerate(coms, 1)
        for gene in com
    ]
    # Passing the column names here also keeps empty input from failing.
    pd.DataFrame(records, columns=['module', 'gene']).to_csv(filename, index=False)

In [None]:
# Persist the module membership of each layer and of the multiplex network.
save_coms_to_csv(
    coms=coms_phenotype.communities,
    filename=os.path.join(COMS_DIR, 'coms_phenotype.csv'),
)
save_coms_to_csv(
    coms=coms_ppi.communities,
    filename=os.path.join(COMS_DIR, 'coms_ppi.csv'),
)
save_coms_to_csv(
    coms=coms_multiplex.communities,
    filename=os.path.join(COMS_DIR, 'coms_multiplex.csv'),
)

In [None]:
# test enrichment in phenotypes using Jax gene-HPO associations
def test_phenotype_enrichment(coms, phenotype, label, all_genes=False):
    hpo_enrichments = []
    
    if all_genes:
        M = len(PPI_all_genes) # population size/total number of genes
    else:
        M = sum([len(com) for com in coms]) # population size/total number of genes
        
    hpo_genes = set(gene_phenotype_df[gene_phenotype_df['phenotype'].isin(phenotype)]['gene'])
    n = len(hpo_genes) # number of genes total with that hpo

    for mod_num, com in enumerate(coms, 1):
        x = len(hpo_genes.intersection(com)) # number of genes in com with give hpo association
        N = len(com) # sample size
        
        pval = hypergeom.sf(x-1, M, n, N) # enrichment hypergeometric test
        hpo_enrichments.append([mod_num, label, pval])
                
    return hpo_enrichments

In [None]:
# test enrichment in certain types of genes
def test_gene_enrichment(coms, genes, label, all_genes=False):
    hpo_enrichments = []
    
    if all_genes:
        M = len(PPI_all_genes) # population size/total number of genes
        n = len(genes) # number of genes total of given type
    else:
        M = sum([len(com) for com in coms]) # population size/total number of genes
        match_genes = set([item for sublist in coms for item in sublist])
        n = len(match_genes.intersection(set(genes))) # number of genes total of given type
    
    for mod_num, com in enumerate(coms, 1):
        x = len(genes.intersection(com)) # number of genes in com of given type
        N = len(com) # sample size

        pval = hypergeom.sf(x-1, M, n, N) # enrichment hypergeometric test
        hpo_enrichments.append([mod_num, label, pval])
                
    return hpo_enrichments

In [None]:
# test for enrichment
def enrichment_test(coms, all_genes=False, hpo_list=None):
    """Run every phenotype and gene-set enrichment test on ``coms``.

    Parameters
    ----------
    coms : list of lists
        Communities (lists of gene names).
    all_genes : bool, default False
        Forwarded to the individual tests to select the background.
    hpo_list : list of str, optional
        HPO ids to test one at a time with ``test_phenotype_enrichment``.
        Skipped when empty or None.

    Returns
    -------
    pandas.DataFrame
        Columns ``module``, ``label``, ``pval``, sorted by those columns.
    """
    all_enrichments = []

    if hpo_list:
        n = len(hpo_list)
        for idx, hpo in enumerate(hpo_list):
            if n > 1000 and idx % 1000 == 0:
                print(f"Calculating Jax HPO enrichment: {idx}/{n} HPO IDs")
            all_enrichments.extend(test_phenotype_enrichment(coms, [hpo], hpo, all_genes=all_genes))

    # (gene set, label) pairs. "autism_genes" used to be lost because the
    # autism results were combined with a chained `=` instead of `+`.
    gene_sets = [
        # epilepsy
        (epilepsy_genes, "epilepsy_genes"),
        (e1, "e1_genes"),
        (e2, "e2_genes"),
        (e3, "e3_genes"),
        (e4, "e4_genes"),
        (epilepsy_wes_genes, "epilepsy_WES_genes"),
        # autism
        (autism_genes, "autism_genes"),
        (a1, "a1_genes"),
        (a2, "a2_genes"),
        (a3, "a3_genes"),
        (a_s, "as_genes"),
        (autism_wes_genes, "autism_WES_genes"),
        # shared between epilepsy and autism
        (common_genes, "common_genes"),
        (common_all_genes, "common_genes_all"),
        (common_wes_genes, "common_wes_genes"),
        # other gene sets
        (degs, "DE_genes"),
        (schiz_genes, "schizophrenia_genes"),
        (bipolar_genes, "BD_genes"),
        (id_genes, "ID_genes"),
    ]
    for genes, label in gene_sets:
        all_enrichments.extend(test_gene_enrichment(coms, genes, label, all_genes))

    # Naming the columns in the constructor also covers an empty result list.
    df = pd.DataFrame(all_enrichments, columns=["module", "label", "pval"])
    df = df.sort_values(by=["module", "label", "pval"])

    return df

### Enrichment tests for phenotype network layer modules

In [None]:
# Phenotype-layer modules, network genes as background.
# The left merge attaches the HPO term name to rows whose label is an HPO id.
coms_phenotype_enrichment_df = (
    enrichment_test(coms_phenotype.communities, all_genes=False, hpo_list=ea_phenotypes)
    .merge(
        gene_phenotype_df[['phenotype', 'name']].drop_duplicates(),
        left_on="label",
        right_on="phenotype",
        how="left",
    )
    .drop('phenotype', axis=1)
    .rename(columns={'name': 'phenotype_name'})
)
coms_phenotype_enrichment_df.to_csv(os.path.join(COMS_DIR, 'coms_phenotype_enrichment.csv'), index=False)

In [None]:
# Phenotype-layer modules, all PPI genes as background.
# The left merge attaches the HPO term name to rows whose label is an HPO id.
coms_phenotype_enrichment_df = (
    enrichment_test(coms_phenotype.communities, all_genes=True, hpo_list=ea_phenotypes)
    .merge(
        gene_phenotype_df[['phenotype', 'name']].drop_duplicates(),
        left_on="label",
        right_on="phenotype",
        how="left",
    )
    .drop('phenotype', axis=1)
    .rename(columns={'name': 'phenotype_name'})
)
coms_phenotype_enrichment_df.to_csv(os.path.join(COMS_DIR, 'coms_phenotype_enrichment_all_genes.csv'), index=False)

### Enrichment tests for PPI network layer modules

In [None]:
# PPI-layer modules, network genes as background.
# The left merge attaches the HPO term name to rows whose label is an HPO id.
coms_ppi_enrichment_df = (
    enrichment_test(coms_ppi.communities, all_genes=False, hpo_list=ea_phenotypes)
    .merge(
        gene_phenotype_df[['phenotype', 'name']].drop_duplicates(),
        left_on="label",
        right_on="phenotype",
        how="left",
    )
    .drop('phenotype', axis=1)
    .rename(columns={'name': 'phenotype_name'})
)
coms_ppi_enrichment_df.to_csv(os.path.join(COMS_DIR, 'coms_ppi_enrichment.csv'), index=False)

In [None]:
# PPI-layer modules, all PPI genes as background.
# The left merge attaches the HPO term name to rows whose label is an HPO id.
coms_ppi_enrichment_df = (
    enrichment_test(coms_ppi.communities, all_genes=True, hpo_list=ea_phenotypes)
    .merge(
        gene_phenotype_df[['phenotype', 'name']].drop_duplicates(),
        left_on="label",
        right_on="phenotype",
        how="left",
    )
    .drop('phenotype', axis=1)
    .rename(columns={'name': 'phenotype_name'})
)
coms_ppi_enrichment_df.to_csv(os.path.join(COMS_DIR, 'coms_ppi_enrichment_all_genes.csv'), index=False)

### Enrichment tests for multiplex network modules

In [None]:
# Multiplex modules, network genes as background.
# The left merge attaches the HPO term name to rows whose label is an HPO id.
coms_multiplex_enrichment_df = (
    enrichment_test(coms_multiplex.communities, all_genes=False, hpo_list=ea_phenotypes)
    .merge(
        gene_phenotype_df[['phenotype', 'name']].drop_duplicates(),
        left_on="label",
        right_on="phenotype",
        how="left",
    )
    .drop('phenotype', axis=1)
    .rename(columns={'name': 'phenotype_name'})
)
coms_multiplex_enrichment_df.to_csv(os.path.join(COMS_DIR, 'coms_multiplex_enrichment.csv'), index=False)

In [None]:
# Multiplex modules, all PPI genes as background.
# The left merge attaches the HPO term name to rows whose label is an HPO id.
coms_multiplex_enrichment_df = (
    enrichment_test(coms_multiplex.communities, all_genes=True, hpo_list=ea_phenotypes)
    .merge(
        gene_phenotype_df[['phenotype', 'name']].drop_duplicates(),
        left_on="label",
        right_on="phenotype",
        how="left",
    )
    .drop('phenotype', axis=1)
    .rename(columns={'name': 'phenotype_name'})
)
coms_multiplex_enrichment_df.to_csv(os.path.join(COMS_DIR, 'coms_multiplex_enrichment_all_genes.csv'), index=False)

In [None]:
# test enrichment for all HPO using background of all genes
# (every HPO id associated with a gene of the network, not only the epilepsy/autism ones)
coms_multiplex_enrichment_df = (
    enrichment_test(coms_multiplex.communities, all_genes=True, hpo_list=list(phenotypes_in_network))
    .merge(
        gene_phenotype_df[['phenotype', 'name']].drop_duplicates(),
        left_on="label",
        right_on="phenotype",
        how="left",
    )
    .drop('phenotype', axis=1)
    .rename(columns={'name': 'phenotype_name'})
)
coms_multiplex_enrichment_df.to_csv(os.path.join(COMS_DIR, 'coms_multiplex_enrichment_all_hpo_all_genes.csv'), index=False)

### Experimental HPO enrichment

In [None]:
def format_gene_phenotype_asscoiations_matrix(df):
    """Turn a long gene/HPO score table into a gene x HPO score matrix.

    Steps, in order:
      1. drop rows whose ``HPO`` value is in the module-level ``ignore`` set
         (the comparison uses the ids as they appear in the input, i.e. before
         the underscore replacement of step 2);
      2. replace "_" with ":" in the HPO ids;
      3. pivot ``final_score`` to one row per ``Gene`` and one column per
         ``HPO`` (``pandas.pivot_table`` default aggregation), filling missing
         combinations with 0;
      4. keep only the columns listed in the module-level ``ea_phenotypes``.

    The input frame is not modified.
    """
    kept = df.loc[~df['HPO'].isin(ignore), ["Gene", "HPO", "final_score"]]
    # assign() returns a new frame; writing the column onto the filtered slice
    # raised SettingWithCopyWarning.
    kept = kept.assign(HPO=kept["HPO"].str.replace("_", ":"))
    matrix = pd.pivot_table(kept, values='final_score', index=['Gene'], columns=['HPO'])
    matrix = matrix.fillna(0)
    return matrix.loc[:, matrix.columns.isin(ea_phenotypes)]

In [None]:
# HPO associations for each gene in the network
# NOTE: statement order matters in this cell — `ignore` and `all_genes` must be
# defined before the two frames are reformatted at the bottom.

# all genes in the network as enrichment background
if WES:
    hpo_associations = pd.read_csv(os.path.join(NET_GEN_DIR, 'hpo_association_scores_wes.csv'))
else:
    hpo_associations = pd.read_csv(os.path.join(NET_GEN_DIR, 'hpo_association_scores.csv'))
    
# all genes in Phen2Gene knowledge base as enrichment background
hpo_associations_all = pd.read_csv(os.path.join(NET_GEN_DIR, 'hpo_association_scores_all.csv'))

# HPO ids excluded by format_gene_phenotype_asscoiations_matrix (it reads `ignore` as a global)
ignore_df = pd.read_csv(os.path.join(PHENOTYPES_DIR, 'ignore_phenotypes.csv'))
ignore = set(ignore_df['phenotype'])

# Unique genes of the still unformatted table; converted to a list because
# random.sample is called on it later.
all_genes = set(hpo_associations_all["Gene"])
all_genes = list(all_genes)

# From here on both names hold gene x HPO score matrices instead of the raw long tables.
hpo_associations = format_gene_phenotype_asscoiations_matrix(hpo_associations)
hpo_associations_all = format_gene_phenotype_asscoiations_matrix(hpo_associations_all)
network_nodes = list(gene_ppi.nodes) # PPI network, phenotype network, and multiplex network have same nodes

In [None]:
def calculate_phenotype_enrichment(coms, output_path, all_genes_background=False, num_trials=1000, seed=None):
    """Empirical (random sampling) HPO enrichment of each community.

    For every community the mean association score per HPO term is compared
    with the mean score of ``num_trials`` random gene samples of the same
    size. The reported value is the fraction of trials in which the random
    sample scored strictly higher than the community.

    Parameters
    ----------
    coms : list of lists
        Communities (lists of gene names).
    output_path : str
        CSV file to write; columns ``HPO``, ``module`` (1-based), ``p_val``,
        plus the default pandas index column.
    all_genes_background : bool, default False
        If True, scores come from ``hpo_associations_all`` and samples are
        drawn from ``all_genes``; otherwise ``hpo_associations`` and
        ``network_nodes`` are used.
    num_trials : int, default 1000
        Number of random samples per community.
    seed : int, optional
        If given, sampling uses a private ``random.Random(seed)`` so the
        result is reproducible; by default the global ``random`` state is
        used, as before.

    Notes
    -----
    The loop stops at the first community smaller than the size threshold
    (5 when ``WES`` is set, 20 otherwise). NOTE(review): this presumably
    relies on communities being ordered by decreasing size — confirm. The
    modules that are not evaluated are still written, with ``p_val`` 0.
    """
    # Select score matrix and sampling pool once; they do not change per trial.
    if all_genes_background:
        scores, pool = hpo_associations_all, all_genes
    else:
        scores, pool = hpo_associations, network_nodes
    min_size = 5 if WES else 20
    rng = random if seed is None else random.Random(seed)

    # Size and label the result from the matrix actually used, so that rows
    # always line up with the HPO columns being compared.
    hpo_ids = list(scores.columns)
    result = np.zeros((len(hpo_ids), len(coms)))

    for mod_num, com in enumerate(coms):
        com_size = len(com)
        if com_size < min_size:
            break
        print('Module', mod_num + 1)

        hpo_actual = scores[scores.index.isin(com)].mean(axis=0).values

        for i in range(num_trials):
            if i % 100 == 0:
                print(f"Trial {i}/{num_trials}")
            sample = rng.sample(pool, com_size)
            hpo_sample = scores[scores.index.isin(sample)].mean(axis=0).values
            # count the trials in which the random sample beats the community
            result[:, mod_num] += np.less(hpo_actual, hpo_sample)

    result_df = pd.DataFrame(result, index=hpo_ids, columns=list(np.arange(1, len(coms) + 1)))
    result_df = pd.melt(result_df.reset_index(), id_vars=['index'], value_vars=list(result_df.columns))
    result_df.columns = ["HPO", "module", "p_val"]
    # Divide by the number of trials actually run (was hard-coded to 1000).
    result_df["p_val"] = result_df["p_val"] / num_trials
    result_df.to_csv(output_path)

In [None]:
# Multiplex modules; random samples drawn from the genes of the network.
calculate_phenotype_enrichment(
    coms_multiplex.communities,
    os.path.join(COMS_DIR, 'coms_multiplex_phenotype_enrichment.csv'),
    all_genes_background=False,
)

In [None]:
# Multiplex modules; random samples drawn from all genes of the Phen2Gene table.
calculate_phenotype_enrichment(
    coms_multiplex.communities,
    os.path.join(COMS_DIR, 'coms_multiplex_phenotype_enrichment_all.csv'),
    all_genes_background=True,
)


In [None]:
# PPI-layer modules; random samples drawn from the genes of the network.
calculate_phenotype_enrichment(
    coms_ppi.communities,
    os.path.join(COMS_DIR, 'coms_ppi_phenotype_enrichment.csv'),
    all_genes_background=False,
)


In [None]:
# PPI-layer modules; random samples drawn from all genes of the Phen2Gene table.
calculate_phenotype_enrichment(
    coms_ppi.communities,
    os.path.join(COMS_DIR, 'coms_ppi_phenotype_enrichment_all.csv'),
    all_genes_background=True,
)


In [None]:
# Phenotype-layer modules; random samples drawn from the genes of the network.
calculate_phenotype_enrichment(
    coms_phenotype.communities,
    os.path.join(COMS_DIR, 'coms_phenotype_phenotype_enrichment.csv'),
    all_genes_background=False,
)


In [None]:
# Phenotype-layer modules; random samples drawn from all genes of the Phen2Gene table.
calculate_phenotype_enrichment(
    coms_phenotype.communities,
    os.path.join(COMS_DIR, 'coms_phenotype_phenotype_enrichment_all.csv'),
    all_genes_background=True,
)
