Variables to be updated/configured:

In [None]:
# Set to True when running for the WES multiplex network; leave False when
# running for the larger epilepsy-autism multiplex network.
WES = False

# FIGURES_DIR: directory where figures will be saved (created if it doesn't exist).
# COMS_DIR: directory containing information on the communities in the network.
FIGURES_DIR = "figures_wes" if WES else "figures"
COMS_DIR = "communities_wes" if WES else "communities"

# Directories that do not depend on the WES switch.
GRAPH_DIR = "gexf_files"  # where the .gexf files are located
PHENOTYPES_DIR = "phenotypes"  # .csv files with epilepsy and autism phenotypes
TABLES_DIR = "tables"  # .csv files representing tables (created if it doesn't exist)

In [None]:
# network packages
import networkx as nx
from networkx.readwrite.gexf import read_gexf

# visualization packages
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# other packages
from math import floor, log10
import os
import numpy as np
import pandas as pd
from statsmodels.stats.multitest import fdrcorrection

# packages for calculating linkage
import scipy
from itertools import combinations
from scipy.cluster.hierarchy import linkage

In [None]:
# Default font settings handed to matplotlib's global rc configuration.
font = dict(size=14)
matplotlib.rc('font', **font)

In [None]:
# Create the output directories up front. exist_ok=True makes the call a no-op
# when the directory is already there, without the check-then-create race of
# os.path.exists() followed by os.makedirs(). If the path exists but is not a
# directory this now fails here instead of later at the first write.
os.makedirs(TABLES_DIR, exist_ok=True)
os.makedirs(FIGURES_DIR, exist_ok=True)

# Setup

In [None]:
# Choose the three .gexf file names that belong to the selected data set.
if WES:
    gene_phenotype_filename, gene_ppi_filename, gene_union_filename = (
        'gene-phenotype-wes-1-500-update.gexf',
        "gene-ppi-wes-700-update.gexf",
        'gene-union-wes.gexf',
    )
else:
    gene_phenotype_filename, gene_ppi_filename, gene_union_filename = (
        'gene-phenotype-1-1000-update.gexf',
        'gene-ppi-700-update.gexf',
        'gene-union.gexf',
    )

# Read the graphs from GRAPH_DIR, in the same order as the file names above.
gene_phenotype, gene_ppi, gene_union = (
    read_gexf(os.path.join(GRAPH_DIR, gexf_name))
    for gexf_name in (gene_phenotype_filename, gene_ppi_filename, gene_union_filename)
)


In [None]:
# wrapper for communities
class Coms:
    """Minimal container for a partition of nodes into communities."""

    def __init__(self, communities):
        # Never filled in by the constructor; starts out as None.
        self.overlap = None
        # Stored exactly as passed in (no copy is made).
        self.communities = communities
        
# get Coms class with genes from annotated networkx graph
def get_coms_from_graph(G):
    """Group the nodes of ``G`` by their ``'module'`` node attribute.

    Module numbers are used as 1-based positions: a node with module ``m`` is
    placed in ``communities[m - 1]``. The result has one list per number from
    1 up to the largest module found; numbers with no node give an empty list.

    Parameters
    ----------
    G : graph
        Graph whose nodes each carry an integer ``'module'`` attribute
        (read as ``G.nodes[node]['module']``).

    Returns
    -------
    Coms
        Wrapper around the list of communities, nodes in graph iteration order.

    Raises
    ------
    ValueError
        If ``G`` has no nodes (``max`` of an empty sequence).
    KeyError
        If a node has no ``'module'`` attribute.
    """
    max_module = max(G.nodes[node]['module'] for node in G.nodes)
    partition = [[] for _ in range(max_module)]
    for node in G.nodes:
        # Append in place; rebuilding the list with `+ [node]` for every node
        # copied the whole community each time.
        partition[G.nodes[node]['module'] - 1].append(node)
    return Coms(partition)

# takes partition with IDs and converts to Coms class with genes
def partition_to_genes(partition):
    """Translate every ID of every community through ``id_to_gene``.

    Parameters
    ----------
    partition : iterable of iterables
        Communities given as collections of IDs.

    Returns
    -------
    Coms
        Wrapper around a new list of lists holding ``id_to_gene[id]`` for each
        ID, in the original order.
    """
    # NOTE(review): `id_to_gene` is a global that is not defined in the part of
    # the notebook visible here — confirm it exists before calling this.
    return Coms([[id_to_gene[member] for member in community] for community in partition])

In [None]:
# Communities of each loaded graph; gene_union provides the multiplex communities.
coms_ppi, coms_phenotype, coms_multiplex = (
    get_coms_from_graph(graph) for graph in (gene_ppi, gene_phenotype, gene_union)
)

In [None]:
# Phenotype tables read from PHENOTYPES_DIR, one per condition.
epilepsy_phenotypes_df, autism_phenotypes_df = (
    pd.read_csv(os.path.join(PHENOTYPES_DIR, csv_name))
    for csv_name in ("epilepsy_phenotypes.csv", "autism_phenotypes.csv")
)

# Distinct HPO identifiers of each table.
epilepsy_phenotypes, autism_phenotypes = (
    set(phenotype_table['HPO'])
    for phenotype_table in (epilepsy_phenotypes_df, autism_phenotypes_df)
)

print(f"{len(epilepsy_phenotypes)} HPO terms under seizure (HP:0001250)")
print(f"{len(autism_phenotypes)} HPO terms under autistic behavior (HP:0000729)")

# Plot phenotype enrichment

In [None]:
def neg_log_p_val_label(x, vmax):
    """Return the star annotation for a -log10 value.

    "****" when ``vmax`` is truthy and ``x >= vmax``; otherwise "***", "**" or
    "*" when ``x`` is strictly above -log10 of 0.01, 0.05 or 0.1 respectively;
    otherwise the empty string.
    """
    if vmax and x >= vmax:
        return "****"
    # Thresholds ordered from strictest to loosest; the first one exceeded wins.
    for alpha, stars in ((0.01, "***"), (0.05, "**"), (0.1, "*")):
        if x > -np.log10(alpha):
            return stars
    return ""

def plot_enrichment_w_linkage(enrichment_df, mod_sizes, filename, vmax=None, row_linkage=None, figsize=(20, 10)):
    """Draw a star-annotated enrichment heatmap with a row dendrogram and save it.

    Parameters
    ----------
    enrichment_df : pandas.DataFrame
        Values to plot; rows are labelled with the frame's index, columns are
        numbered 1..n on the x axis.
    mod_sizes : sequence
        ``mod_sizes[i-1]`` is printed in parentheses under column ``i``.
    filename : str
        Path the figure is saved to (dpi=600).
    vmax : number, optional
        Upper colour limit. When truthy, the colorbar gets explicit ticks and
        its top tick is labelled ``"<vmax>+"``.
    row_linkage : optional
        Precomputed row linkage passed to ``sns.clustermap``.
    figsize : tuple
        Size of the clustermap figure.

    Notes
    -----
    Calls ``sns.set(font_scale=0.8)``, which changes seaborn's global style.
    """
    sns.set(font_scale=0.8)

    num_mods = len(enrichment_df.columns)
    xticklabels = [f'{i}\n({mod_sizes[i-1]})' for i in range(1, num_mods+1)]

    # Column-wise Series.map gives the same element-wise result as the
    # deprecated DataFrame.applymap and works on old and new pandas alike.
    labels_df = enrichment_df.apply(lambda col: col.map(lambda x: neg_log_p_val_label(x, vmax)))

    # clustermap creates its own figure, so figsize has to be passed to it;
    # a preceding plt.figure(figsize=...) only produced an extra empty figure.
    g = sns.clustermap(
        enrichment_df,
        row_linkage=row_linkage,
        col_cluster=False,
        col_linkage=None,
        annot=labels_df,
        fmt="",
        yticklabels=list(enrichment_df.index),
        xticklabels=xticklabels,
        cbar_kws={'label': '-log10(FDR)'},
        cmap="Blues",
        vmin=0,
        vmax=vmax,
        figsize=figsize,
    )

    ax = g.ax_heatmap

    if vmax:
        colorbar = ax.collections[0].colorbar
        # Tick positions and tick labels must come from the same sequence;
        # deriving them from two different step sizes gave mismatched counts.
        step = max(int(vmax/3)-int(vmax/5), 1)
        ticks = list(np.arange(0, vmax, step))
        colorbar.set_ticks(ticks + [vmax])
        colorbar.set_ticklabels(ticks + [str(round(vmax, 3)) + "+"])

    ax.set_xlabel('Module\n(size)')
    ax.set_ylabel('HPO')

    g.savefig(filename, dpi=600)

    plt.show()

In [None]:
def plot_enrichment(enrichment_df, mod_sizes, filename, vmax=None):
    """Draw a star-annotated enrichment heatmap (no clustering) and save it.

    Parameters
    ----------
    enrichment_df : pandas.DataFrame
        Values to plot; columns are numbered 1..n on the x axis.
    mod_sizes : sequence
        ``mod_sizes[i-1]`` is printed in parentheses under column ``i``.
    filename : str
        Path the figure is saved to (dpi=600).
    vmax : number, optional
        Upper colour limit. When truthy, the colorbar gets a tick at every
        integer below ``vmax`` plus one at ``vmax`` labelled ``"<vmax>+"``.
    """
    num_mods = len(enrichment_df.columns)
    xticklabels = [f'{i}\n({mod_sizes[i-1]})' for i in range(1, num_mods+1)]

    # Column-wise Series.map gives the same element-wise result as the
    # deprecated DataFrame.applymap and works on old and new pandas alike.
    labels_df = enrichment_df.apply(lambda col: col.map(lambda x: neg_log_p_val_label(x, vmax)))

    # Explicit figure/axes instead of relying on pyplot's "current figure".
    fig, ax = plt.subplots(figsize=(16, 12))
    sns.heatmap(
        enrichment_df,
        annot=labels_df,
        fmt="",
        xticklabels=xticklabels,
        cbar_kws={'label': '-log10(FDR)'},
        cmap="Blues",
        vmin=0,
        vmax=vmax,
        ax=ax,
    )

    if vmax:
        colorbar = ax.collections[0].colorbar
        ticks = list(np.arange(0, vmax, 1))
        colorbar.set_ticks(ticks + [vmax])
        colorbar.set_ticklabels(ticks + [str(round(vmax, 3)) + "+"])

    ax.set_xlabel('Module\n(size)')
    # NOTE(review): the callers in this notebook pass HPO terms as rows, while
    # the label says 'Gene group' — confirm which wording is intended.
    ax.set_ylabel('Gene group')
    fig.tight_layout()

    fig.savefig(filename, dpi=600)

    plt.show()

In [None]:
def get_enrichments_matrix_phenotype(df, labels=sorted(list(epilepsy_phenotypes)) + sorted(list(autism_phenotypes))):
    """FDR-correct p-values per module and arrange -log10(FDR) as an HPO x module matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``module``, ``p_val`` and ``HPO``. It is modified in
        place: the columns ``p_adjusted`` and ``neg_log_pval`` are added, and
        later cells of the notebook read ``p_adjusted`` from the same frame.
    labels : list
        HPO terms that become the rows of the result, in this order. The
        default is evaluated once, when the function is defined.

    Returns
    -------
    pandas.DataFrame
        One row per label. Column ``j`` holds the label's ``j``-th value after
        sorting its rows by module; labels with fewer rows are padded with NaN
        and labels with no rows are all-NaN.
    """
    enrichments = df  # same object on purpose, see the docstring

    # Benjamini-Hochberg correction within each module. Results are written
    # back through a boolean mask so each value lands on its own row whatever
    # the row order of `df`; a positional concatenation was only correct when
    # the rows were already grouped by ascending module.
    modules = enrichments['module'].to_numpy()
    p_values = enrichments['p_val'].to_numpy()
    p_adjusted = np.full(len(enrichments), np.nan)
    for mod_num in pd.unique(enrichments['module'].dropna()):
        in_module = modules == mod_num
        _, fdr = fdrcorrection(p_values[in_module])  # FDR correction
        p_adjusted[in_module] = fdr

    enrichments['p_adjusted'] = p_adjusted
    # An adjusted p-value of exactly 0 becomes +inf here.
    enrichments['neg_log_pval'] = -np.log10(enrichments['p_adjusted'])

    # Sort once and split by HPO in a single pass instead of filtering the
    # whole frame once per label; groupby keeps the row order inside a group.
    by_module = enrichments.sort_values(by='module', kind='mergesort')
    values_by_hpo = {
        hpo: list(values)
        for hpo, values in by_module.groupby('HPO', sort=False)['neg_log_pval']
    }
    rows = [values_by_hpo.get(label, []) for label in labels]

    enrichment_df = pd.DataFrame(rows)
    enrichment_df.index = labels
    return enrichment_df


In [None]:
def get_dendrogram(df, root, hpos):
    """Build a linkage for ``hpos`` from distances in the HPO parent/child tree.

    The rows of ``df`` are turned into an undirected graph by walking from
    ``root`` down through the children of each term. The distance between two
    terms is the number of edges on the shortest path between them. The
    distances between the terms in ``hpos`` are then passed to
    ``scipy.cluster.hierarchy.linkage`` with ``method='average'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have exactly the two columns ``HPO`` and ``parent`` (the columns
        are renamed by position after the self-merge below).
    root :
        Value of the ``HPO`` column at which the traversal starts.
    hpos : list
        Terms to cluster, in the order wanted for the rows of the distance
        matrix. Each one is looked up in the id table built from ``df``, so a
        term missing from ``df['HPO']`` raises ``KeyError``.

    Returns
    -------
    The linkage returned by ``scipy.cluster.hierarchy.linkage``.
    """
    df = df.drop_duplicates()
    # Self-merge: attach to every row the terms whose parent is this row's HPO.
    # After dropping parent_y the three remaining columns are renamed by position.
    df = df.merge(df, left_on='HPO', right_on='parent', how='left').drop('parent_y', axis=1)
    df.columns = ['HPO', 'parent', 'children']
    # Terms for which the left merge found no child.
    leaves = sorted(list(set(df[pd.isna(df['children'])]['HPO'])))
    # 0 stands for "no child"; it is filtered out again during the traversal.
    df = df.fillna(0)
    # One row per (HPO, parent) pair with the sorted list of its children.
    # NOTE(review): presumably rows with a missing parent are dropped by groupby's default NaN handling — confirm.
    df = df.groupby(['HPO', 'parent'])['children'].apply(lambda x: sorted(list(x))).reset_index()

    hpo = set(df['HPO'])
    non_leaves = (hpo.difference(set(leaves)))
    if root in non_leaves:
        non_leaves.remove(root)

    # Integer ids: leaves first, then the remaining non-leaf terms, root last.
    hpo_to_id = {}
    for i, h in enumerate(leaves):
        hpo_to_id[h] = i

    for i, h in enumerate(non_leaves, len(leaves)):
        hpo_to_id[h] = i

    hpo_to_id[root] = len(hpo) - 1

    # A term that appears with several parents is written more than once here;
    # the last row wins.
    hpo_to_children = {}
    for i, row in df.iterrows():
        hpo_to_children[row['HPO']] = row['children']

    # create tree
    G = nx.Graph()
    q = [root]
    while q:
        # pop() takes from the end of the list, so q is used as a stack.
        node = q.pop()
        children = hpo_to_children[node]
        # Drop the 0 placeholder that marks "no child".
        children = [c for c in children if c]
        for child in children:
            G.add_edge(hpo_to_id[child], hpo_to_id[node])
        q = q + children

    # get distances between HPO IDs in tree to generate dendrogram
    # NOTE(review): dmat is sized by the number of nodes in G but indexed by
    # term id; this assumes every id is below that size — TODO confirm.
    n = len(G.nodes)
    dmat = np.zeros((n, n))
    for l1, l2 in combinations(list(G.nodes), 2): # every pair of nodes, not only leaves
        res = nx.shortest_path(G, source=l1, target=l2)
        # Path length in edges, stored symmetrically.
        dmat[l1, l2] = len(res) - 1
        dmat[l2, l1] = len(res) - 1
        
    # If both of these two terms are present, the distance between that one
    # pair is overridden with 1000 (other pairs are left as computed).
    if 'HP:0001250' in hpo_to_id and 'HP:0000729' in hpo_to_id:
        dmat[hpo_to_id['HP:0001250'], hpo_to_id['HP:0000729']] = 1000
        dmat[hpo_to_id['HP:0000729'], hpo_to_id['HP:0001250']] = 1000
        
    # Distance matrix restricted to the requested terms, in the order of hpos.
    n = len(hpos)
    dmat_updated = np.zeros((n, n))
    for i in range(n):
        for j in range(i, n):
            dmat_updated[i, j] = dmat[hpo_to_id[hpos[i]], hpo_to_id[hpos[j]]]
            dmat_updated[j, i] = dmat[hpo_to_id[hpos[j]], hpo_to_id[hpos[i]]]
    
    # squareform converts the square matrix before it is handed to linkage.
    # NOTE(review): `metric` presumably has no effect on precomputed distances — confirm against the scipy documentation.
    schlink = linkage(scipy.spatial.distance.squareform(dmat_updated), method='average', metric='euclidean')
    return schlink

In [None]:
def format_hpo_name(df, labels_df):
    """Relabel the rows of ``df`` as ``"<HPO> <HPO_name>"``.

    ``df`` is indexed by HPO id; ``labels_df`` has the columns ``HPO`` and
    ``HPO_name``. Ids without a name keep the id followed by a single space.
    Missing values anywhere in the merged frame are replaced by ``''``.
    """
    merged = df.merge(labels_df, left_index=True, right_on='HPO', how='left')
    merged = merged.fillna('')
    merged.index = merged['HPO'] + " " + merged['HPO_name']
    return merged.drop(['HPO', 'HPO_name'], axis=1)

In [None]:
def round_sig(x, sig=3):
    """Round ``x`` to ``sig`` significant figures.

    Zero and non-finite values (inf, nan) have no order of magnitude, so they
    are returned unchanged instead of raising from ``log10`` / ``floor``.
    """
    from math import isfinite  # local import keeps this fix self-contained

    if x == 0 or not isfinite(x):
        return x
    return round(x, sig-int(floor(log10(abs(x))))-1)

# Phenotype enrichment (experimental) in multiplex network

In [None]:
# Number of members in each community, per set of communities.
com_ppi_sizes = list(map(len, coms_ppi.communities))
com_phenotype_sizes = list(map(len, coms_phenotype.communities))
com_multiplex_sizes = list(map(len, coms_multiplex.communities))

In [None]:
# enrichment using experimental p-value
# Number of leading modules kept as heatmap columns for the selected data set.
TOP_MODULES = 13 if WES else 14

coms_multiplex_enrichment_df = pd.read_csv(
    os.path.join(COMS_DIR, 'coms_multiplex_phenotype_enrichment.csv')
)
coms_multiplex_enrichment_df_all = pd.read_csv(
    os.path.join(COMS_DIR, 'coms_multiplex_phenotype_enrichment_all.csv')
)


In [None]:
mod_sizes = com_multiplex_sizes

# Keep the first TOP_MODULES columns and drop rows with a missing value. Then
# cap the enrichment value at the max value if it is np.inf: infinities are
# swapped for -1 only to find the largest finite entry of the same matrix.
enrichment_df = get_enrichments_matrix_phenotype(coms_multiplex_enrichment_df).iloc[:, :TOP_MODULES].dropna()
vmax = enrichment_df.replace(np.inf, -1).values.max()
enrichment_df = enrichment_df.replace(np.inf, vmax)

# Same treatment for the "_all" table, with its own cap.
enrichment_df_all = get_enrichments_matrix_phenotype(coms_multiplex_enrichment_df_all).iloc[:, :TOP_MODULES].dropna()
vmax_all = enrichment_df_all.replace(np.inf, -1).values.max()
enrichment_df_all = enrichment_df_all.replace(np.inf, vmax_all)


In [None]:
# get the row linkage dendrogram
# get the row linkage dendrogram
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; the
# result is the same on older versions.
df = pd.concat([epilepsy_phenotypes_df, autism_phenotypes_df]).drop_duplicates()
# Extra row for the term 'root' (with parent 'head'), which is the root passed to get_dendrogram.
df = pd.concat([df, pd.DataFrame([{'HPO': 'root', 'parent': 'head'}])], ignore_index=True)
row_linkage = get_dendrogram(df[['HPO', 'parent']], root='root', hpos=list(enrichment_df.index))
row_linkage_all = get_dendrogram(df[['HPO', 'parent']], root='root', hpos=list(enrichment_df_all.index))

# update heatmap labels
hpo_labels_df = df[['HPO', 'HPO_name']].drop_duplicates()
enrichment_df = format_hpo_name(enrichment_df, hpo_labels_df)
enrichment_df_all = format_hpo_name(enrichment_df_all, hpo_labels_df)

In [None]:
# Clustered heatmap for enrichment_df, saved under FIGURES_DIR.
plot_enrichment_w_linkage(
    enrichment_df,
    mod_sizes,
    os.path.join(FIGURES_DIR, 'phenotype_enrichment_multiplex.png'),
    vmax=vmax,
    row_linkage=row_linkage,
)

In [None]:
# Clustered heatmap for enrichment_df_all, using its own cap and linkage.
plot_enrichment_w_linkage(
    enrichment_df_all,
    mod_sizes,
    os.path.join(FIGURES_DIR, 'phenotype_enrichment_multiplex_all.png'),
    vmax=vmax_all,
    row_linkage=row_linkage_all,
)

# Phenotype enrichment (JAX gene-phenotype associations) in multiplex network

In [None]:
# hpo jax enrichment epilepsy autism HPO
TOP_MODULES = 13 if WES else 14

# from HPO jax
# Missing values become 1, only labels containing "HP:" are kept, and the
# columns are renamed to the names get_enrichments_matrix_phenotype reads.
coms_multiplex_enrichment_df = (
    pd.read_csv(os.path.join(COMS_DIR, 'coms_multiplex_enrichment.csv'))
    .fillna(1)
    .loc[lambda table: table['label'].str.contains("HP:")]
    .rename(columns={'label': 'HPO', 'pval': 'p_val'})
)

coms_multiplex_enrichment_df_all = (
    pd.read_csv(os.path.join(COMS_DIR, 'coms_multiplex_enrichment_all_genes.csv'))
    .fillna(1)
    .loc[lambda table: table['label'].str.contains("HP:")]
    .rename(columns={'label': 'HPO', 'pval': 'p_val'})
)

# First TOP_MODULES columns only; rows with a missing value are dropped.
enrichment_df = get_enrichments_matrix_phenotype(coms_multiplex_enrichment_df).iloc[:, :TOP_MODULES].dropna()
enrichment_df_all = get_enrichments_matrix_phenotype(coms_multiplex_enrichment_df_all).iloc[:, :TOP_MODULES].dropna()

In [None]:
# remove rows with all zero enrichment
# Rows are first limited to the epilepsy/autism HPO terms. Zeros are turned
# into NaN so that dropna(how="all") removes all-zero rows, then put back.
enrichment = (
    enrichment_df[enrichment_df.index.isin(epilepsy_phenotypes | autism_phenotypes)]
    .replace(0, np.nan)
    .dropna(axis=0, how="all")
    .replace(np.nan, 0)
)
enrichment_all = (
    enrichment_df_all[enrichment_df_all.index.isin(epilepsy_phenotypes | autism_phenotypes)]
    .replace(0, np.nan)
    .dropna(axis=0, how="all")
    .replace(np.nan, 0)
)

In [None]:
# get the row linkage dendrogram
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; the
# result is the same on older versions.
df = pd.concat([epilepsy_phenotypes_df, autism_phenotypes_df]).drop_duplicates()
# Extra row for the term 'root' (with parent 'head'), which is the root passed to get_dendrogram.
df = pd.concat([df, pd.DataFrame([{'HPO': 'root', 'parent': 'head'}])], ignore_index=True)
row_linkage = get_dendrogram(df[['HPO', 'parent']], root='root', hpos=list(enrichment.index))
row_linkage_all = get_dendrogram(df[['HPO', 'parent']], root='root', hpos=list(enrichment_all.index))

# update heatmap labels
hpo_labels_df = df[['HPO', 'HPO_name']].drop_duplicates()
enrichment = format_hpo_name(enrichment, hpo_labels_df)
enrichment_all = format_hpo_name(enrichment_all, hpo_labels_df)

In [None]:
# Clustered heatmap for `enrichment`; no vmax is passed, so the colour scale is not capped.
plot_enrichment_w_linkage(
    enrichment,
    mod_sizes,
    os.path.join(FIGURES_DIR, 'phenotype_enrichment_multiplex_jax.png'),
    row_linkage=row_linkage,
)


In [None]:
# Clustered heatmap for `enrichment_all`; no vmax is passed, so the colour scale is not capped.
plot_enrichment_w_linkage(
    enrichment_all,
    mod_sizes,
    os.path.join(FIGURES_DIR, 'phenotype_enrichment_multiplex_jax_all.png'),
    row_linkage=row_linkage_all,
)


# Top enriched HPO (all HPO) in each module using JAX gene-phenotype associations
Part of Supplemental Tables 2 and 3

In [None]:
# this cell takes several minutes to run
TOP_MODULES = 13 if WES else 14

# Rows with any missing value are dropped, only labels containing "HP:" are
# kept, and the columns are renamed to the names the matrix builder reads.
coms_multiplex_enrichment_df_all = (
    pd.read_csv(os.path.join(COMS_DIR, 'coms_multiplex_enrichment_all_hpo_all_genes.csv'))
    .dropna()
    .loc[lambda table: table['label'].str.contains("HP:")]
    .rename(columns={'label': 'HPO', 'pval': 'p_val'})
)

# Every HPO term present in the table becomes a row of the matrix.
labels_all = sorted(set(coms_multiplex_enrichment_df_all['HPO']))
enrichment_df_all = get_enrichments_matrix_phenotype(
    coms_multiplex_enrichment_df_all, labels=labels_all
).iloc[:, :TOP_MODULES].dropna()

In [None]:
# One text entry per module listing its ten HPO terms with the smallest
# adjusted p-value. The 'p_adjusted' column read here is added to
# coms_multiplex_enrichment_df_all by get_enrichments_matrix_phenotype.
top_hpo_in_modules = []
for mod_num in range(1, TOP_MODULES + 1):
    top_hpo = (
        coms_multiplex_enrichment_df_all[coms_multiplex_enrichment_df_all['module']==mod_num]
        .sort_values(by='p_adjusted')
        .head(10)
    )
    hpos = list(top_hpo['HPO'])
    hpo_name = list(top_hpo['phenotype_name'])
    pvals = list(top_hpo['p_adjusted'])

    hpo_list = [
        f"{name} ({hpo}) (FDR={round_sig(fdr, 3)})"
        for name, hpo, fdr in zip(hpo_name, hpos, pvals)
    ]
    top_hpo_in_modules.append("; ".join(hpo_list))

top_modules_df_all = pd.DataFrame({
    "top_hpo": top_hpo_in_modules,
    "module": np.arange(1, TOP_MODULES + 1),
})
# The WES run is written to its own file.
top_modules_df_all.to_csv(
    os.path.join(
        TABLES_DIR,
        'top_hpo_per_module_all_genes_wes.csv' if WES else 'top_hpo_per_module_all_genes.csv',
    ),
    index=False,
)

# Phenotype enrichment (experimental) in PPI network layer

In [None]:
# Number of leading modules kept as heatmap columns for the selected data set.
TOP_MODULES = 13 if WES else 17

coms_enrichment_df = pd.read_csv(os.path.join(COMS_DIR, 'coms_ppi_phenotype_enrichment.csv'))
coms_enrichment_df_all = pd.read_csv(os.path.join(COMS_DIR, 'coms_ppi_phenotype_enrichment_all.csv'))

enrichment_df = get_enrichments_matrix_phenotype(coms_enrichment_df).iloc[:,:TOP_MODULES].dropna()
enrichment_df_all = get_enrichments_matrix_phenotype(coms_enrichment_df_all).iloc[:,:TOP_MODULES].dropna()
# The files read above are the coms_ppi_* ones, so the sizes printed under
# each column must be the PPI community sizes (the phenotype-layer sizes were
# used here before).
mod_sizes = com_ppi_sizes

# Cap infinite values at the largest finite value of the same matrix.
# `vmax` is assigned twice: after this cell it holds the cap of enrichment_df_all.
vmax = np.amax(enrichment_df.replace(np.inf, -1).values)
enrichment_df = enrichment_df.replace(np.inf, vmax)
vmax = np.amax(enrichment_df_all.replace(np.inf, -1).values)
enrichment_df_all = enrichment_df_all.replace(np.inf, vmax)

In [None]:
# enrichment_df was capped at its own largest finite value, so its maximum is
# the right colour limit and "****" threshold. The notebook-level `vmax` was
# overwritten with the cap of enrichment_df_all and must not be used here.
plot_enrichment(
    enrichment_df,
    mod_sizes,
    os.path.join(FIGURES_DIR, 'phenotype_enrichment_ppi.png'),
    vmax=np.amax(enrichment_df.values),
)

In [None]:
# Heatmap for enrichment_df_all; `vmax` holds this matrix's cap at this point.
plot_enrichment(
    enrichment_df_all,
    mod_sizes,
    os.path.join(FIGURES_DIR, 'phenotype_enrichment_ppi_all.png'),
    vmax=vmax,
)

# Phenotype enrichment (experimental) in phenotype network layer

In [None]:
# Number of leading modules kept as heatmap columns for the selected data set.
TOP_MODULES = 13 if WES else 18

coms_enrichment_df = pd.read_csv(os.path.join(COMS_DIR, 'coms_phenotype_phenotype_enrichment.csv'))
coms_enrichment_df_all = pd.read_csv(os.path.join(COMS_DIR, 'coms_phenotype_phenotype_enrichment_all.csv'))

enrichment_df = get_enrichments_matrix_phenotype(coms_enrichment_df).iloc[:,:TOP_MODULES].dropna()
enrichment_df_all = get_enrichments_matrix_phenotype(coms_enrichment_df_all).iloc[:,:TOP_MODULES].dropna()
# The files read above are the coms_phenotype_* ones, so the sizes printed
# under each column must be the phenotype-layer community sizes (the PPI
# sizes were used here before).
mod_sizes = com_phenotype_sizes

# Cap infinite values at the largest finite value of the same matrix.
# `vmax` is assigned twice: after this cell it holds the cap of enrichment_df_all.
vmax = np.amax(enrichment_df.replace(np.inf, -1).values)
enrichment_df = enrichment_df.replace(np.inf, vmax)
vmax = np.amax(enrichment_df_all.replace(np.inf, -1).values)
enrichment_df_all = enrichment_df_all.replace(np.inf, vmax)

In [None]:
# enrichment_df was capped at its own largest finite value, so its maximum is
# the right colour limit and "****" threshold. The notebook-level `vmax` was
# overwritten with the cap of enrichment_df_all and must not be used here.
plot_enrichment(
    enrichment_df,
    mod_sizes,
    os.path.join(FIGURES_DIR, 'phenotype_enrichment_phenotype.png'),
    vmax=np.amax(enrichment_df.values),
)

In [None]:
# Heatmap for enrichment_df_all; `vmax` holds this matrix's cap at this point.
plot_enrichment(
    enrichment_df_all,
    mod_sizes,
    os.path.join(FIGURES_DIR, 'phenotype_enrichment_phenotype_all.png'),
    vmax=vmax,
)