Variables to be updated/configured:

In [None]:
# Network selector: False runs the larger epilepsy-autism multiplex network,
# True runs the WES multiplex network. Further down it switches the input gene
# list, the rank cutoff and the names of the output files.
WES = False
# Directory where the .gexf files are located; the phenotype graph is written here.
GRAPH_DIR = "./../gexf_files"
# Phen2Gene knowledge base directory; each file in it is read as one candidate gene list.
KB_DIR = './lib/Knowledgebase'
# Phen2Gene directory with skewness weights; each file is read as a single number.
SKEWNESS_DIR = './lib/skewness'
# Directory containing epilepsy_phenotypes.csv and autism_phenotypes.csv.
PHENOTYPES_DIR = "./../phenotypes"

In [None]:
import os

from datetime import datetime
import mygene
import networkx as nx
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Convert gene symbol to entrez id

In [None]:
# Load the gene symbols that form the nodes of the network.
input_filename = 'genes_wes.txt' if WES else 'genes.txt'

# Only the first column of the file is kept; it is relabelled as gene_symbol.
df = pd.read_csv(input_filename).iloc[:, 0].to_frame()
df.columns = ['gene_symbol']
gene_set = set(df['gene_symbol'])
print(f"Number of genes in the network: {len(gene_set)}")

In [None]:
# Manually assigned Entrez IDs. Symbols listed here are kept out of the
# automated lookup below and are added back to the final table by hand
# (presumably because the lookup does not resolve them -- confirm).
symbol_to_entrezid = {
    'MEMO1': 51072,
    'SLCO1B7': 338821,
    'NOTCH2NL': 388677,
    'ST5': 6764,
    'MT-CO1': 4512,
    'C7orf55': 154791,
    'MT-CYB': 4519,
    'APOPT1': 84334,
    'LPHN2': 23266,
    'FAM92B': 339145,
    'C16orf62': 57020,
    'WHSC1': 7468,
    'GUCY1A3': 2982,
    'MT-ND1': 4535,
    'COL4A3BP': 10087,
    'C10orf2': 56652,
    'SGK223': 157285,
    'NGFRAP1': 27018,
    'ATP5A1': 498,
    'CASC4': 113201,
    'GPR98': 84059,
    'TTC25': 83538,
    'MT-ND5': 4540,
    'KIAA1456': 57604,
    'QARS': 5859,
    'AARS': 16,
    'ZFYVE20': 64145,
    'GPR56': 9289,
    'MT-ATP8': 4509,
    'MT-CO3': 4514,
    'KARS': 3735,
    'MT-ATP6': 4508,
    'MT-ND4': 4538,
    'LOR': 4014,
    'SSPO': 23145,
    'MSNP1AS': 4479,
    'PTCHD1-AS': 100873065,
}

# Remember the complete symbol collection, then narrow gene_set down to the
# symbols that still need an automated lookup.
gene_set_all = gene_set
gene_set = [symbol for symbol in gene_set_all if symbol not in symbol_to_entrezid]

In [None]:
# Look up the remaining symbols through MyGene; the cells below read the
# 'missing' and 'out' entries of the returned object.
mg = mygene.MyGeneInfo()
result = mg.querymany(
    gene_set,
    scopes='symbol',
    species='human',
    returnall=True,
)

In [None]:
# Symbols reported by the lookup as missing (no entrez id)
for unmatched_symbol in result['missing']:
    print(unmatched_symbol)

In [None]:
# Turn the lookup hits into [symbol, entrez_id] rows; -1 marks a symbol for
# which no Entrez ID is available.
temp = []
for hit in result['out']:
    if 'notfound' in hit:
        print('entrez not found:', hit['query'])
        entrez_id = -1
    elif 'entrezgene' not in hit:
        print('entrezgene not available:', hit['query'])
        entrez_id = -1
    else:
        entrez_id = hit['entrezgene']
    temp.append([hit['query'], entrez_id])

# Add the manually curated IDs, restricted to symbols that are in the network.
manual_matching = [
    [symbol, manual_id]
    for symbol, manual_id in symbol_to_entrezid.items()
    if symbol in gene_set_all
]
temp.extend(manual_matching)

symbol_to_id_df = pd.DataFrame(temp)
symbol_to_id_df.columns = ["gene_symbol", "entrez_id"]

In [None]:
# Persist the symbol -> Entrez ID table.
# NOTE: the target is input_filename, so this overwrites the gene list read
# earlier with a two-column (gene_symbol, entrez_id) CSV.
symbol_to_id_df.to_csv(input_filename, index=False)

# Generate gene-phenotype associations for genes in the network

In [None]:
# Reload the symbol/Entrez table and collect both kinds of identifiers.
nodes_df = pd.read_csv(input_filename)
gene_symbols = set(nodes_df['gene_symbol'])
gene_set = set(nodes_df['entrez_id'])
# Symbols without an Entrez ID all carry -1 and collapse into one entry here
# (2 genes in the original run).
print(f"Number of entrez IDs: {len(gene_set)}")

In [None]:
# Scan every Phen2Gene candidate gene list and copy into temp.txt the rows
# whose gene ID (third column) belongs to the network, appending the HPO term
# the row came from as an extra column.
kb_files = os.listdir(KB_DIR)  # listed once so the progress total matches the loop
n = len(kb_files)
print("Retrieving gene-phenotype associations from Phen2Gene knowledge base")
header_written = False
with open('temp.txt', 'w') as output:
    for idx, file in enumerate(kb_files):

        if idx % 500 == 0:
            print(f"{idx}/{n}")

        # the file name without its suffix identifies the HPO term
        hpo = file.replace('.candidate_gene_list', '')

        # loop through each line in candidate gene list file
        with open(os.path.join(KB_DIR, file)) as f:
            for i, line in enumerate(f):
                line = line.strip()

                if i == 0:  # first line of every file is its column header
                    # Write the header exactly once, taken from the first file
                    # that actually has one (not necessarily directory entry 0,
                    # which may be empty).
                    if not header_written and line:
                        splits = line.split("\t") + ["HPO"]
                        output.write("\t".join(splits) + "\n")
                        header_written = True
                    continue

                if not line:  # blank line: nothing to parse
                    continue

                splits = line.split("\t") + [hpo]
                if int(splits[2].strip()) in gene_set:
                    output.write("\t".join(splits) + "\n")
                    

In [None]:
# Weight every gene-phenotype score by the skewness of its HPO term.
df = pd.read_csv('temp.txt', sep="\t")
os.remove("temp.txt")

# One file per HPO term: the file name is the term, the content its skewness.
skewness_values = []
for file in os.listdir(SKEWNESS_DIR):
    with open(os.path.join(SKEWNESS_DIR, file), 'r') as f:
        skewness_values.append([file, float(f.read().strip())])
skewness_df = pd.DataFrame(skewness_values)
skewness_df.columns = ["HPO", "skewness"]

# Left merge keeps associations whose term has no skewness file (score becomes NaN);
# the second merge attaches gene_symbol / entrez_id of the network nodes.
hpo_associations_df = (
    df.merge(skewness_df, on="HPO", how="left")
      .assign(final_score=lambda scored: scored['Score'] * scored['skewness'])
      .merge(nodes_df, left_on="ID", right_on="entrez_id")
)

scores_filename = "hpo_association_scores_wes.csv" if WES else "hpo_association_scores.csv"
hpo_associations_df.to_csv(scores_filename, index=False)

# Construct phenotype network

In [None]:
# Build the phenotype layer: two genes are linked when their phenotype vectors
# are more similar than in (almost) all random shuffles.
rank = 500 if WES else 1000  # only the top `rank` genes per HPO ID are used
alpha = 0.01                 # fraction of shuffles allowed to beat the observed similarity
num_trials = 1000            # number of random shuffles

hpo_associations_df = hpo_associations_df[hpo_associations_df["Rank"] <= rank] # only use rank top genes per HPO ID
# rows = genes, columns = HPO terms, cells = skewness-weighted score (0 when absent)
hpo_associations_df = hpo_associations_df.pivot(index='gene_symbol', columns='HPO', values='final_score')
hpo_associations_df = hpo_associations_df.fillna(0)
# Take an independent, writable copy: the shuffle below works in place, and
# `.values` may be a view of the frame's own data (read-only under pandas
# Copy-on-Write), so shuffling it would corrupt the frame or raise.
doc_term_matrix = hpo_associations_df.to_numpy(copy=True)

cs_comp = cosine_similarity(doc_term_matrix, doc_term_matrix) # actual cosine similarity between phenotype vectors of genes
# result[i, j] counts the shuffles in which position (i, j) had a higher similarity than observed
result = np.zeros((len(cs_comp), len(cs_comp)))
print('Shuffling phenotype vectors')
# NOTE(review): no random seed is set in this section, so the resulting edge
# set can differ between runs; call np.random.seed(...) first if needed.
for n in range(0, num_trials):
    if n % 50 == 0:
        print("Trial", n, "Current Time =", datetime.now().strftime("%H:%M:%S"))
    np.random.shuffle(doc_term_matrix)  # reorders the rows (gene vectors) in place
    cs = cosine_similarity(doc_term_matrix, doc_term_matrix) # cosine similarity between shuffled phenotype vectors of genes
    result += np.greater(cs, cs_comp)

genes = list(hpo_associations_df.index)
n = len(cs_comp)
thresh = int(num_trials * alpha)
print('Adding edges to phenotype network')
# Keep pairs i < j with fewer than `thresh` shuffles beating the observed
# similarity. argwhere scans row by row, i.e. the same order as a nested
# `for i ... for j in range(i+1, n)` loop.
significant_pairs = np.argwhere(np.triu(result < thresh, k=1))
edges = [(genes[i], genes[j]) for i, j in significant_pairs]

G_phenotype = nx.Graph()
G_phenotype.add_nodes_from(gene_symbols)  # include genes without any phenotype edge
G_phenotype.add_edges_from(edges)
# nx.info() was removed in NetworkX 3.0; print the summary figures directly.
print(f"Graph with {G_phenotype.number_of_nodes()} nodes and {G_phenotype.number_of_edges()} edges")

gexf_filename = "gene-phenotype-wes-1-500.gexf" if WES else "gene-phenotype-1-1000.gexf"
nx.write_gexf(G_phenotype, os.path.join(GRAPH_DIR, gexf_filename))


# Generate gene-phenotype associations for all genes

In [None]:
# Phenotype lists of the two disorders; each CSV provides an `HPO` column.
# epilepsy phenotypes: HPO subtree with root Seizure HP:0001250
epilepsy_phenotypes_df = pd.read_csv(os.path.join(PHENOTYPES_DIR, "epilepsy_phenotypes.csv"))
# autism phenotypes: HPO subtree with root Autistic behavior HP:0000729
autism_phenotypes_df = pd.read_csv(os.path.join(PHENOTYPES_DIR, "autism_phenotypes.csv"))
epilepsy_phenotypes = set(epilepsy_phenotypes_df['HPO'])
autism_phenotypes = set(autism_phenotypes_df['HPO'])
# Replace ":" with "_" so the IDs can be compared with the knowledge-base and
# skewness file names. Kept as a set because it is only used for membership
# tests, once per file.
ea_phenotypes = {
    hpo_id.replace(":", "_")
    for hpo_id in epilepsy_phenotypes.union(autism_phenotypes)
}

In [None]:
# Collect, for every gene, the knowledge-base rows of the epilepsy/autism
# phenotypes into temp.txt, with the HPO term appended as a last column.
n = len(os.listdir(KB_DIR))
print("Retrieving gene-phenotype associations from Phen2Gene knowledge base")
with open('temp.txt', 'w') as output:
    output.write("Rank\tGene\tID\tScore\tStatus\tHPO\n")
    for idx, file in enumerate(os.listdir(KB_DIR)):

        if idx % 500 == 0:
            print(f"{idx}/{n}")

        hpo = file.replace('.candidate_gene_list', '')
        # only need gene-phenotype associations for epilepsy/autism phenotypes
        if hpo in ea_phenotypes:
            with open(os.path.join(KB_DIR, file)) as f:
                next(f, None)  # skip first line of file
                for row in f:
                    fields = row.strip().split("\t") + [hpo]
                    output.write("\t".join(fields) + "\n")

In [None]:
# Weight the epilepsy/autism associations of all genes by HPO-term skewness.
df = pd.read_csv('temp.txt', sep="\t")
os.remove("temp.txt")

# File name = HPO term, file content = its skewness; other terms are ignored.
skewness_values = []
for file in os.listdir(SKEWNESS_DIR):
    with open(os.path.join(SKEWNESS_DIR, file), 'r') as f:
        if file in ea_phenotypes:
            skewness_values.append([file, float(f.read().strip())])
skewness_df = pd.DataFrame(skewness_values)
skewness_df.columns = ["HPO", "skewness"]

hpo_associations_df = (
    df.merge(skewness_df, on="HPO", how="left")
      .assign(final_score=lambda scored: scored['Score'] * scored['skewness'])
)
hpo_associations_df.to_csv("hpo_association_scores_all.csv", index=False)