Variables to be updated/configured:

In [None]:
# Network selector: True when running for the WES multiplex network,
# False when running for the larger epilepsy-autism multiplex network.
WES = False
# Path to the directory where the .gexf files are located.
GRAPH_DIR = "./../gexf_files"
# Path to the directory containing the .csv files with gene sets.
GENE_SETS_DIR = "./../gene_sets"

In [None]:
import os
import networkx as nx
import pandas as pd
import seaborn as sns

# Create the protein-protein interaction (PPI) network

In [None]:
# Space-delimited protein links table; expected in the working directory.
# The "combined_score" column is used for filtering in the next step.
protein_interactions_df = pd.read_csv(
    "9606.protein.links.v11.0.txt",
    sep=' ',
)

### Filter the PPI network to only use edges with a weight >= 700 (high confidence)

In [None]:
# Minimum combined score an interaction must reach to be kept.
THRESHOLD = 700
ppi_filtered = protein_interactions_df.loc[
    protein_interactions_df["combined_score"].ge(THRESHOLD)
]

### Merge PPI interactions with gene names

In [None]:
# Gene symbol replacement table used by update_genes().
# A string value is a one-to-one replacement; a list value expands a single
# cluster/placeholder symbol into one row per listed member gene.
# NOTE(review): a few entries map to an older symbol or an Ensembl ID
# (e.g. 'TCAF1' -> 'FAM115A'); presumably this matches the naming used in the
# PPI info table - confirm before changing them.
GENE_SYMBOL_REPLACEMENTS = {
    "ND1": "MT-ND1",
    "ND4": "MT-ND4",
    "TRNR1": "GFRA1",
    "CCM1": "KRIT1",
    "C19orf61": "SMG9",
    "EIF2C4": "AGO4",
    "HOXD": ["HOXD1", "HOXD3", "HOXD4", "HOXD8", "HOXD9", "HOXD10", "HOXD11", "HOXD12", "HOXD13"],
    "ATP6": "MT-ATP6",
    "APOE4": "APOE",
    "ENSG00000173575": "CHD2",
    "SCA2": "ATXN2",
    "B3GNT1": "B4GAT1",
    "COX3": "MT-CO3",
    "ENSG00000086848": "ALG9",
    "ATP8": "MT-ATP8",
    "ND5": "MT-ND5",
    "C2orf25": "MMADHC",
    "PIG6": "PRODH",
    "ENSG00000258947": "TUBB3",
    "ADCK3": "COQ8A",
    "COX1": "MT-CO1",
    "DXS423E": "SMC1A",
    "PCDHG": ["PCDHGA1", "PCDHGA2", "PCDHGA3", "PCDHGA4", "PCDHGA5", "PCDHGA6", "PCDHGA7", "PCDHGA8", "PCDHGA9", "PCDHGA10", "PCDHGA11", "PCDHGA12",
              "PCDHGB1", "PCDHGB2", "PCDHGB3", "PCDHGB4", "PCDHGB5", "PCDHGB6", "PCDHGB7", "PCDHGC3", "PCDHGC4", "PCDHGC5"],
    "KIAA0226": "RUBCN",
    "CYTB": "MT-CYB",
    "KIAA0442": "AUTS2",
    "KAL1": "ANOS1",
    "BRP44L": "MPC1",
    "KIAA1715": "LNPK",
    "JMJD2C": "KDM4C",
    "CCDC64": "BICDL1",
    "KIAA2022": "NEXMIF",
    "INADL": "PATJ",
    "PARK2": "PRKN",
    "NDNL2": "NSMCE3",
    "BZRAP1": "TSPOAP1",
    "ERBB2IP": "ERBIN",
    "HIST1H2BJ": "H2BC11",
    "ADSS": "ADSS2",
    "C15orf43": "TERB2",
    "C16orf13": "METTL26",
    "C11orf30": "EMSY",
    "SUV420H1": "KMT5B",
    "MKL2": "MRTFB",
    "ENSG00000259159": "MFRP",
    "MARCA2": "SMARCA2",
    "C11orf82": "DDIAS",
    "CSNK2B-LY6G5B-1181": "CSNK2B",
    'TCAF1': 'FAM115A',
    'KCNMB2': 'ENSG00000275163',
    'KIAA1009': 'CEP162',
    'AGMO(alsoknownasTMEM195)': 'AGMO',
    'PPIEL': 'PPIEL',
    'GGTA1P': 'GGTA1',
    'KIAA1239': 'NWD2',
    'LINC01370': 'LINC01370',
    'PCDHA@': ['PCDHA10', 'PCDHA9', 'PCDHA5', 'PCDHA11', 'PCDHA7', 'PCDHA3', 'PCDHA8', 'PCDHA2', 'PCDHA1', 'PCDHA13', 'PCDHA4', 'PCDHA6', 'PCDHA12'],
    'MsrA': 'MSRA',
    'DGCR6': 'ENSG00000183628',
    'ZNF259': 'ZPR1',
    'ADGRA2': 'GPR124',
    'KIAA1430': 'CFAP97',
    'RNASE4': 'ENSG00000258818',
    'C14orf166B': 'LRRC74A',
    "RP11-1055B8.7": "BAHCC1",
    "ENSG00000272414": "FAM47E-STBD1",
    "C5orf20": "DCANP1",
    "SOGA2": "MTCL1",
    "FAM194A": "ERICH6"
}


def update_genes(df, gene_col):
    """Return a copy of ``df`` with the gene symbols in ``gene_col`` replaced.

    Each value of ``gene_col`` is looked up in ``GENE_SYMBOL_REPLACEMENTS``:

    * not found -> the row is kept unchanged;
    * found with a string value -> the symbol is replaced in that row;
    * found with a list value -> the row is repeated once per listed symbol
      (all other columns, and the index label, are copied to every repeat).

    A "Replaced <old> with <new>" line is printed for every replacement made.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a gene symbol column. It is not modified.
    gene_col : str
        Name of the column in ``df`` that holds the gene symbols.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns as ``df``, rows in the original order.

    Raises
    ------
    KeyError
        If ``gene_col`` is not a column of ``df``.
    """
    # Collect, for every output row, the position of the source row and the
    # symbol it should carry. The frame is then built in one shot instead of
    # appending row by row (DataFrame.append was removed in pandas 2.0 and was
    # quadratic in the number of rows).
    row_positions = []
    updated_symbols = []
    for position, gene in enumerate(df[gene_col]):
        if gene in GENE_SYMBOL_REPLACEMENTS:
            replacement = GENE_SYMBOL_REPLACEMENTS[gene]
            targets = [replacement] if isinstance(replacement, str) else replacement
            for target in targets:
                print("Replaced", gene, "with", target)
        else:
            targets = [gene]
        row_positions.extend([position] * len(targets))
        updated_symbols.extend(targets)

    # Positional selection keeps column order, dtypes and index labels intact.
    new_df = df.iloc[row_positions].copy()
    new_df[gene_col] = updated_symbols
    return new_df

In [None]:
# Load the tab-delimited protein info table and normalise the gene symbols
# in its "preferred_name" column in a single step.
info_df = update_genes(
    pd.read_csv("9606.protein.info.v11.0.txt", sep='\t'),
    "preferred_name",
)

In [None]:
# Every distinct gene symbol present in the (updated) protein info table.
PPI_all_genes = set(info_df["preferred_name"])
print(f"There are {len(PPI_all_genes)} nodes in the PPI network")

In [None]:
# Lookup table: protein identifier -> gene symbol.
id_to_protein = info_df[["protein_external_id", "preferred_name"]]

# Resolve both endpoints of every filtered interaction to gene symbols.
# The second merge collides on the lookup columns, so pandas suffixes them
# with _x (first endpoint) and _y (second endpoint).
interactions_df = (
    ppi_filtered
    .merge(id_to_protein, left_on="protein1", right_on="protein_external_id")
    .merge(id_to_protein, left_on="protein2", right_on="protein_external_id")
    .loc[:, ["preferred_name_x", "preferred_name_y", "combined_score"]]
    .rename(columns={"preferred_name_x": "protein1", "preferred_name_y": "protein2"})
)

# Get epilepsy-associated genes
### Epilepsy-associated genes are from Wang et al. (2017)
### https://www.sciencedirect.com/science/article/pii/S1059131116302989

In [None]:
# Load the epilepsy gene table for the selected network, normalise its gene
# symbols, and keep the distinct symbols.
if WES:
    # The WES file is read with the first 2 rows skipped and at most 200 data rows.
    epilepsy_df = update_genes(
        pd.read_csv(
            os.path.join(GENE_SETS_DIR, "WES_EPI_gene_burden_AC_1_Epi25_Collaborative_2019.csv"),
            skiprows=2,
            nrows=200,
        ),
        "Gene",
    )
    epilepsy_genes = set(epilepsy_df["Gene"])
else:
    epilepsy_df = update_genes(
        pd.read_csv(os.path.join(GENE_SETS_DIR, "epilepsy_genes_wang_2017_formatted.csv")),
        "gene",
    )
    epilepsy_genes = set(epilepsy_df["gene"])

In [None]:
# Collect (and print) the epilepsy genes that have no entry in the PPI gene set.
epilepsy_genes_not_found = set()
for gene in epilepsy_genes:
    if gene in PPI_all_genes:
        continue
    print(gene)
    epilepsy_genes_not_found.add(gene)

# Get autism-associated genes
### Autism-associated genes are from SFARI Jan 3, 2020 release

In [None]:
# Load the autism gene table for the selected network, normalise its gene
# symbols, and keep the distinct symbols.
if WES:
    # Only the first 102 data rows of the WES file are read.
    autism_df = update_genes(
        pd.read_csv(os.path.join(GENE_SETS_DIR, "WES_autism_Satterstrom_2020.csv"), nrows=102),
        "gene",
    )
    autism_genes = set(autism_df["gene"])
else:
    autism_df = update_genes(
        pd.read_csv(
            os.path.join(GENE_SETS_DIR, "SFARI-Gene_genes_01-03-2020release_01-05-2020export.csv")
        ),
        "gene-symbol",
    )
    autism_genes = set(autism_df["gene-symbol"])

In [None]:
# Collect (and print) the autism genes that have no entry in the PPI gene set.
autism_genes_not_found = set()
for gene in autism_genes:
    if gene in PPI_all_genes:
        continue
    print(gene)
    autism_genes_not_found.add(gene)

# Create the epilepsy-autism PPI network

In [None]:
def create_network(gene_list):
    """Build an undirected PPI network (NetworkX graph) for a set of genes.

    Every gene in ``gene_list`` becomes a node, whether or not it has any
    interaction. An edge is added for each row of the module-level
    ``interactions_df`` whose two endpoints are both in ``gene_list``, are
    different from each other, and whose ``combined_score`` is > 0.
    Edges are unweighted.

    Parameters
    ----------
    gene_list : iterable of str
        Gene symbols to include (a set or list; may be empty).

    Returns
    -------
    networkx.Graph
        The resulting network. Node and edge counts are also printed.
    """
    genes = list(gene_list)
    gene_lookup = set(genes)

    # Membership tests replace the two inner merges of the earlier version:
    # same selected interactions, no intermediate frames, and an empty
    # gene_list no longer crashes when naming the column of an empty frame.
    both_in_set = (
        interactions_df["protein1"].isin(gene_lookup)
        & interactions_df["protein2"].isin(gene_lookup)
    )
    not_self_loop = interactions_df["protein1"] != interactions_df["protein2"]
    has_score = interactions_df["combined_score"] > 0
    ea_interactions_df = interactions_df.loc[both_in_set & not_self_loop & has_score]

    G = nx.Graph()
    G.add_nodes_from(genes)
    # Bulk insert instead of a per-row iterrows() loop.
    G.add_edges_from(zip(ea_interactions_df["protein1"], ea_interactions_df["protein2"]))

    print("There are", len(G.nodes), "nodes in the PPI network")
    print("There are", len(G.edges), "edges in the PPI network")

    return G

In [None]:
# Combined gene universe and the overlap between the two disorders.
all_genes = epilepsy_genes | autism_genes
common_genes = epilepsy_genes & autism_genes

# Report the size of each gene set.
print(f"Total number of genes {len(all_genes)}")
print(f"Number of epilepsy genes {len(epilepsy_genes)}")
print(f"Number of autism genes {len(autism_genes)}")
print(f"Number of common genes {len(common_genes)}")

In [None]:
# Build the epilepsy-autism PPI network and save it as GEXF; the output
# file name depends on which network (WES or full) is being generated.
G = create_network(all_genes)
nx.write_gexf(
    G,
    os.path.join(GRAPH_DIR, "gene-ppi-wes-700.gexf" if WES else "gene-ppi-700.gexf"),
)

In [None]:
# nx.info() was removed in NetworkX 3.0. Use it when available (older
# releases); otherwise fall back to the graph's own string summary.
print(nx.info(G) if hasattr(nx, "info") else G)

In [None]:
# Export the genes (graph nodes) used to generate the phenotype network:
# a 'gene_symbol' header line followed by one gene per line.
output_filename = 'genes_wes.txt' if WES else 'genes.txt'

with open(output_filename, 'w') as f:
    f.write('gene_symbol' + '\n')
    f.writelines(n + "\n" for n in G.nodes)