# Supplemental Notebook - Upload Colocalized Networks to NDEx

### Set Up

In [3]:
import pandas as pd
import numpy as np
import networkx as nx
import os
import sys
import ndex2 as ndex
from tqdm import tqdm 
from ndex2.cx2 import RawCX2NetworkFactory, CX2Network
sys.path.append('../carva')
from network_utils import *
from geneset_utils import *
from hierarchy_utils import *
from statsmodels.stats.multitest import fdrcorrection

In [6]:
# Network identifiers used by later cells (presumably NDEx UUIDs, given the notebook title — confirm).
# template_uuid is forwarded to network_creation_wrapper as its `template_uuid` argument.
template_uuid = '8e9be4c5-246a-11f0-9806-005056ae3c32'
# Not referenced by any other visible cell; presumably an earlier PCNet upload — TODO confirm.
pcnet_uuid_old = 'd73d6357-e87b-11ee-9621-005056ae23aa'
# Identifier handed to load_network() when the PCNet graph is fetched.
pcnet_uuid = '8b4b54fa-e87d-11ee-9621-005056ae23aa'

In [None]:
# Prompt for credentials and create the ndex2 client.
from getpass import getpass
username=getpass('Username:')  # getpass is used for the username as well, so it is not echoed
password=getpass('Password:')
# `client` is not read by any later visible cell; username/password are what get passed on.
client = ndex.client.Ndex2(username=username, password=password)

In [None]:
# Original file layout: every input is read from ../outputs relative to the working directory.
# NOTE: z_dir and genelist_dir are reassigned by the "new file locations" cell further down.
cwd = os.getcwd()
datadir = os.path.join(cwd, '..')
z_dir = os.path.join(datadir, 'outputs')
genelist_dir = os.path.join(datadir, 'outputs')
coloc_df = pd.read_csv(os.path.join(datadir, 'outputs/STable2.tsv'), sep='\t')
# Keep the rows selected by the 'Optimal COLOC' column; .copy() makes the frame independent
# before a new column is added to it below.
best_df = coloc_df[coloc_df['Optimal COLOC']].copy()

# NOTE(review): despite the column name 'bonf_q', these values are element [1] of
# statsmodels' fdrcorrection() result, i.e. FDR-adjusted p-values rather than Bonferroni.
best_df['bonf_q'] = fdrcorrection(best_df['COLOC p'].values)[1]

In [4]:
#new file locations:
# Re-derives cwd/datadir exactly as the earlier path cell does, then points z_dir and
# genelist_dir at different sub-directories; whichever of the two cells runs last wins.
cwd = os.getcwd()
datadir = os.path.join(cwd, '..')
z_dir = os.path.join(datadir, 'out/pcnet2-entrez/lupus')
genelist_dir = os.path.join(datadir, 'data')


In [None]:
# Keep rows whose adjusted value in 'bonf_q' is below 0.01 and whose Log2SizeOE is positive.
trait_info = best_df[(best_df.bonf_q < 0.01) & (best_df.Log2SizeOE > 0)]

In [None]:
trait_info.shape

## Load PCNet2.0

In [7]:
# Fetch the network, then relabel every node with the integer form of its 'GeneID' attribute.
G = load_network(pcnet_uuid)
mapping = {node: int(attrs['GeneID']) for node, attrs in G.nodes(data=True)}
H = nx.relabel_nodes(G, mapping, copy=True)

Network Name:PCNet 2.2
Number of nodes: 18558
Number of edges: 3323928


In [8]:
# Collect the per-edge 'Number_of_Supporting_Databases' values, cast to int.
mapping_edge = {
    edge: int(count)
    for edge, count in nx.get_edge_attributes(H, 'Number_of_Supporting_Databases').items()
}

In [9]:
# Store the values from mapping_edge back on H's edges under the same attribute name.
nx.set_edge_attributes(H, mapping_edge, name='Number_of_Supporting_Databases')

In [11]:
print(type(H))

<class 'networkx.classes.graph.Graph'>


Add gene symbols

In [None]:
# Original node-map location under ../outputs; the following cell assigns pcnet_node_map
# again from a different path.
pcnet_node_map = pd.read_csv(os.path.join(datadir, 'outputs/pcnet_node_map.txt'), sep='\t')

In [12]:
# Local copy of the node-map file. (The upstream file lives in datadir/Reference-data,
# not in outputs.)
pcnet_node_map = pd.read_csv(os.path.join(datadir, 'networks_data/pcnet2_entrez/pcnet2_node_map.txt'), sep='\t')

In [13]:
# Attach the 'Symbol' column to H's nodes as the 'HGNC' attribute.
# NOTE(review): the dict is keyed by pcnet_node_map's index, and the read_csv calls that
# build pcnet_node_map pass no index_col — confirm that index really lines up with H's
# integer node labels, otherwise symbols land on the wrong (or no) nodes.
nx.set_node_attributes(H, pcnet_node_map['Symbol'].to_dict(), name='HGNC')

## Subnetwork creation functions

In [17]:
def get_z_scores(trait, z_dir):
    """Load the rare and common score tables for a trait pair and join them.

    Args:
        trait: Sequence; element 0 names the rare-variant table, element 1
            the common-variant table.
        z_dir: Directory handed to ``load_z``.

    Returns:
        DataFrame holding ``z_R``, ``z_C`` and their product ``Z_coloc`` for
        identifiers present in both tables. The former index is exposed as an
        integer ``Entrez`` column (the rename targets a column called
        ``'index'``, i.e. it relies on the loaded index being unnamed).
    """
    rare_name, common_name = trait[0], trait[1]
    z_rare = load_z(rare_name, z_dir, transform='neglog10', norm='sum', rorc='R').dropna()
    z_common = load_z(common_name, z_dir, transform='neglog10', norm='sum', rorc='C').dropna()

    # Inner join keeps only identifiers scored in both tables.
    z_df = z_rare.join(z_common, how='inner', lsuffix='_R', rsuffix='_C')
    z_df['Z_coloc'] = z_df['z_R'] * z_df['z_C']
    z_df = z_df.reset_index().rename(columns={'index': 'Entrez'})
    z_df['Entrez'] = z_df['Entrez'].astype(int)

    # Diagnostics only: report identifiers that occur more than once.
    counts = z_df['Entrez'].value_counts()
    repeated = counts > 1
    if repeated.any():
        print('DUPLICATED ENTREZ 1!!!')
        dupes = counts[repeated].index.values
        print(z_rare.loc[dupes])
        print(z_common.loc[dupes])
        print(z_df.loc[z_df.Entrez.isin(dupes)])
    return z_df

def load_z(trait, datadir, transform, norm, rorc):
    """Read one headerless, tab-separated score file into a single-column frame.

    The file name is ``{trait}_z_{rorc}V_q_{transform}_{norm}.tsv`` inside
    ``datadir``. The first field becomes the index, the value column is named
    ``z``, and rows containing missing values are dropped.
    """
    filename = f'{trait}_z_{rorc}V_q_{transform}_{norm}.tsv'
    path = os.path.join(datadir, filename)
    table = pd.read_csv(path, sep='\t', header=None, index_col=0, names=['z'])
    return table.dropna()

def load_genelists(trait, genelist_dir, rorc):
    """Read ``{trait}_{rorc}V.txt`` and return one row per Entrez id.

    The ``P-value`` and ``Gene Symbol`` columns are renamed to
    ``pval_{rorc}`` / ``symbol_{rorc}``. Rows are sorted by ascending p-value
    before de-duplication, so the lowest-p-value row of each Entrez id is the
    one that is kept.

    Returns:
        DataFrame with exactly the columns ``Entrez``, ``pval_{rorc}`` and
        ``symbol_{rorc}``.
    """
    pval_col = f'pval_{rorc}'
    symbol_col = f'symbol_{rorc}'
    path = os.path.join(genelist_dir, f'{trait}_{rorc}V.txt')
    genes = (
        pd.read_csv(path, sep='\t')
        .rename(columns={'P-value': pval_col, 'Gene Symbol': symbol_col})
        .sort_values(by=pval_col, ascending=True)
        .drop_duplicates(subset=['Entrez'])
    )
    return genes.loc[:, ['Entrez', pval_col, symbol_col]]

def assign_gene_classes(all_genes):
    """Flag each gene as rare / common / shared based on which p-values exist.

    Adds these columns to ``all_genes`` in place and returns the same frame:

    * ``rare``       - ``pval_R`` is present
    * ``common``     - ``pval_C`` is present
    * ``input_gene`` - either p-value is present
    * ``shared``     - both p-values are present
    * ``gene_class`` - ``'shared'``, ``'common'`` or ``'rare'``

    Args:
        all_genes: DataFrame with ``pval_R`` and ``pval_C`` columns.

    Returns:
        The mutated ``all_genes`` frame.
    """
    rare = all_genes['pval_R'].notna()
    common = all_genes['pval_C'].notna()
    shared = common & rare

    # Column insertion order is kept as before: input_gene, rare, common,
    # shared, gene_class.
    all_genes['input_gene'] = rare | common
    all_genes['rare'] = rare
    all_genes['common'] = common
    all_genes['shared'] = shared
    # Vectorized labelling replaces a row-wise apply(axis=1), which does not
    # yield a per-row Series on an empty frame and costs one Python call per
    # row. Precedence matches the old conditional chain: shared, then common,
    # and everything else (including rows with neither p-value) is 'rare'.
    all_genes['gene_class'] = np.select(
        [shared.to_numpy(), common.to_numpy()],
        ['shared', 'common'],
        default='rare',
    )
    return all_genes

def combine_input_z_info(all_genes, z_df):
    """Left-join the input-gene annotations onto the score table.

    Genes in ``z_df`` without a match in ``all_genes`` get
    ``input_gene = False`` and ``gene_class = 'Network'``.

    Args:
        all_genes: DataFrame with at least ``Entrez``, ``input_gene`` and
            ``gene_class`` columns.
        z_df: DataFrame with an ``Entrez`` column.

    Returns:
        The merged DataFrame (one row per ``z_df`` row, or more when
        ``all_genes`` repeats an Entrez id).
    """
    z_df = z_df.merge(all_genes, on=['Entrez'], how='left')

    # Filling the column and casting explicitly replaces the older
    # `pd.option_context("future.no_silent_downcasting", True)` +
    # `fillna({...}).infer_objects(copy=False)` approach.
    z_df['input_gene'] = z_df['input_gene'].fillna(False).astype(bool)
    z_df['gene_class'] = z_df['gene_class'].fillna('Network').astype(str)

    # Diagnostics only. The previous version also printed `z_rare` and
    # `z_common`, which do not exist in this scope, so the function raised
    # NameError exactly when duplicates were present.
    counts = z_df['Entrez'].value_counts()
    repeated = counts > 1
    if repeated.any():
        print('DUPLICATED ENTREZ 2!!!')
        dupes = counts[repeated].index.values
        print(all_genes.loc[all_genes.Entrez.isin(dupes)])
        print(z_df.loc[z_df.Entrez.isin(dupes)])
    return z_df

def define_colocalized_network(z_df, zth=1, zzth=3):
    """Mark colocalized genes and keep those plus all input genes.

    A gene is colocalized when ``Z_coloc > zzth`` and both ``z_R`` and
    ``z_C`` exceed ``zth``. The 0/1 flag is written to ``z_df['coloc_gene']``
    (the caller's frame is modified).

    Args:
        z_df: DataFrame with ``Z_coloc``, ``z_R``, ``z_C`` and ``input_gene``.
        zth: Strict lower bound applied to ``z_R`` and ``z_C`` individually.
        zzth: Strict lower bound applied to ``Z_coloc``.

    Returns:
        The rows of ``z_df`` where ``coloc_gene == 1`` or ``input_gene`` is
        true.
    """
    # Vectorized mask replaces a row-wise apply(axis=1), which does not yield
    # a per-row Series on an empty frame and is slow on large gene tables.
    # Comparisons against NaN are False, as they were in the per-row version.
    is_coloc = (z_df['Z_coloc'] > zzth) & (z_df['z_R'] > zth) & (z_df['z_C'] > zth)
    z_df['coloc_gene'] = is_coloc.astype(int)
    all_df = z_df[(z_df['coloc_gene'] == 1) | (z_df['input_gene'])]
    return all_df

In [18]:
def network_creation_wrapper(trait, H, z_dir, genelist_dir, template_uuid, username, password):
    """Build the colocalized subnetwork for one trait pair and save it locally.

    Args:
        trait: Sequence; ``trait[0]`` / ``trait[1]`` name the rare / common
            inputs (score tables and gene lists). ``trait[2]`` is only used by
            the disabled upload call.
        H: Graph whose node labels are integer Entrez ids.
        z_dir: Directory containing the score tables.
        genelist_dir: Directory containing the gene-list files.
        template_uuid, username, password: Only needed by the disabled upload
            call; kept so existing callers keep working.

    Returns:
        An independent copy of the subgraph of ``H`` induced by the
        colocalized and input genes, annotated with ``NPS_R``, ``NPS_C``,
        ``NPS_RC``, ``GeneClass``, ``COLOC Gene`` and ``InputGene`` node
        attributes.

    Side effects:
        Pickles the graph to
        ``<cwd>/../out/net_out/{trait[0]}_{trait[1]}_net1.gpickle``, where
        ``cwd`` is the notebook-level global.
    """
    # Local import keeps this cell self-contained; see the save step below.
    import pickle

    z_df = get_z_scores(trait, z_dir)
    r_genes = load_genelists(trait[0], genelist_dir, 'R')
    c_genes = load_genelists(trait[1], genelist_dir, 'C')
    all_genes = r_genes.merge(c_genes, on=['Entrez'], how='outer')
    all_genes['Entrez'] = all_genes.Entrez.astype(int)
    all_genes = assign_gene_classes(all_genes)
    z_df = combine_input_z_info(all_genes, z_df)
    coloc_df = define_colocalized_network(z_df, zth=1, zzth=3)

    # A subgraph *view* shares its node-attribute dicts with H, so annotating
    # the view would write trait-specific values into the shared PCNet graph
    # on every call. Copy first so H stays untouched.
    Gout = H.subgraph(coloc_df['Entrez'].tolist()).copy()

    # Index once, then copy each column onto the nodes under its export name.
    by_entrez = coloc_df.set_index('Entrez')
    node_attributes = (
        ('z_R', 'NPS_R'),
        ('z_C', 'NPS_C'),
        ('Z_coloc', 'NPS_RC'),
        ('gene_class', 'GeneClass'),
        ('coloc_gene', 'COLOC Gene'),
        ('input_gene', 'InputGene'),
    )
    for column, attr_name in node_attributes:
        nx.set_node_attributes(Gout, by_entrez[column].to_dict(), name=attr_name)

    #change so that it will not upload but save locally
    # nx.write_gpickle was removed in NetworkX 3.0; dumping with pickle writes
    # the same kind of file and works on both old and new NetworkX.
    out_path = os.path.join(cwd, f'../out/net_out/{trait[0]}_{trait[1]}_net1.gpickle')
    with open(out_path, 'wb') as fh:
        pickle.dump(Gout, fh, pickle.HIGHEST_PROTOCOL)

    #uuid = upload_network(Gout, trait[2], username= username, password=password, is_cx=False, template=template_uuid, networkset='287cafe2-1645-11f0-9806-005056ae3c32')
    #return uuid
    return Gout

## Create & save subnetworks (NDEx upload currently disabled)

In [None]:
# Build one network per (TraitR, TraitC, TRAIT) triple taken from trait_info.
# NOTE: network_creation_wrapper currently returns the graph itself (its upload call is
# commented out), so despite the name, uuid_list maps each triple to a graph, not a UUID.
# Filling the dict inside the loop keeps the finished entries if a later trait fails.
uuid_list = {}
trait_list = [x for x in zip(trait_info.TraitR.values, trait_info.TraitC.values, trait_info.TRAIT.values)]
for trait in tqdm(trait_list):
    uuid_list[trait] = network_creation_wrapper(trait, H, z_dir, genelist_dir, template_uuid, username, password)

In [19]:
# calling the net creation for a single file
# The last three arguments are placeholders: the wrapper's active code does not read
# template_uuid, username or password (they appear only in its commented-out upload call).
res_graph = network_creation_wrapper(("lupus", "lupus", "lupus"), H, z_dir, genelist_dir, "net123", "username", "password")

In [None]:
# Show the stored result for three traits, keyed by the third element of each trait triple.
{x[2]:uuid_list[x] for x in uuid_list if x[2] in ["Alzheimer disease", 'bipolar disorder', 'autism spectrum disorder']}

In [None]:
# Turn uuid_list into a table: the dict keys are meant to end up in column 'x' and are then
# split into TraitR / TraitC / TRAIT by position.
# NOTE(review): this assumes uuid_list values are UUID strings; with the wrapper returning
# graphs instead, this cell presumably needs the upload path re-enabled — confirm before use.
uuid_df = pd.DataFrame.from_dict(uuid_list, orient='index', columns=['uuid']).reset_index(names='x')
uuid_df['TraitR'] = uuid_df.x.apply(lambda y: y[0])
uuid_df['TraitC'] = uuid_df.x.apply(lambda y: y[1])
uuid_df['TRAIT'] = uuid_df.x.apply(lambda y: y[2])

In [None]:
# Write the table, without the helper column 'x', as tab-separated text under ../outputs.
uuid_df.drop(columns=['x']).to_csv(os.path.join(cwd, '../outputs/uuid_list.txt'), sep='\t', index=False)