## Phenotype Networks: Analysis and Visualization

In [None]:
%matplotlib inline
from collections import defaultdict
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import os

# statistics
from scipy.stats import chi2_contingency, mannwhitneyu
from scipy.stats.mstats import kruskalwallis
from scikit_posthocs import posthoc_dunn

# display dataframes
from IPython.display import display

# For graph title
import re as re_title

from math import log10, log2
import itertools

import networkx as nx
import py4cytoscape as p4c
from tqdm import tqdm
import pickle

pd.set_option('display.max_rows', 500)
np.set_printoptions(threshold=500)
pd.options.mode.chained_assignment = None  # default='warn'

import warnings
warnings.filterwarnings("default", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning) 

In [None]:
def graph_title(networkmetric):
    """
    Split a CamelCase network metric name into space-separated words.

    Parameters
    __________
    networkmetric : str
        Network metric of interest

    Returns
    _______
    graph_title : str
        Network metric with words separated for visualization
    """
    # A word starts at a capital letter and runs up to the next capital letter;
    # characters before the first capital letter do not belong to any word.
    word_list = re_title.findall(r'[A-Z][^A-Z]*', networkmetric)

    # join() puts exactly one space between consecutive words, also when an
    # earlier word is spelled the same as the last one.
    return ' '.join(word_list)

In [None]:
# Make sure there's a cytoscape connection
# Cytoscape needs to be open to communicate with it via python
p4c.cytoscape_ping()

In [None]:
n = 'phenotype' 

In [None]:
diagkeys = ['phenotype']

In [None]:
%run -i setup_functions.py

In [None]:
# parameters
# Cohort sizes are hard-coded; they are passed to countPtsDiagnosis_Dict further down.
total_ad = 1688 #Total MatchIt patients with AD
total_con = total_ad * 2 #Total MatchIt control patients (twice total_ad)
# NOTE(review): cutoff is not referenced in the visible cells -- presumably read by the
# helpers loaded through `%run -i setup_functions.py`; confirm before changing it.
cutoff = .01

# Patients with Alzheimer's (AD)

In [None]:
# Load the phecode-mapped diagnoses of the patients with AD
ad_diag_all = pd.read_csv('Diagnoses/phecode_diagnoses/ad_diagnoses.csv')

# Column that indicates the order of each icd10_chapter
# NOTE: icd10_chapter ROUGHLY corresponds to icd-10 chapters, and some chapters are not included
chapter_order = ad_diag_all['icd10_chapter'].apply(ICDname_order)
ad_diag_all['chp_order'] = chapter_order

# Keep only the diagnoses whose phecode is organized into an ICD-10 inspired chapter
has_chapter = ad_diag_all['icd10_chapter'].notnull()
ad_diag = ad_diag_all[has_chapter]

### Add demographic data

In [None]:
ad_demo = pd.read_csv('Demographics/ad_demographics.csv')

In [None]:
# Attach each patient's race/ethnicity to every diagnosis row
# (left join: diagnosis rows are kept even without a demographics match)
demo_columns = ['person_id', 'UCSFDerivedRaceEthnicity_Clean']
ad_diag = ad_diag.merge(ad_demo[demo_columns], how='left', on='person_id')

#### Only keep AD patients from MatchIt

In [None]:
# Get person_ids for MatchIt Alzheimer's and Control patients
ad_MatchIt = pd.read_csv('Demographics/RE_MI_ad_demo.csv')
con_MatchIt = pd.read_csv('Demographics/RE_MI_con_demo.csv')

In [None]:
ad_diag = ad_diag[ad_diag['person_id'].isin(ad_MatchIt['person_id'])]

In [None]:
ad_diag_count = countPtsDiagnosis_Dict(ad_diag, total_ad)

Number of patients stratified by race/ethnicity

In [None]:
ad_diag['UCSFDerivedRaceEthnicity_Clean'].unique()

In [None]:
# Number of distinct patients with AD in each race/ethnicity group
numread = dict()

for re in ad_diag['UCSFDerivedRaceEthnicity_Clean'].unique():
    in_group = ad_diag['UCSFDerivedRaceEthnicity_Clean'] == re
    patient_rows = ad_diag.loc[in_group, ['person_id', 'UCSFDerivedRaceEthnicity_Clean']]
    # One row per patient remains after de-duplication
    numread[re] = len(patient_rows.drop_duplicates())

In [None]:
numread

# Control patients

In [None]:
# Load the phecode-mapped diagnoses of the control patients
con_diag_all = pd.read_csv('Diagnoses/phecode_diagnoses/con_diagnoses.csv')

# Column that indicates the order of each icd10_chapter
# NOTE: icd10_chapter ROUGHLY corresponds to icd-10 chapters, and some chapters are not included
chapter_order = con_diag_all['icd10_chapter'].apply(ICDname_order)
con_diag_all['chp_order'] = chapter_order

# Keep only the diagnoses whose phecode is organized into an ICD-10 inspired chapter
has_chapter = con_diag_all['icd10_chapter'].notnull()
con_diag = con_diag_all[has_chapter]

In [None]:
con_diag_all['person_id'].unique().shape

In [None]:
# some patients without diagnoses lost when filtering for diagnoses mapped to phecodes
# that have ICD-10 inspired chapters; will add them back in after adding demographic data
con_diag['person_id'].unique().shape

### Add demographic data

In [None]:
con_demo = pd.read_csv('Demographics/con_demographics.csv')

In [None]:
# Left-join the diagnoses onto the person_ids in con_demo so that patients without a
# retained diagnosis still have a row (their diagnosis columns are NaN)
con_person_ids = con_demo[['person_id']]
con_diag = con_person_ids.merge(con_diag, how='left', on='person_id')

In [None]:
# Should have the same number of patients as con_diag_all['person_id'].nunique()
con_diag['person_id'].unique().shape

#### Only keep control patients from MatchIt

In [None]:
# Get person_ids for MatchIt Control patients
con_MatchIt = pd.read_csv('Demographics/RE_MI_con_demo.csv')

In [None]:
# Attach each patient's race/ethnicity to every diagnosis row
# (left join: rows are kept even without a demographics match)
demo_columns = ['person_id', 'UCSFDerivedRaceEthnicity_Clean']
con_diag = con_diag.merge(con_demo[demo_columns], how='left', on='person_id')

In [None]:
con_diag = con_diag[con_diag['person_id'].isin(con_MatchIt['person_id'])]

In [None]:
con_diag_count = countPtsDiagnosis_Dict(con_diag, total_con)

Number of patients stratified by race/ethnicity

In [None]:
con_diag['UCSFDerivedRaceEthnicity_Clean'].unique()

In [None]:
# Number of distinct control patients in each race/ethnicity group
numrecon = dict()

for re in con_diag['UCSFDerivedRaceEthnicity_Clean'].unique():
    in_group = con_diag['UCSFDerivedRaceEthnicity_Clean'] == re
    patient_rows = con_diag.loc[in_group, ['person_id', 'UCSFDerivedRaceEthnicity_Clean']]
    # One row per patient remains after de-duplication
    numrecon[re] = len(patient_rows.drop_duplicates())

In [None]:
numrecon

## AD graph

In [None]:
# Initialize graph
AD_Graph = nx.Graph()
# Add nodes to graph
AD_nodes = ad_diag_count[n][n]
AD_Graph.add_nodes_from(AD_nodes)

In [None]:
# For each disease, get AD/control distribution and unique patients
print('Count number of patients per node...')
diagtemp = ad_diag[['person_id',n,'UCSFDerivedRaceEthnicity_Clean']].drop_duplicates()

In [None]:
# Build a per-phenotype summary table (index = phenotype) with two columns:
#   person_id                      -> number of distinct patients with that phenotype
#   UCSFDerivedRaceEthnicity_Clean -> dict {race/ethnicity: number of rows with that phenotype}
# diagtemp holds de-duplicated (person_id, phenotype, race/ethnicity) rows at this point,
# so each row counted by value_counts() is one patient-race/ethnicity pair.
diagtemp = pd.pivot_table(diagtemp, 
                          values=['person_id', 'UCSFDerivedRaceEthnicity_Clean'], 
                          index=n,
                          aggfunc={'person_id' : lambda x: len(x.unique()), 
                                   'UCSFDerivedRaceEthnicity_Clean' : lambda x: dict(x.value_counts())})

In [None]:
diagtemp = diagtemp.sort_values('person_id', ascending=False)

In [None]:
AD_node_attr = diagtemp[diagtemp.index.isin(AD_nodes)]

In [None]:
# Add race/ethnicity information
print('Set race/ethnicity as Attributes...')

In [None]:
# Expand each node's {race/ethnicity: patient count} dict into one count column and one
# percentage column per race/ethnicity group.
for re in ad_diag['UCSFDerivedRaceEthnicity_Clean'].unique():
    # A group that is absent from the dict has no patients with that phenotype
    AD_node_attr[re] = AD_node_attr['UCSFDerivedRaceEthnicity_Clean'].apply(lambda counts: counts.get(re, 0))
    # Percentage of the group's patients (numread) that have the phenotype.
    # str(re) keeps the column name buildable when the group label is not a string
    # (e.g. NaN for a missing race/ethnicity), as the edge-attribute loop already does.
    AD_node_attr['p'+str(re)] = AD_node_attr['UCSFDerivedRaceEthnicity_Clean']\
                                .apply(lambda counts: 0 if re not in counts else counts[re]*100/numread[re])

In [None]:
AD_node_attr.head(3)

In [None]:
# Shorten each percent column to 'p' + first letter of the race/ethnicity, and relabel
# the identified race and ethnicity count columns to match the UC-wide analysis
node_column_names = {
    'pAsian' : 'pA',
    'pBlack or African American' : 'pB',
    'pLatinx' : 'pL',
    'pWhite or Caucasian' : 'pW',
    'Black or African American' : 'Black',
    'Latinx' : 'Latine',
    'White or Caucasian' : 'White',
}
AD_node_attr = AD_node_attr.rename(columns=node_column_names)

In [None]:
# The per-group dict column has been expanded into separate columns; drop it and
# label the patient count column 'PtCount'
AD_node_attr = (AD_node_attr
                .drop(columns='UCSFDerivedRaceEthnicity_Clean')
                .rename(columns={'person_id' : 'PtCount'}))

**Merge in icd10_chapter information**

In [None]:
ad_diag_pheno = ad_diag[['phenotype', 'icd10_chapter']].drop_duplicates().set_index('phenotype')

In [None]:
ad_diag_pheno.head(3)

In [None]:
print('Adding icd10_chapter information...')
AD_node_attr = AD_node_attr.merge(ad_diag_pheno, left_index=True, right_index=True, how='left')

In [None]:
AD_node_attr.head(3)

In [None]:
AD_node_attr.shape

In [None]:
# Save phenotypes 
AD_node_attr.index.to_frame().to_csv('Tables/AD_ntwrk_phenotypes.csv', index=False)

In [None]:
AD_node_attr = AD_node_attr.to_dict(orient='index') # Make the columns into a dictionary for node attributes.

In [None]:
nx.set_node_attributes(AD_Graph, AD_node_attr)

In [None]:
print('Create all edges...')

# make a dataframe of edges
diagtemp = ad_diag[['person_id',n,'UCSFDerivedRaceEthnicity_Clean']].drop_duplicates()
diagtemp = diagtemp[diagtemp[n].isin(AD_nodes)]
grouped = diagtemp.groupby('person_id')

In [None]:
# For every patient, list all unordered pairs of that patient's phenotypes; each pair
# is one candidate edge of the AD network.
AD_edges = []
# Iterate the groupby lazily (one patient at a time) instead of first copying every
# group into a list; total= keeps the patient count on the progress bar.
for k, pt in tqdm(grouped, total=grouped.ngroups):
    # Sorting the phenotypes first means the same two phenotypes always form the
    # same tuple, whatever order they appear in for a given patient.
    combo_list = itertools.combinations(pt[n].sort_values(), r=2)
    # Wrap each pair in a 1-tuple so the DataFrame keeps the pair in a single column
    combo_list = [(item,) for item in combo_list]
    combo_df = pd.DataFrame(combo_list, columns=[n+'Combo']).drop_duplicates()
    # All rows of a group belong to one patient, so the first row supplies the id
    # and the race/ethnicity recorded for every pair
    combo_df['person_id'] = pt['person_id'].values[0]
    combo_df['UCSFDerivedRaceEthnicity_Clean'] = pt['UCSFDerivedRaceEthnicity_Clean'].values[0]
    AD_edges.append(combo_df)

In [None]:
AD_edges[0].head(3)

In [None]:
AD_edges = pd.concat(AD_edges).reset_index(drop=True)

In [None]:
AD_edges.head(3)

In [None]:
# Add to graph
AD_Graph.add_edges_from(AD_edges[n+'Combo'])

In [None]:
# Count the number of patients for each edge (phenotype pair).
# Result has one row per edge with two columns:
#   person_id                      -> number of distinct patients having both phenotypes
#   UCSFDerivedRaceEthnicity_Clean -> dict {race/ethnicity: number of rows for that edge}
# May need to wait a few hours
diagtemp = AD_edges
diagtemp = pd.pivot_table(diagtemp, 
                          values=['person_id','UCSFDerivedRaceEthnicity_Clean'], 
                          index=n+'Combo',
                          aggfunc={'person_id': lambda x: len(x.unique()), 
                                   'UCSFDerivedRaceEthnicity_Clean': lambda x: dict(x.value_counts())})

# Most frequently shared edges first
print('sorting...')
diagtemp = diagtemp.sort_values('person_id', ascending=False)

Add edge attributes

In [None]:
diagtemp.head(3)

In [None]:
AD_edge_attr = diagtemp

In [None]:
# Split each edge's {race/ethnicity: patient count} dict into a count column and a
# percent-of-group column for every race/ethnicity
edge_race_counts = AD_edge_attr['UCSFDerivedRaceEthnicity_Clean']
for re in ad_diag['UCSFDerivedRaceEthnicity_Clean'].unique():
    # Groups missing from the dict have no patients with this edge
    AD_edge_attr[re] = edge_race_counts.apply(lambda counts: counts.get(re, 0))
    AD_edge_attr['p'+str(re)] = edge_race_counts.apply(
        lambda counts: counts[re]*100/numread[re] if re in counts else 0)

In [None]:
AD_edge_attr.head(3)

In [None]:
# Shorten each percent column to 'p' + first letter of the race/ethnicity and
# relabel the count columns
edge_column_names = {
    'pAsian' : 'pA',
    'pBlack or African American' : 'pB',
    'pLatinx' : 'pL',
    'pWhite or Caucasian' : 'pW',
    'Black or African American' : 'Black',
    'Latinx' : 'Latine',
    'White or Caucasian' : 'White',
}
AD_edge_attr = AD_edge_attr.rename(columns=edge_column_names)

In [None]:
# The per-group dict column has been expanded into separate columns; drop it and
# label the patient count column 'PtCount'
AD_edge_attr = AD_edge_attr.drop('UCSFDerivedRaceEthnicity_Clean', axis=1)
AD_edge_attr = AD_edge_attr.rename(columns={'person_id' : 'PtCount'})

# Make dictionary keyed by edge: {phenotype pair: {attribute: value}}
AD_edge_attr = AD_edge_attr.to_dict(orient='index')
nx.set_edge_attributes(AD_Graph, AD_edge_attr)

# nx.info() was removed in NetworkX 3.0; where it is unavailable, printing the graph
# itself gives its summary
print(nx.info(AD_Graph) if hasattr(nx, 'info') else AD_Graph)

In [None]:
# Save file
# makedirs creates 'Network_Analysis' and 'Network_Analysis/ADCon_phe' when they are
# missing and does nothing when they already exist
os.makedirs('Network_Analysis/ADCon_phe', exist_ok=True)
nx.write_graphml(AD_Graph,'Network_Analysis/ADCon_phe/'+n+'graph_AD_ADCon.graphml')

## Control graph

In [None]:
# Initialize graph
con_Graph = nx.Graph()
# Add nodes to graph
con_nodes = con_diag_count[n][n]
con_Graph.add_nodes_from(con_nodes)

In [None]:
# For each disease, get AD/control distribution and unique patients
print('Count number of patients per node...')
diagtemp = con_diag[['person_id',n,'UCSFDerivedRaceEthnicity_Clean']].drop_duplicates()

In [None]:
# Build a per-phenotype summary table (index = phenotype) with two columns:
#   person_id                      -> number of distinct patients with that phenotype
#   UCSFDerivedRaceEthnicity_Clean -> dict {race/ethnicity: number of rows with that phenotype}
# diagtemp holds de-duplicated (person_id, phenotype, race/ethnicity) rows at this point,
# so each row counted by value_counts() is one patient-race/ethnicity pair.
diagtemp = pd.pivot_table(diagtemp, 
                          values=['person_id', 'UCSFDerivedRaceEthnicity_Clean'], 
                          index=n,
                          aggfunc={'person_id' : lambda x: len(x.unique()), 
                                   'UCSFDerivedRaceEthnicity_Clean' : lambda x: dict(x.value_counts())})

In [None]:
diagtemp = diagtemp.sort_values('person_id', ascending=False)

In [None]:
con_node_attr = diagtemp[diagtemp.index.isin(con_nodes)]

In [None]:
con_node_attr.head(3)

In [None]:
# Add race/ethnicity information
print('Set race/ethnicity as Attributes...')

In [None]:
# Expand each node's {race/ethnicity: patient count} dict into one count column and one
# percentage column per race/ethnicity group.
for re in con_diag['UCSFDerivedRaceEthnicity_Clean'].unique():
    # A group that is absent from the dict has no patients with that phenotype
    con_node_attr[re] = con_node_attr['UCSFDerivedRaceEthnicity_Clean'].apply(lambda counts: counts.get(re, 0))
    # Percentage of the group's patients (numrecon) that have the phenotype.
    # str(re) keeps the column name buildable when the group label is not a string
    # (e.g. NaN for a missing race/ethnicity), as the edge-attribute loop already does.
    con_node_attr['p'+str(re)] = con_node_attr['UCSFDerivedRaceEthnicity_Clean']\
                                 .apply(lambda counts: 0 if re not in counts else counts[re]*100/numrecon[re])

In [None]:
con_node_attr.head(3)

In [None]:
# Shorten each percent column to 'p' + first letter of the race/ethnicity and
# relabel the count columns
node_column_names = {
    'pAsian' : 'pA',
    'pBlack or African American' : 'pB',
    'pLatinx' : 'pL',
    'pWhite or Caucasian' : 'pW',
    'Black or African American' : 'Black',
    'Latinx' : 'Latine',
    'White or Caucasian' : 'White',
}
con_node_attr = con_node_attr.rename(columns=node_column_names)

In [None]:
# The per-group dict column has been expanded into separate columns; drop it and
# label the patient count column 'PtCount'
con_node_attr = (con_node_attr
                 .drop(columns='UCSFDerivedRaceEthnicity_Clean')
                 .rename(columns={'person_id' : 'PtCount'}))

**Merge in icd10_chapter information**

In [None]:
con_diag_pheno = con_diag[['phenotype', 'icd10_chapter']].drop_duplicates().set_index('phenotype')

In [None]:
print('Adding icd10_chapter information...')
con_node_attr = con_node_attr.merge(con_diag_pheno, left_index=True, right_index=True, how='left')

In [None]:
con_node_attr.head(3)

In [None]:
con_node_attr.shape

In [None]:
con_node_attr.index.to_frame()

In [None]:
# Save phenotypes 
con_node_attr.index.to_frame().to_csv('Tables/con_ntwrk_phenotypes.csv', index=False)

In [None]:
con_node_attr = con_node_attr.to_dict(orient='index') # Make the columns into a dictionary for node attributes.

In [None]:
nx.set_node_attributes(con_Graph, con_node_attr)

In [None]:
print('Create all edges...')

# make a dataframe of edges
diagtemp = con_diag[['person_id',n,'UCSFDerivedRaceEthnicity_Clean']].drop_duplicates()
diagtemp = diagtemp[diagtemp[n].isin(con_nodes)]
grouped = diagtemp.groupby('person_id')

In [None]:
# For every patient, list all unordered pairs of that patient's phenotypes; each pair
# is one candidate edge of the control network.
con_edges = []
# Iterate the groupby lazily (one patient at a time) instead of first copying every
# group into a list; total= keeps the patient count on the progress bar.
for k, pt in tqdm(grouped, total=grouped.ngroups):
    # Sorting the phenotypes first means the same two phenotypes always form the
    # same tuple, whatever order they appear in for a given patient.
    combo_list = itertools.combinations(pt[n].sort_values(), r=2)
    # Wrap each pair in a 1-tuple so the DataFrame keeps the pair in a single column
    combo_list = [(item,) for item in combo_list]
    combo_df = pd.DataFrame(combo_list, columns=[n+'Combo']).drop_duplicates()
    # All rows of a group belong to one patient, so the first row supplies the id
    # and the race/ethnicity recorded for every pair
    combo_df['person_id'] = pt['person_id'].values[0]
    combo_df['UCSFDerivedRaceEthnicity_Clean'] = pt['UCSFDerivedRaceEthnicity_Clean'].values[0]
    con_edges.append(combo_df)

2049 out of 3376 patients have phenotypes associated with them

In [None]:
con_edges[2].head(3)

In [None]:
con_edges = pd.concat(con_edges).reset_index(drop=True)

In [None]:
con_edges.head(3)

In [None]:
# Add to graph
con_Graph.add_edges_from(con_edges[n+'Combo'])

In [None]:
# Count the number of patients for each edge (phenotype pair).
# Result has one row per edge with two columns:
#   person_id                      -> number of distinct patients having both phenotypes
#   UCSFDerivedRaceEthnicity_Clean -> dict {race/ethnicity: number of rows for that edge}
# Need to wait a few hours
diagtemp = con_edges
diagtemp = pd.pivot_table(diagtemp, 
                          values=['person_id','UCSFDerivedRaceEthnicity_Clean'], 
                          index=n+'Combo',
                          aggfunc={'person_id': lambda x: len(x.unique()), 
                                   'UCSFDerivedRaceEthnicity_Clean': lambda x: dict(x.value_counts())})

# Most frequently shared edges first
print('sorting...')
diagtemp = diagtemp.sort_values('person_id', ascending=False)

Add edge attributes

In [None]:
con_edge_attr = diagtemp

In [None]:
# Split each edge's {race/ethnicity: patient count} dict into a count column and a
# percent-of-group column for every race/ethnicity
edge_race_counts = con_edge_attr['UCSFDerivedRaceEthnicity_Clean']
for re in con_diag['UCSFDerivedRaceEthnicity_Clean'].unique():
    # Groups missing from the dict have no patients with this edge
    con_edge_attr[re] = edge_race_counts.apply(lambda counts: counts.get(re, 0))
    con_edge_attr['p'+str(re)] = edge_race_counts.apply(
        lambda counts: counts[re]*100/numrecon[re] if re in counts else 0)

In [None]:
con_edge_attr.head(3)

In [None]:
# Shorten each percent column to 'p' + first letter of the race/ethnicity and
# relabel the count columns
edge_column_names = {
    'pAsian' : 'pA',
    'pBlack or African American' : 'pB',
    'pLatinx' : 'pL',
    'pWhite or Caucasian' : 'pW',
    'Black or African American' : 'Black',
    'Latinx' : 'Latine',
    'White or Caucasian' : 'White',
}
con_edge_attr = con_edge_attr.rename(columns=edge_column_names)

In [None]:
# The per-group dict column has been expanded into separate columns; drop it and
# label the patient count column 'PtCount'
con_edge_attr = con_edge_attr.drop('UCSFDerivedRaceEthnicity_Clean', axis=1)
con_edge_attr = con_edge_attr.rename(columns={'person_id' : 'PtCount'})

# Make dictionary keyed by edge: {phenotype pair: {attribute: value}}
con_edge_attr = con_edge_attr.to_dict(orient='index')
nx.set_edge_attributes(con_Graph, con_edge_attr)

# nx.info() was removed in NetworkX 3.0; where it is unavailable, printing the graph
# itself gives its summary
print(nx.info(con_Graph) if hasattr(nx, 'info') else con_Graph)

In [None]:
pd.DataFrame(con_edge_attr).transpose()

In [None]:
# Save file
# makedirs creates 'Network_Analysis' and 'Network_Analysis/ADCon_phe' when they are
# missing and does nothing when they already exist
os.makedirs('Network_Analysis/ADCon_phe', exist_ok=True)
nx.write_graphml(con_Graph,'Network_Analysis/ADCon_phe/'+n+'graph_con_ADCon.graphml')

### Network statistics

Make race/ethnicity-stratified AD and control graphs, keeping only the nodes (phenotypes) and edges (phenotype pairs) that are present in more than 5% of the patients within each race/ethnicity group

In [None]:
graph_cutoff = 5

In [None]:
# pA - percent Asian
# pB - percent Black
# pL - percent Latine
# pW - percent White
pct_re = ['pA', 'pB', 'pL', 'pW']
ad_re = ['Asian_AD', 'Black_AD', 'Latine_AD', 'White_AD']
con_re = ['Asian_con', 'Black_con', 'Latine_con', 'White_con']

In [None]:
AD_Graph.name = 'phenotype'
con_Graph.name = 'phenotype'

In [None]:
# nx.info() was removed in NetworkX 3.0; where it is unavailable, printing the graph
# itself gives its summary
print(nx.info(AD_Graph) if hasattr(nx, 'info') else AD_Graph)

In [None]:
# nx.info() was removed in NetworkX 3.0; where it is unavailable, printing the graph
# itself gives its summary
print(nx.info(con_Graph) if hasattr(nx, 'info') else con_Graph)

In [None]:
UCSF_networks = dict()

AD graphs

In [None]:
def _graph_summary(graph):
    """Return a text summary of graph; nx.info() was removed in NetworkX 3.0."""
    return nx.info(graph) if hasattr(nx, 'info') else str(graph)


# Build one AD sub-network per race/ethnicity: keep the nodes and edges whose percentage
# of patients in that group (the pct attribute) is above graph_cutoff.
for pct, label in zip(pct_re, ad_re):
    # Nodes without the attribute are excluded, the same way edges without it are.
    # (The comprehension variable is not called n: n holds the diagnosis key that is
    # used in the file name below.)
    selected_nodes = [node_id for node_id, attrs in AD_Graph.nodes(data=True)
                      if pct in attrs and attrs[pct] > graph_cutoff]
    AD_Graph_x = AD_Graph.subgraph(selected_nodes).copy()

    # Only edges between selected nodes exist in the subgraph, so scan those instead
    # of every edge of the full graph. Collect first and remove afterwards so the
    # graph is not modified while it is being iterated.
    edges_to_remove = [(u, v) for u, v, attrs in AD_Graph_x.edges(data=True)
                       if pct not in attrs or attrs[pct] <= graph_cutoff]
    AD_Graph_x.remove_edges_from(edges_to_remove)

    AD_Graph_x.name = label + '_phenotype'
    print('before singletons removed:\n', _graph_summary(AD_Graph_x), '\n')

    # Remove singletons
    AD_Graph_x.remove_nodes_from(list(nx.isolates(AD_Graph_x)))
    print('after singletons removed:\n', _graph_summary(AD_Graph_x), '\n')

    # Save file
    nx.write_graphml(AD_Graph_x,'Network_Analysis/ADCon_phe/'+n+'_'+label+'_'+str(graph_cutoff)+'.graphml')

    # Create network in Cytoscape
    network_title = label+'_'+str(graph_cutoff)
    p4c.create_network_from_networkx(AD_Graph_x, title=network_title)

    # Add info to UCSF networks dictionary (network title -> Cytoscape SUID)
    UCSF_networks[network_title] = p4c.get_network_suid(title=network_title)

control graphs

In [None]:
def _graph_summary(graph):
    """Return a text summary of graph; nx.info() was removed in NetworkX 3.0."""
    return nx.info(graph) if hasattr(nx, 'info') else str(graph)


# Build one control sub-network per race/ethnicity: keep the nodes and edges whose
# percentage of patients in that group (the pct attribute) is above graph_cutoff.
for pct, label in zip(pct_re, con_re):
    # Nodes without the attribute are excluded, the same way edges without it are.
    # (The comprehension variable is not called n: n holds the diagnosis key that is
    # used in the file name below.)
    selected_nodes = [node_id for node_id, attrs in con_Graph.nodes(data=True)
                      if pct in attrs and attrs[pct] > graph_cutoff]
    con_Graph_x = con_Graph.subgraph(selected_nodes).copy()

    # Only edges between selected nodes exist in the subgraph, so scan those instead
    # of every edge of the full graph. Collect first and remove afterwards so the
    # graph is not modified while it is being iterated.
    edges_to_remove = [(u, v) for u, v, attrs in con_Graph_x.edges(data=True)
                       if pct not in attrs or attrs[pct] <= graph_cutoff]
    con_Graph_x.remove_edges_from(edges_to_remove)

    con_Graph_x.name = label + '_phenotype'
    print('before singletons removed:\n', _graph_summary(con_Graph_x), '\n')

    # Remove singletons
    con_Graph_x.remove_nodes_from(list(nx.isolates(con_Graph_x)))
    print('after singletons removed:\n', _graph_summary(con_Graph_x), '\n')

    # Save file
    nx.write_graphml(con_Graph_x,'Network_Analysis/ADCon_phe/'+n+'_'+label+'_'+str(graph_cutoff)+'.graphml')

    # Create network in Cytoscape
    network_title = label+'_'+str(graph_cutoff)
    p4c.create_network_from_networkx(con_Graph_x, title=network_title)

    # Add info to UCSF networks dictionary (network title -> Cytoscape SUID)
    UCSF_networks[network_title] = p4c.get_network_suid(title=network_title)

In [None]:
UCSF_networks

In [None]:
# Summary network statistics
# Each network is made the current Cytoscape network before analyze_network() is
# called; the result is stored under the network's title
ntwrk_summ_stats = dict()

for network in UCSF_networks:
    p4c.set_current_network(network=network)
    network_stats = p4c.analyze_network()
    ntwrk_summ_stats[network] = network_stats

In [None]:
#ntwrk_summ_stats

In [None]:
# Save summary statistics (one row per network, sorted by network title)
# The path is built with os.path.join because a hard-coded '\\' is a directory
# separator on Windows only
summ_stats_path = os.path.join('Network_Analysis', 'ADCon_phe', 'summ_stats_'+str(graph_cutoff)+'.csv')
pd.DataFrame(ntwrk_summ_stats).transpose().sort_index().to_csv(summ_stats_path,
                                                               index_label='network')

Compare AD networks with each other

In [None]:
network_metrics = ['AverageShortestPathLength',
                   'ClusteringCoefficient',
                   'ClosenessCentrality',
                   'Eccentricity',
                   'Stress',
                   'Degree',
                   'BetweennessCentrality',
                   'NeighborhoodConnectivity',
                   'Radiality',
                   'TopologicalCoefficient']

In [None]:
# Retrieve node tables: network title -> node table returned by Cytoscape
network_metric_values = {
    network: p4c.get_table_columns(table='node',
                                   namespace='default',
                                   network=network,
                                   base_url='http://127.0.0.1:1234/v1')
    for network in UCSF_networks
}

In [None]:
for network in network_metric_values:
    print(network_metric_values[network].shape)

In [None]:
# Save node tables, one CSV per network
# makedirs creates the folder (and any missing parent) and does nothing if it exists
os.makedirs('Network_Analysis/ADCon_phe/node_tables', exist_ok=True)
for network in network_metric_values:
    pd.DataFrame(network_metric_values[network]).to_csv('Network_Analysis/ADCon_phe/node_tables/'+network+'_node_table.csv')

Comparing AD networks with each other

In [None]:
# For every node-level metric: histogram of its values in the four AD networks (one
# subplot per metric) and a Kruskal-Wallis test for a difference between the networks.
fig, axs = plt.subplots(5,2, figsize=(25,20))

# Metrics whose 'p-value < 0.05' note goes in the upper right instead of the upper left
# (list was chosen after seeing the distributions)
annotate_right = ['ClosenessCentrality',
                  'Stress',
                  'Degree',
                  'BetweennessCentrality',
                  'NumberOfUndirectedEdges']

# axs.flat walks the 5x2 grid row by row, so each metric gets the next subplot
for ax, metric in zip(axs.flat, network_metrics):
    # Metric values of every node in the Asian, Black, Latine and White AD networks
    A_AD, B_AD, L_AD, W_AD = [
        p4c.get_table_columns(table='node',
                              columns=metric,
                              namespace='default',
                              network=network+'_'+str(graph_cutoff),
                              base_url='http://127.0.0.1:1234/v1')[metric].to_list()
        for network in ['Asian_AD', 'Black_AD', 'Latine_AD', 'White_AD']
    ]

    # Kruskal Wallis
    stat, pval = kruskalwallis(A_AD, B_AD, L_AD, W_AD)

    # Add metric distribution to subplot
    ax.hist([A_AD, B_AD, L_AD, W_AD], color=['#66C2A5', '#FC8D62', '#8DA0CB', '#E78AC3'])
    ax.set_title(graph_title(metric), fontsize=18, fontweight='bold')
    ax.set_ylabel('\n \n')
    ax.tick_params(axis='both', which='both', labelsize=18)
    if pval < 0.05:
        ax.text(0.9 if metric in annotate_right else 0.1,
                0.9,
                'p-value < 0.05',
                horizontalalignment='center',
                verticalalignment='center',
                transform=ax.transAxes, fontsize=16)

    if pval < 0.05:
        sig = 'significantly different'
    else:
        sig = 'not significantly different'

    print(metric + " for AD patients' comparison is {}; statistic is {} and p-value is {}.".format(sig,
                                                                                                   stat,
                                                                                                   pval))
    print('\n')

fig.legend(['Asian', 'Black', 'Latine', 'White'],
           loc='upper right',
           bbox_to_anchor=(1, 1.1),
           fontsize=18)

fig.tight_layout()

# Save figure
# makedirs also creates 'Figures' when it is missing and does nothing if the folder exists
os.makedirs('Figures/Network_Analysis', exist_ok=True)
plt.savefig('Figures/Network_Analysis/SuppFig_5.pdf', bbox_inches='tight')

plt.show()

### Save node-level metric values for patients with AD as input for Dunn's test (test statistics are derived with the R package `dunn.test`)

In [None]:
base_dir = os.getcwd()

In [None]:
# Folder for the per-metric files exported in the next cell.
# os.path.join uses the platform's separator ('\\' is one on Windows only); makedirs
# also creates 'Tables' when it is missing and does nothing if the folder exists.
os.makedirs(os.path.join(base_dir, 'Tables', 'R_Dunn'), exist_ok=True)

In [None]:
# Export, for every metric, the node-level values of each AD network to
# Tables/R_Dunn/<metric>/<prefix>_AD_<metric>.csv
ad_network_prefixes = {'A': 'Asian_AD', 'B': 'Black_AD', 'L': 'Latine_AD', 'W': 'White_AD'}

for metric in network_metrics:
    # os.path.join uses the platform's separator; makedirs creates missing parent
    # folders and does nothing when the folder already exists
    metric_dir = os.path.join(base_dir, 'Tables', 'R_Dunn', metric)
    os.makedirs(metric_dir, exist_ok=True)

    for prefix, network in ad_network_prefixes.items():
        metric_values = p4c.get_table_columns(table='node',
                                              columns=metric,
                                              namespace='default',
                                              network=network+'_'+str(graph_cutoff),
                                              base_url='http://127.0.0.1:1234/v1')[metric].to_frame()
        # Replace the node-table index with 0..n-1 row numbers in the exported file
        metric_values = metric_values.reset_index(drop=True)

        metric_values.to_csv(os.path.join(metric_dir, prefix + '_AD_' + metric + '.csv'))

Comparing control networks with each other

In [None]:
# Compare each network metric across the four control networks. For every
# metric: pull the per-node values from Cytoscape, run a Kruskal-Wallis test,
# and draw the four distributions in one panel of a 5x2 grid.
fig, axs = plt.subplots(5,2, figsize=(25,20))

# Histogram colors, in the same order as the legend (Asian, Black, Latine, White)
hist_colors = ['#66C2A5', '#FC8D62', '#8DA0CB', '#E78AC3']

# Added this after seeing distributions: for these metrics the significance
# label is drawn at the top right of the panel instead of the top left
label_right_metrics = {'ClosenessCentrality',
                       'Stress',
                       'Degree',
                       'BetweennessCentrality',
                       'NumberOfUndirectedEdges',
                       'TopologicalCoefficient'}


def _control_metric_values(group, metric):
    """
    Parameters
    __________
    group : str
        Race/ethnicity prefix of the control network name (e.g. 'Asian')
    metric : str
        Node-table column to read

    Returns
    _______
    values : list
        Values of `metric` for every node of the '<group>_con_<graph_cutoff>' network
    """
    return p4c.get_table_columns(table='node',
                                 columns=metric,
                                 namespace='default',
                                 network=group+'_con_'+str(graph_cutoff),
                                 base_url='http://127.0.0.1:1234/v1')[metric].to_list()


for panel, metric in enumerate(network_metrics):
    A_con = _control_metric_values('Asian', metric)
    B_con = _control_metric_values('Black', metric)
    L_con = _control_metric_values('Latine', metric)
    W_con = _control_metric_values('White', metric)

    # Kruskal Wallis
    stat, pval = kruskalwallis(A_con, B_con, L_con, W_con)
    significant = pval < 0.05

    # Add metric distribution to subplot; panels fill left to right, then top to bottom
    i, j = divmod(panel, 2)
    ax = axs[i, j]
    ax.hist([A_con, B_con, L_con, W_con], color=hist_colors)
    ax.set_title(graph_title(metric), fontsize=18, fontweight='bold')
    ax.set_ylabel('\n \n')
    ax.tick_params(axis='both', which='both', labelsize=18)
    if significant:
        ax.text(0.9 if metric in label_right_metrics else 0.1,
                0.9,
                'p-value < 0.05',
                horizontalalignment='center',
                verticalalignment='center',
                transform=ax.transAxes, fontsize=16)

    if significant:
        sig = 'significantly different'
    else:
        sig = 'not significantly different'

    print("{} for control patients' comparison is {}; statistic is {} and p-value is {}.".format(metric,
                                                                                                 sig,
                                                                                                 stat,
                                                                                                 pval))
    print('\n')

fig.legend(['Asian', 'Black', 'Latine', 'White'],
           loc='upper right',
           bbox_to_anchor=(1, 1.1),
           fontsize=18)

fig.tight_layout()

# Save figure
# makedirs also creates the parent 'Figures' folder and does not fail when the
# folder already exists
os.makedirs('Figures/Network_Analysis/', exist_ok=True)
plt.savefig('Figures/Network_Analysis/SuppFig_6.pdf', bbox_inches='tight')

plt.show()

Compare race/ethnicity-stratified AD and control networks

In [None]:
# Collects the Mann-Whitney U results, keyed by '<AD network>_vs_<control network>'
mann_whitney_results = {}

In [None]:
# One [AD network, control network] name pair per race/ethnicity group,
# all at the current graph_cutoff
AD_con_comp = [[group + '_AD_' + str(graph_cutoff), group + '_con_' + str(graph_cutoff)]
               for group in ('Asian', 'Black', 'Latine', 'White')]

In [None]:
# For every [AD network, control network] pair, compare each network metric
# between the two networks with a two-sided Mann-Whitney U test and record the
# p-value together with the mean of the metric in each network.
def _node_metric_values(network, metric):
    """
    Parameters
    __________
    network : str
        Name of the Cytoscape network to read from
    metric : str
        Node-table column to read

    Returns
    _______
    values : list
        Values of `metric` for every node of `network`
    """
    return p4c.get_table_columns(table='node',
                                 columns=metric,
                                 namespace='default',
                                 network=network,
                                 base_url='http://127.0.0.1:1234/v1')[metric].to_list()


for comp in AD_con_comp:
    # Register the per-comparison dict once; it is filled in place below
    metric_results = dict()
    mann_whitney_results[comp[0]+'_vs_'+comp[1]] = metric_results

    for metric in network_metrics:
        AD = _node_metric_values(comp[0], metric)
        con = _node_metric_values(comp[1], metric)

        # Mann Whitney U
        stat, pval = mannwhitneyu(AD, con, alternative='two-sided')
        if pval < 0.05:
            sig = 'significantly different'
        else:
            sig = 'not significantly different'

        # Means for metrics
        AD_mean = np.asarray(AD).mean()
        con_mean = np.asarray(con).mean()

        metric_results[metric+'_pval'] = pval
        metric_results[metric+'_AD_mean'] = AD_mean
        metric_results[metric+'_con_mean'] = con_mean

        print("{} for {} and {} comparison is {}; statistic is {} and p-value is {}.".format(metric,
                                                                                             comp[0],
                                                                                             comp[1],
                                                                                             sig,
                                                                                             stat,
                                                                                             pval))
        print('\n')
        print('Mean {} for {} patients is {}; mean {} for {} patients is {}'.format(metric,
                                                                                    comp[0],
                                                                                    round(AD_mean, 3),
                                                                                    metric,
                                                                                    comp[1],
                                                                                    round(con_mean, 3)))
        print('\n \n \n')

In [None]:
# Save results
# One row per AD-vs-control comparison, one column per recorded value.
# Create the output folder first so to_csv does not fail when it is missing.
os.makedirs('Network_Analysis/ADCon_phe/', exist_ok=True)
pd.DataFrame(mann_whitney_results).transpose().to_csv('Network_Analysis/ADCon_phe/AD_con_'+str(graph_cutoff)+'_mannwhitneyu.csv', 
                                                      index_label='comparison')

In [None]:
# Show only the p-value columns, one row per comparison
comparison_table = pd.DataFrame(mann_whitney_results).T
comparison_table.filter(like='_pval', axis='columns')

### Visualize data: >25% cutoff (shared nodes)

In [None]:
# For every network, create and apply a node filter on the race/ethnicity
# column that matches the network name (criterion: value GREATER_THAN 25),
# then switch the network to the 'Sample1' visual style.
# NOTE(review): the filter name ends in '_10viz' although the criterion is 25;
# the name is kept unchanged here.
re_columns = {'Asian': 'pA', 'Black': 'pB', 'Latine': 'pL', 'White': 'pW'}

for network in UCSF_networks:
    p4c.set_current_network(network=network)

    # First label found in the network name decides the column
    column = next((col for label, col in re_columns.items() if label in network), None)
    if column is None:
        # Without this skip, `column` would be undefined or left over from the
        # previous network, and the filter would use the wrong column
        print('p value column not found for network.')
        continue

    p4c.create_column_filter(filter_name=network+'_10viz',
                             column=column,
                             criterion=25,
                             predicate='GREATER_THAN',
                             hide=True,
                             type='nodes',
                             network=network)
    p4c.apply_filter(filter_name=network+'_10viz',
                     hide=True,
                     network=network)
    p4c.set_visual_style(style_name='Sample1',
                         network=network)

In [None]:
# Same filter-and-style step as for all networks, restricted to networks whose
# name contains 'con' (the control networks).
# NOTE(review): the filter name ends in '_10viz' although the criterion is 25;
# the name is kept unchanged here.
re_columns = {'Asian': 'pA', 'Black': 'pB', 'Latine': 'pL', 'White': 'pW'}

for network in UCSF_networks:
    if 'con' not in network:
        continue

    print('Working on {}'.format(network))
    p4c.set_current_network(network=network)

    # First label found in the network name decides the column
    column = next((col for label, col in re_columns.items() if label in network), None)
    if column is None:
        # Without this skip, `column` would be undefined or left over from the
        # previous network, and the filter would use the wrong column
        print('p value column not found for network.')
        continue

    p4c.create_column_filter(filter_name=network+'_10viz',
                             column=column,
                             criterion=25,
                             predicate='GREATER_THAN',
                             hide=True,
                             type='nodes',
                             network=network)
    p4c.apply_filter(filter_name=network+'_10viz',
                     hide=True,
                     network=network)
    p4c.set_visual_style(style_name='Sample1',
                         network=network)
    print('\n')

Set node size based on the number of patients with the condition for a given race and ethnicity (R&E) category, and set node color to correspond to the phecode category (`icd10_chapter` in the node table).

In [None]:
%%html
<a href='https://py4cytoscape.readthedocs.io/en/0.0.9/concepts.html#value-generators'>Link for implementation</a>

In [None]:
# dictionary of networks: network name -> race/ethnicity column ('pA', 'pB', 'pL' or 'pW')
# Labels are checked in this order; the first one found in the name wins
re_label_columns = (('Asian', 'pA'), ('Black', 'pB'), ('Latine', 'pL'), ('White', 'pW'))

network_re_pct = {}
for network, SUID in UCSF_networks.items():
    matching_columns = [col for label, col in re_label_columns if label in network]
    if not matching_columns:
        print('Race/ethnicity category not in network; check conditionals')
        continue
    network_re_pct[network] = matching_columns[0]

In [None]:
# Display the network -> race/ethnicity column mapping built above
network_re_pct

Set node color mapping

In [None]:
# Get network with highest number of unique icd-10-inspired chapters
# Ideally, all the chapters that could be included are included in this network - will double check
# For every network in Cytoscape, count the distinct `icd10_chapter` values in
# its node table; keep the first network with the highest count and report
# whether networks tying that count contain the same set of chapters.
icd10_network = 'None'
icd10_network_chp = set()  # chapters of the best network so far
icd10_network_length = 0
for network in p4c.get_network_list():
    # Fetch the column once and derive both the chapters and their count from it
    chp = p4c.get_table_columns(table='node',  
                                namespace='default', 
                                columns='icd10_chapter',
                                network=network, 
                                base_url='http://127.0.0.1:1234/v1')['icd10_chapter'].unique()
    temp = len(chp)
    print('{} network has {} unique phecode chapters'.format(network, temp))

    if temp > icd10_network_length:
        icd10_network_length = temp
        icd10_network = network
        print(chp)
        icd10_network_chp = set(chp)
    elif temp == icd10_network_length and icd10_network != 'None':
        # Both sets have the same size here, so equality is the same check as
        # "the intersection is as large as the best set"
        if set(chp) == icd10_network_chp:
            print('{} network has the same chapters as the network previously identified with the most chapters'.format(network))
        else:
            print('{} network does not have the same chapters as the network previously identified with the most chapters'.format(network))

    print('\n')

In [None]:
# Display the name of the network selected above (stays 'None' if no network had any chapter)
icd10_network

In [None]:
# Generate the node color mapping for the 'Sample1' style from the
# `icd10_chapter` column of `icd10_network`, using the ColorBrewer Set3 palette.
# NOTE(review): mapping_type='d' is presumably py4cytoscape's discrete mapping - confirm in the docs.
node_color_map = p4c.gen_node_color_map(table_column='icd10_chapter', 
                                        palette=p4c.palette_color_brewer_q_Set3(),
                                        mapping_type='d', 
                                        network=icd10_network,
                                        style_name='Sample1')

In [None]:
# For every network in Cytoscape: map node size, node font size and edge width
# to the network's race/ethnicity column, apply the shared node color mapping,
# lay the network out in a circle and set a zoom bypass.
for network in p4c.get_network_list():
    print('Working on {}'.format(network))
    p4c.set_current_network(network=network)

    # Arguments shared by the size and width map generators
    generator_args = dict(table_column=network_re_pct[network],
                          mapping_type='c',
                          network=network,
                          style_name='Sample1')

    node_size_map = p4c.gen_node_size_map(**generator_args)
    print(node_size_map['sizes'])
    print('\n')

    # Replace the generated sizes with a fixed range; the same map then drives
    # both node size and node font size
    node_size_map['sizes'] = [100, 200 ,300]
    p4c.set_node_size_mapping(**node_size_map)
    p4c.set_node_font_size_mapping(**node_size_map)

    # Edge widths keep the generated values
    edge_width_map = p4c.gen_edge_width_map(**generator_args)
    p4c.set_edge_line_width_mapping(**edge_width_map)

    p4c.set_node_color_mapping(**node_color_map)
    p4c.layout_network(layout_name='circular', network=network)
    p4c.style_bypasses.set_network_zoom_bypass(new_value=0.3, bypass=True, network=network)
    print('\n')

In [None]:
# Set a zoom bypass of 0.25 on every network currently in Cytoscape
for network in p4c.get_network_list():
    p4c.style_bypasses.set_network_zoom_bypass(new_value=0.25, bypass=True, network=network)

In [None]:
# Set the default edge opacity to 120.
# NOTE(review): no style_name is passed here while the mappings in this notebook
# target 'Sample1' - confirm which style py4cytoscape updates by default.
p4c.set_edge_opacity_default(new_opacity=120)

In [None]:
# Save the current Cytoscape session under the name 'Ntwrk_Viz'
# NOTE(review): the save location is presumably resolved by Cytoscape/py4cytoscape - confirm where the file lands
p4c.save_session('Ntwrk_Viz')

In [None]:
# Export one PDF per network, named after the network
for network in p4c.get_network_list():
    print('Saving {} image'.format(network))
    # Make the network current before exporting it
    p4c.set_current_network(network=network)
    pdf_name = network+'.pdf'
    p4c.export_image(filename=pdf_name, type='pdf')