## Supplementary Data File 6 and 7

### Only start here if edge tables haven't been made

In [None]:
import py4cytoscape as p4c
import os
import pandas as pd

# display dataframes
from IPython.display import display

In [None]:
# Move the working directory up one level so the relative 'Network_Analysis/...'
# paths used in the cells below resolve.
# NOTE(review): presumably the notebook lives one folder below the project root — confirm.
# Not idempotent: every re-run of this cell moves up one more directory.
os.chdir("..")

In [None]:
# Check that a running Cytoscape instance can be reached before any tables are requested
p4c.cytoscape_ping()

In [None]:
# Make sure UCSF networks are showing up:
# print the name of every network loaded in the current Cytoscape session
for network_name in p4c.get_network_list():
    print(network_name)

Get edge tables for UCSF networks
- Used Ntwrk_Viz_UCSF_v2b file

In [None]:
# Retrieve edge tables:
# one default-namespace edge table per network in the session, keyed by network name
network_metric_values = {
    network_name: p4c.get_table_columns(
        table='edge',
        namespace='default',
        network=network_name,
        base_url='http://127.0.0.1:1234/v1',
    )
    for network_name in p4c.get_network_list()
}

In [None]:
# Show which networks an edge table was retrieved for
network_metric_values.keys()

In [None]:
# Save edge tables
# One CSV per network, written as '<network name>_edge_table.csv'.
edge_table_dir = 'Network_Analysis/ADCon_phe/edge_tables'

# exist_ok=True makes this a no-op when the folder is already there and also creates
# missing parent folders, so a single write loop serves both cases.
os.makedirs(edge_table_dir, exist_ok=True)

for network in network_metric_values:
    pd.DataFrame(network_metric_values[network]).to_csv(
        edge_table_dir + '/' + network + '_edge_table.csv')

Get edge tables for UCDDP networks
- Used UCDDP/Ntwrk_viz_v3
- Close Cytoscape and reopen it with this session file before running the cells below (opening the new .cys file in the running Cytoscape instance would probably work too)

In [None]:
# Make sure UCDDP networks are showing up:
# print the name of every network loaded in the current Cytoscape session
for network_name in p4c.get_network_list():
    print(network_name)

In [None]:
# Retrieve edge tables:
# one default-namespace edge table per network in the session, keyed by network name
# (this rebinds network_metric_values, replacing whatever an earlier cell stored in it)
network_metric_values = {
    network_name: p4c.get_table_columns(
        table='edge',
        namespace='default',
        network=network_name,
        base_url='http://127.0.0.1:1234/v1',
    )
    for network_name in p4c.get_network_list()
}

In [None]:
# Save edge tables
# One CSV per network, written as '<network name>_5_edge_table.csv'.
edge_table_dir = 'Network_Analysis/UCDDP/ADCon_phe/edge_tables'

# exist_ok=True makes this a no-op when the folder is already there and also creates
# missing parent folders, so a single write loop serves both cases.
os.makedirs(edge_table_dir, exist_ok=True)

for network in network_metric_values:
    pd.DataFrame(network_metric_values[network]).to_csv(
        edge_table_dir + '/' + network + '_5_edge_table.csv')

## Start here if already made edge tables

Find top interactions in common for patients with AD between UCSF and UC-wide validation cohort for each identified race and ethnicity

In [None]:
import py4cytoscape as p4c
import os
import pandas as pd

# display dataframes
from IPython.display import display

**Only change directory if above hasn't been run**

In [None]:
# os.chdir("..")

In [None]:
# Identified race and ethnicity groups in the study, each mapped to the edge-table
# column that the cells below read as that group's percentage of patients
pct_re_dict = {
    'Asian': 'pA',
    'Black': 'pB',
    'Latine': 'pL',
    'White': 'pW',
}
# Same groups as a list, in the order they are processed (dicts keep insertion order)
race_ethnicities = list(pct_re_dict)

In [None]:
# For every identified race and ethnicity, collect the highest-ranked AD network edges
# (phenotype pairs) that the UCSF and the UC-wide (UCDDP) edge tables have in common.
# The ranking window starts at the top 10 edges of each cohort and is widened by one edge
# at a time until at least 10 shared pairs are found.
top_ADinteract_dict = dict()

for race_eth in race_ethnicities:
    UCSF_edges = pd.read_csv('Network_Analysis/ADCon_phe/edge_tables/'+race_eth+'_AD_5_edge_table.csv')
    print('UCSF')
    display(UCSF_edges.sort_values(by=pct_re_dict[race_eth], ascending=False).head(20))
    UCDDP_edges = pd.read_csv('Network_Analysis/UCDDP/ADCon_phe/edge_tables/'+race_eth+'_AD_UCDDP_5_edge_table.csv')
    print('UCDDP')
    display(UCDDP_edges.sort_values(by=pct_re_dict[race_eth], ascending=False).head(20))

    # Rank each cohort once; only the size of the head() window changes inside the loop
    ranked_tables = (UCSF_edges.sort_values(race_eth, ascending=False),
                     UCDDP_edges.sort_values(race_eth, ascending=False))

    # Once the window covers the longer table, widening it cannot add new pairs.
    # Stopping there prevents an endless loop when fewer than 10 pairs are shared.
    max_i = max(len(UCSF_edges), len(UCDDP_edges), 10)

    top_ADinteract_dict[race_eth] = set()
    i = 10
    while len(top_ADinteract_dict[race_eth]) < 10 and i <= max_i:
        # Edge names look like 'A (interacts with) B'; a frozenset of the two phenotypes
        # makes the pair independent of which one is listed first
        UCSF_pairs, UCDDP_pairs = (
            {frozenset(edge_name.split(" (interacts with) ")) for edge_name in ranked.head(i)['name']}
            for ranked in ranked_tables
        )
        top_ADinteract_dict[race_eth] = UCSF_pairs & UCDDP_pairs
        i += 1

    if len(top_ADinteract_dict[race_eth]) < 10:
        print('Warning: only {} shared interactions exist for {}-identified patients'.format(
            len(top_ADinteract_dict[race_eth]), race_eth))

    # i is one larger than the last window size that was compared
    print(i)
    print('Top interactions for {}-identified patients: '.format(race_eth))
    for pair in top_ADinteract_dict[race_eth]:
        print(pair)
    print('\n')

In [None]:
# Overlapping interactions:
# pairs that appear in the top AD interactions of all four groups
overlap = set.intersection(
    *(top_ADinteract_dict[group] for group in ('Asian', 'Black', 'Latine', 'White'))
)

In [None]:
# Number of pairs shared by all four groups, followed by the pairs themselves
print(len(overlap))
overlap

In [None]:
# Scratch check: a frozenset pair can be turned back into an alphabetically ordered list
sorted(frozenset({"Alzheimer's disease", 'Essential hypertension'}))

In [None]:
# Find top interactions found only in one identified race and ethnicity (if applicable)
for re1 in race_ethnicities:
    # A pair is specific to re1 when it is in re1's top set and in no other group's top set.
    # Subtracting every other group's set at once is equivalent to intersecting the
    # pairwise differences, and works for any number of groups.
    other_sets = [top_ADinteract_dict[re2] for re2 in race_ethnicities if re2 != re1]
    specific_pairs = top_ADinteract_dict[re1].difference(*other_sets)

    print('interactions specific to patients who identify as {}:'.format(re1))
    for pair in specific_pairs:
        print(pair)

    print('\n')

Find top interactions in common for control patients between UCSF and UC-wide validation cohort for each identified race and ethnicity

In [None]:
# For every identified race and ethnicity, collect the highest-ranked control network edges
# (phenotype pairs) that the UCSF and the UC-wide (UCDDP) edge tables have in common.
top_coninteract_dict = dict()

for race_eth in race_ethnicities:
    UCSF_edges = pd.read_csv('Network_Analysis/ADCon_phe/edge_tables/'+race_eth+'_con_5_edge_table.csv')
    UCDDP_edges = pd.read_csv('Network_Analysis/UCDDP/ADCon_phe/edge_tables/'+race_eth+'_con_UCDDP_5_edge_table.csv')

    # Rank each cohort once; only the size of the head() window changes inside the loop
    ranked_tables = (UCSF_edges.sort_values(race_eth, ascending=False),
                     UCDDP_edges.sort_values(race_eth, ascending=False))

    if race_eth != 'White':
        # Start with the top 10 edges and widen the window until 10 shared pairs are found.
        # Once the window covers the longer table, widening it cannot add new pairs.
        # Stopping there prevents an endless loop when fewer than 10 pairs are shared.
        i = 10
        max_i = max(len(UCSF_edges), len(UCDDP_edges), 10)
    else:
        # Need separate overlapping interactions because only 8 phenotypes found for White control patients that
        # are shared by at least 5% of patients
        # -> compare the top 8 edges once, without widening the window
        i = 8
        max_i = 8

    top_coninteract_dict[race_eth] = set()
    while len(top_coninteract_dict[race_eth]) < 10 and i <= max_i:
        # Edge names look like 'A (interacts with) B'; a frozenset of the two phenotypes
        # makes the pair independent of which one is listed first
        UCSF_pairs, UCDDP_pairs = (
            {frozenset(edge_name.split(" (interacts with) ")) for edge_name in ranked.head(i)['name']}
            for ranked in ranked_tables
        )
        top_coninteract_dict[race_eth] = UCSF_pairs & UCDDP_pairs
        i += 1

    if race_eth != 'White' and len(top_coninteract_dict[race_eth]) < 10:
        print('Warning: only {} shared interactions exist for {}-identified patients'.format(
            len(top_coninteract_dict[race_eth]), race_eth))

    # i is one larger than the last window size that was compared
    print(i)
    print('Top interactions for {}-identified patients: '.format(race_eth))
    for pair in top_coninteract_dict[race_eth]:
        print(pair)
    print('\n')

In [None]:
# Overlapping interactions:
# pairs that appear in the top control interactions of all four groups
overlap = set.intersection(
    *(top_coninteract_dict[group] for group in ('Asian', 'Black', 'Latine', 'White'))
)

In [None]:
# Number of pairs shared by all four groups, followed by the pairs themselves
print(len(overlap))
overlap

In [None]:
# Find top interactions found only in one identified race and ethnicity (if applicable)
for re1 in race_ethnicities:
    # A pair is specific to re1 when it is in re1's top set and in no other group's top set.
    # Subtracting every other group's set at once is equivalent to intersecting the
    # pairwise differences, and works for any number of groups.
    other_sets = [top_coninteract_dict[re2] for re2 in race_ethnicities if re2 != re1]
    temp_overlap = top_coninteract_dict[re1].difference(*other_sets)

    print(len(temp_overlap))
    print('interactions specific to patients who identify as {}: {}'.format(re1, temp_overlap))
    print('\n')

### Save top pairs for each identified race and ethnicity into excel sheets, one for AD and another for control

In [None]:
def make_tables(top_pairs, tables, pct=pct_re_dict, res=race_ethnicities):
    
    """
    Parameters
    __________
    top_pairs : dict
        Contains top phenotype pairs; keys are a given identified race and ethnicity (string); values are top
        phenotye pairs for that identified race and ethnicity (set of comorbidity pairs, which are in a frozenset)
    tables : dict
        Empty dictionary that will contain top phenotype pairs for each racialized population that are shared
        between UCSF and UC-wide
    pct : dict
        Keys are identified race and ethnicitiesn (string); values are corresponding percentage columns for the 
        percentage of patients who have that comorbidity pair for each race and ethnicity (string)
    res : list
        List of identified race and ethnicities (string) in the study
        
    Returns
    _______
    Nothing; populates tables variable with pandas DataFrames containing top shared phenotype pairs
    between UCSF and UC-wide for each racialized population
    
    """
    
    for re in res:
        print(re)
        
        # Get relevant files
        UCSF_temp = pd.read_csv('Network_Analysis/ADCon_phe/edge_tables/'+re+'_AD_5_edge_table.csv')
        UCDDP_temp = pd.read_csv('Network_Analysis/UCDDP/ADCon_phe/edge_tables/'+re+'_AD_UCDDP_5_edge_table.csv')
        
        # Get pairs in alphabetical order:
        ordered = list()
        for one_pair in top_pairs[re]:
            alphabetical = sorted(list(one_pair))
            
            # Get UCSF percentage
            # https://stackoverflow.com/questions/23145928/python-and-pandas-how-to-access-a-column-using-iterrows
            for index, row in UCSF_temp.iterrows():
                if (row['source'] == alphabetical[0]) and (row['target'] == alphabetical[1]):
                    UCSF_pct = row[pct_re_dict[re]]
                elif (row['source'] == alphabetical[1]) and (row['target'] == alphabetical[0]):
                    UCSF_pct = row[pct_re_dict[re]]
                else: 
                    pass
            alphabetical.append(UCSF_pct)
            
            # Get UCDDP percentage
            for index, row in UCDDP_temp.iterrows():
                if (row['source'] == alphabetical[0]) and (row['target'] == alphabetical[1]):
                    UCDDP_pct = row[pct_re_dict[re]]
                elif (row['source'] == alphabetical[1]) and (row['target'] == alphabetical[0]):
                    UCDDP_pct = row[pct_re_dict[re]]
                else: 
                    pass
            alphabetical.append(UCDDP_pct)
            
            ordered.append(alphabetical)

        temp = pd.DataFrame(ordered, 
                            columns=['phenotype_1', 
                                     'phenotype_2', 
                                     pct_re_dict[re]+'_UCSF', 
                                     pct_re_dict[re]+'_UCDDP'])
        temp = temp.sort_values(by=pct_re_dict[re]+'_UCSF', ascending=False).reset_index(drop=True)
        temp['phenotype_pair'] = temp['phenotype_1'] + ' - ' + temp['phenotype_2']
        temp = temp[['phenotype_pair', pct_re_dict[re]+'_UCSF', pct_re_dict[re]+'_UCDDP']]
      
        tables[re+'-identified patients'] = temp
        display(temp)      

In [None]:
# Make Alzheimer's dataframes of top phenotype pairs for each identified race and ethnicity, 
# along with corresponding percentages of patients who have the pheotype pair at UCSF and UC-wide

ADpairs_xlsx = dict()
make_tables(top_pairs=top_ADinteract_dict, tables=ADpairs_xlsx)

In [None]:
# Make control dataframes of top phenotype pairs for each identified race and ethnicity, 
# along with corresponding percentages of patients who have the pheotype pair at UCSF and UC-wide

conpairs_xlsx = dict()
make_tables(top_pairs=top_coninteract_dict, tables=conpairs_xlsx)

In [None]:
# Top shared AD edges
# Write to excel file: one worksheet per entry of ADpairs_xlsx, named after its key
# https://pandas.pydata.org/docs/reference/api/pandas.ExcelWriter.html
with pd.ExcelWriter("Tables/SuppData_6.xlsx") as xlsx_writer:
    for sheet_label, pair_table in ADpairs_xlsx.items():
        pair_table.to_excel(xlsx_writer, sheet_name=sheet_label, index=False)

In [None]:
# Top shared control edges
# Write to excel file: one worksheet per entry of conpairs_xlsx, named after its key
# https://pandas.pydata.org/docs/reference/api/pandas.ExcelWriter.html
with pd.ExcelWriter("Tables/SuppData_7.xlsx") as xlsx_writer:
    for sheet_label, pair_table in conpairs_xlsx.items():
        pair_table.to_excel(xlsx_writer, sheet_name=sheet_label, index=False)