## Fig 5

In [None]:
import pandas as pd
import numpy as np
import os
import statsmodels.stats
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import spearmanr

In [None]:
# Let pandas show up to 150 rows when a DataFrame/Series is displayed
pd.set_option('display.max_rows', 150)

In [None]:
# Flag checked by the plotting cell: when True, figures are written to disk
save = True

### Import data

In [None]:
# UCSF results for all comparisons; the trailing significance column is dropped
all_UCSF = pd.read_csv('Tables/PheDiff_RE.csv').iloc[:, :-1]

# UC-wide (UCDDP) results; any column name that starts with an underscore
# (the -log_pvalue_RE columns) has that leading underscore stripped
all_UCDDP = pd.read_csv('Tables/PheDiff_RE_UCDDP.csv')
all_UCDDP = all_UCDDP.rename(
    columns={name: name[1:] for name in all_UCDDP.columns if name[0] == '_'}
)

In [None]:
# Tally how often each value occurs in the UCSF Sig_A column
all_UCSF['Sig_A'].value_counts()

In [None]:
# (rows, columns) of the UCSF table
all_UCSF.shape

In [None]:
# (rows, columns) of the UC-wide (UCDDP) table
all_UCDDP.shape

In [None]:
# Element-wise comparison of the two tables' column names, position by position
all_UCSF.columns == all_UCDDP.columns

In [None]:
# Inner join of the UCSF and UC-wide tables on phenotype; columns present in
# both tables are disambiguated with the _UCSF / _UCDDP suffixes
both = pd.merge(
    all_UCSF,
    all_UCDDP,
    how='inner',
    left_on='phenotype',
    right_on='phenotype',
    suffixes=('_UCSF', '_UCDDP'),
)

In [None]:
# (rows, columns) of the merged UCSF + UC-wide table
both.shape

In [None]:
# Check of the phenotype overlap between sites: shape of the UCDDP rows whose
# phenotype also appears in the UCSF table.
# UCDDP phenotypes will be used to measure correlation.
# Printed explicitly because Jupyter only displays the last expression of a
# cell, so a bare expression here would be silently discarded.
print(all_UCDDP[all_UCDDP['phenotype'].isin(all_UCSF['phenotype'])].shape)

# all_UCSF will now only have phenotypes that are present in UCDDP
all_UCSF = all_UCSF[all_UCSF['phenotype'].isin(all_UCDDP['phenotype'])]

### Phenotypes significant for patients at UCSF only, and at both UCSF and UC-wide

In [None]:
# Column-name suffix for each group and the matching display label.
# The two lists are index-aligned: later cells zip them together.
suffixes = ['_A_AD', '_B_AD', '_L_AD', '_W_AD']
race_ethnicities = ['Asian', 'Black', 'Latine', 'White']

In [None]:
# Make a significant in both column for each race and ethnicity
def _significance_label(sig_ucsf, sig_ucddp):
    """Return the site label for one pair of significance flags."""
    at_ucsf = bool(sig_ucsf == True)
    at_ucddp = bool(sig_ucddp == True)
    if at_ucsf and at_ucddp:
        return 'UCSF and UC-wide'
    if at_ucsf:
        return 'UCSF only'
    if at_ucddp:
        return 'UCDDP only'
    return 'Neither'


def sig_overlap(dataframe, suffixes=('_A_AD', '_B_AD', '_L_AD', '_W_AD')):
    """
    Label each phenotype as significant at UCSF, UC-wide (UCDDP), both, or
    neither, once per group suffix.

    Works on a single row (a pandas Series, which is what
    ``DataFrame.apply(sig_overlap, axis=1)`` passes in) and also directly on a
    whole DataFrame. The object passed in is modified and returned.

    Parameters
    __________
    dataframe : pandas Series or pandas DataFrame
        Must contain 'Sig<suffix>_UCSF' and 'Sig<suffix>_UCDDP' for every
        suffix. A value counts as significant when it compares equal to True.
    suffixes : iterable of str, optional
        Group suffixes to process. Defaults to the four groups used in this
        notebook.

    Returns
    _______
    dataframe : pandas Series or pandas DataFrame
        Same object, with a new 'Significance<suffix>' entry/column per suffix
        holding one of 'UCSF and UC-wide', 'UCSF only', 'UCDDP only' or
        'Neither'.
    """

    for suffix in suffixes:
        sig_ucsf = dataframe['Sig'+suffix+'_UCSF']
        sig_ucddp = dataframe['Sig'+suffix+'_UCDDP']

        if isinstance(dataframe, pd.DataFrame):
            # Whole table: label every row in one pass instead of relying on
            # the truth value of a Series (which raises ValueError)
            dataframe['Significance'+suffix] = [
                _significance_label(a, b) for a, b in zip(sig_ucsf, sig_ucddp)
            ]
        else:
            # Single row (apply with axis=1): the flags are scalars
            dataframe['Significance'+suffix] = _significance_label(sig_ucsf, sig_ucddp)

    return dataframe

In [None]:
# axis=1: sig_overlap is called once per row (one phenotype per row); the
# returned rows, now carrying the Significance_* values, form the new table
both = both.apply(sig_overlap, axis=1)

In [None]:
# Get info on number of significant phenotypes only at UCSF and both at UCSF and UC-wide
# in addition to percentage of significant phenotypes at UCSF that are validated in UCDDP
summary_template = ('For patients who identify as {}, {} phenotypes were found to be significant at UCSF'
                    ' and {} of these phenotypes were also found in UCDDP, validating {} percent'
                    ' of significant phenotypes found at UCSF. \n ')

for suffix, race_ethnicity in zip(suffixes, race_ethnicities):
    labels = both['Significance'+suffix]
    UCSF_num = int((labels == 'UCSF only').sum())
    overlap_num = int((labels == 'UCSF and UC-wide').sum())
    total_num = UCSF_num + overlap_num

    # A group with no UCSF-significant phenotypes has no defined percentage;
    # report NaN instead of aborting the whole loop with ZeroDivisionError
    if total_num:
        pct_overlap = round((overlap_num / total_num) * 100, 2)
    else:
        pct_overlap = float('nan')

    print(summary_template.format(race_ethnicity, total_num, overlap_num, pct_overlap))

**Correlation results**

In [None]:
# List the merged table's column names (the cells below build names from them)
both.columns

In [None]:
# Shorter group suffixes used in the log2_oddsratio column names;
# index-aligned with `suffixes` and `race_ethnicities`
suffixes_2 = ['_A', '_B', '_L', '_W']

In [None]:
# For phenotypes significant for UCSF that are validated in UCDDP:
# (header text corrected: it previously read 'There are for phentoypes ...')
print('These are for phenotypes significant at UCSF and UC-wide. \n')
for suffix, suffix_2, race_ethnicity in zip(suffixes, suffixes_2, race_ethnicities):
    # Phenotypes significant at both sites for this group
    temp = both[both['Significance'+suffix] == 'UCSF and UC-wide']
    log2_UCSF = temp['log2_oddsratio'+suffix_2+'_UCSF'].to_list()
    log2_UCDDP = temp['log2_oddsratio'+suffix_2+'_UCDDP'].to_list()

    # Rank correlation between the UCSF and UC-wide log2 odds ratios
    spearman = spearmanr(log2_UCSF, log2_UCDDP)

    print('Spearman rank correlation coefficient result for patients who identify as {}: {} \n'.format(race_ethnicity,
                                                                                                       spearman))

In [None]:
# For phenotypes significant for UCSF only:
# (header text corrected: it previously read 'There are for phentoypes ...')
print('These are for phenotypes significant at UCSF only. \n')
for suffix, suffix_2, race_ethnicity in zip(suffixes, suffixes_2, race_ethnicities):
    # Phenotypes significant at UCSF but not UC-wide for this group
    temp = both[both['Significance'+suffix] == 'UCSF only']
    log2_UCSF = temp['log2_oddsratio'+suffix_2+'_UCSF'].to_list()
    log2_UCDDP = temp['log2_oddsratio'+suffix_2+'_UCDDP'].to_list()

    # Rank correlation between the UCSF and UC-wide log2 odds ratios
    spearman = spearmanr(log2_UCSF, log2_UCDDP)

    print('Spearman rank correlation coefficient result for patients who identify as {}: {} \n'.format(race_ethnicity,
                                                                                                       spearman))

20220421 Determine the number of phenotypes significant at UCSF and the percentage of them that mapped UC-wide

In [None]:
# One report per group, printed in the order of `suffixes`
for suffix in suffixes:
    # Rows of the merged table that are significant at UCSF (validated or not)
    temp_UCSF_UCDDP = both[(both['Significance'+suffix] == 'UCSF and UC-wide') | (both['Significance'+suffix] == 'UCSF only')]
    # Rows of the UCSF table flagged significant for this group
    temp_UCSF = all_UCSF[all_UCSF['Sig'+suffix] == True]
    print('# of phenotypes significant for patients with AD at UCSF: {}'.format(temp_UCSF.shape[0]))
    # Report the mapped count (merged table), not the UCSF count a second time
    print('# of significant phenotypes mapped UC-wide: {}'.format(temp_UCSF_UCDDP.shape[0]))

    # No significant UCSF phenotypes -> percentage undefined; avoid ZeroDivisionError
    if temp_UCSF.shape[0]:
        temp_pct = (temp_UCSF_UCDDP.shape[0] / temp_UCSF.shape[0]) * 100
        temp_pct = round(temp_pct, 2)
    else:
        temp_pct = float('nan')
    print('% of significant phenotypes mapped UC-wide: {}%'.format(temp_pct))
    print('\n')

20220421 Determine number of phenotypes significant at UCSF and number of significant phenotypes at UCSF validated UC-wide

In [None]:
for suffix, race_ethnicity in zip(suffixes, race_ethnicities):
    # Significant at UCSF, whether or not also significant UC-wide
    temp_UCSF_UCDDP = both[(both['Significance'+suffix] == 'UCSF and UC-wide') | (both['Significance'+suffix] == 'UCSF only')]
    # Subset that is also significant UC-wide (validated)
    temp_val = both[(both['Significance'+suffix] == 'UCSF and UC-wide')]
    print('# of phenotypes significant for patients with AD who identify as {} at UCSF: {}'.format(race_ethnicity,
                                                                                                   temp_UCSF_UCDDP.shape[0]))
    print('# of these significant phenotypes that validated UC-wide: {}'.format(temp_val.shape[0]))

    # No significant UCSF phenotypes -> percentage undefined; avoid ZeroDivisionError
    if temp_UCSF_UCDDP.shape[0]:
        temp_pct = (temp_val.shape[0] / temp_UCSF_UCDDP.shape[0]) * 100
        temp_pct = round(temp_pct, 2)
    else:
        temp_pct = float('nan')
    print('% of phenotypes validated UC-wide: {}%'.format(temp_pct))
    print('\n')

**log-log plots**

In [None]:
# Re-set the save flag (also set near the top of the notebook) so the plotting
# cell below writes its figures to disk
save = True

20221216 [Changing axes ticks' labels' font sizes](https://stackoverflow.com/questions/6390393/matplotlib-make-tick-labels-font-size-smaller)

In [None]:
#https://stackoverflow.com/questions/21321670/how-to-change-fonts-in-matplotlib-python
plt.rcParams.update({'font.family':'sans-serif',
                     'font.sans-serif':'Arial'})

# Folder the per-group PDFs are written to when `save` is True
out_dir = 'Figures/comp'

for suffix, suffix_2, race_ethnicity in zip(suffixes, suffixes_2, race_ethnicities):
    # Keep phenotypes significant at UCSF (validated UC-wide or not) and expose
    # this group's label under a common column name for the legend/hue
    temp = both[(both['Significance'+suffix] == 'UCSF and UC-wide') | (both['Significance'+suffix] == 'UCSF only')]
    temp = temp.rename({'Significance'+suffix : 'Significance'}, axis=1)

    plt.figure()
    ax = sns.scatterplot(x='log2_oddsratio'+suffix_2+'_UCSF',
                         y='log2_oddsratio'+suffix_2+'_UCDDP',
                         data=temp,
                         hue='Significance',
                         linewidth=0.1,
                         alpha=0.55,
                         s=25)
    # Raw string for the mathtext part ('\m' is an invalid escape in a normal
    # string); the '\n' line break stays in a normal string
    ax.set_xlabel(r'$\mathregular{log_2}$' + '(UCSF OR: AD/Control) \n' + race_ethnicity + '-identified patients', fontsize=18)
    ax.set_ylabel(r'$\mathregular{log_2}$' + '(UC-wide OR: AD/Control) \n' + race_ethnicity + '-identified patients', fontsize=18)

    # Tick label size for both axes; replaces per-tick `tick.label`, which
    # newer Matplotlib releases no longer provide
    ax.tick_params(axis='both', which='major', labelsize=16)

    plt.legend(fontsize='medium')

    if save:
        # Creates missing parent folders too and is a no-op when the folder exists
        os.makedirs(out_dir, exist_ok=True)
        ax.figure.savefig(out_dir+'/'+race_ethnicity+'_loglog.pdf',
                          bbox_inches='tight',
                          dpi=300)