## Manhattan Plots 

In [None]:
import pandas as pd
import numpy as np
import math
import os
# display Pandas tables
# https://stackoverflow.com/questions/26873127/show-dataframe-as-table-in-ipython-notebook
from IPython.display import display

In [None]:
# NOTE(review): diagkeys is not referenced in the cells visible here —
# presumably it is consumed by the scripts loaded with %run -i; confirm.
diagkeys = ['phenotype']
# Label appended to the figure name and to the saved file names in the
# plotting cells below.
n = 'phenotype'

In [None]:
# Move up one directory so that the relative paths used below
# (setup_functions.py, Tables/, Figures/) resolve.
# Guarded on the presence of setup_functions.py so that re-running this cell
# does not climb one more directory each time it is executed.
if not os.path.isfile('setup_functions.py'):
    os.chdir("..")

In [None]:
# Execute setup_functions.py inside this notebook's namespace (-i), so the
# names it defines become available to the cells below.
%run -i setup_functions.py

In [None]:
# Display settings for inspecting tables and arrays in the notebook.
# Show up to 200 rows and 50 columns of a DataFrame before truncating.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 50)
# Summarize numpy arrays with more than 50 elements when printing.
np.set_printoptions(threshold=50)

In [None]:
# Category labels passed to the plotting function as `icd10_mapping`.
codemap3 = [
    'circulatory system',
    'congenital anomalies',
    'dermatologic',
    'digestive',
    'endocrine/metabolic',
    'genitourinary',
    'hematopoietic',
    'infectious diseases',
    'injuries & poisonings',
    'mental disorders',
    'musculoskeletal',
    'neoplasms',
    'neurological',
    'pregnancy complications',
    'respiratory',
    'sense organs',
    'symptoms',
]

In [None]:
# Execute plotting_functions.py inside this notebook's namespace (-i).
# NOTE(review): `marker` and `plt`, used by the plotting cells below, are not
# defined anywhere in this notebook — presumably they come from this script;
# confirm.
%run -i plotting_functions.py

In [None]:
# Load the phenotype result table(s) used for the selection and plots below.
# NOTE(review): both frames are read from the SAME file, so UCSF and UCDDP hold
# identical data and the UCSF/UCDDP intersection computed further down cannot
# filter anything out — confirm whether UCDDP should point at a different table.
UCSF = pd.read_csv('Tables/PheDiff_RE.csv')
UCDDP = pd.read_csv('Tables/PheDiff_RE.csv')

In [None]:
# Inspect the available column names; later cells rely on 'phenotype',
# 'icd10_chapter', 'pvalue<suffix>' and '-log_pvalue<suffix>'.
UCSF.columns

Get the top 9 phenotypes (other than the Dementias phenotype) with the lowest p-values for each of the four racialized populations, and make Manhattan plots

In [None]:
# Column-name suffixes and the matching group labels. The two lists are
# index-aligned: they are zipped together in the cells below.
suffixes = ['_A', '_B', '_L', '_W']
race_ethnicities = ['Asian', 'Black', 'Latine', 'White']

In [None]:
# For each group, collect the phenotypes that appear among the lowest-p-value
# rows of BOTH tables. The window starts at the 9 lowest p-values and is
# widened one row at a time until at least 9 phenotypes are shared.
top_phe_dict = dict()

# Remove Dementias phenotype
UCSF_temp = UCSF[UCSF['phenotype'] != 'Dementias']
UCDDP_temp = UCDDP[UCDDP['phenotype'] != 'Dementias']

# Widening the window beyond the longer table cannot add any new phenotype,
# so this is the point at which the search must give up instead of spinning.
max_window = max(len(UCSF_temp), len(UCDDP_temp))

for suffix, race_ethnicity in zip(suffixes, race_ethnicities):
    # Rank once per group; only the size of the head() window changes below.
    UCSF_ranked = UCSF_temp.sort_values('pvalue'+suffix)['phenotype']
    UCDDP_ranked = UCDDP_temp.sort_values('pvalue'+suffix)['phenotype']

    top_phe_dict[race_ethnicity] = set()
    i = 9
    while len(top_phe_dict[race_ethnicity]) < 9:
        if i > max_window:
            raise ValueError(
                'Fewer than 9 phenotypes are shared between the two tables '
                'for pvalue' + suffix
            )
        UCSF_temp2 = set(UCSF_ranked.head(i))
        UCDDP_temp2 = set(UCDDP_ranked.head(i))
        top_phe_dict[race_ethnicity] = UCSF_temp2 & UCDDP_temp2
        i += 1

In [None]:
# Print the selected phenotypes for each group.
# The loop variable is deliberately not named `re`: that name would shadow the
# standard-library regex module in the namespace this notebook shares with the
# scripts loaded via %run -i.
for race_ethnicity, top_phenotypes in top_phe_dict.items():
    print(race_ethnicity)
    print(top_phenotypes)
    print('\n')

In [None]:
# Phenotypes present in the top lists of all four groups
overlap = top_phe_dict['Asian'].intersection(
    top_phe_dict['Black'],
    top_phe_dict['Latine'],
    top_phe_dict['White'],
)

In [None]:
# Find top phenotypes found only in one identified race and ethnicity (if applicable)
for re1 in race_ethnicities:
    # Top-phenotype sets of every other group, however many groups there are.
    other_sets = [top_phe_dict[re2] for re2 in race_ethnicities if re2 != re1]
    # Removing all other groups' phenotypes at once is equivalent to
    # intersecting the pairwise differences (A-B) & (A-C) & (A-D).
    temp_overlap = top_phe_dict[re1].difference(*other_sets)
    print('Phenotypes specific to patients who identify as {}: {}'.format(re1, temp_overlap))

In [None]:
# Add annotation columns: 'annotate<suffix>' is 1 for rows whose phenotype is
# among that group's top phenotypes and 0 otherwise.
# Iterates the explicit label list (rather than the dict's key order) so the
# label/suffix pairing does not depend on insertion order.
for race_ethnicity, suffix in zip(race_ethnicities, suffixes):
    is_top = UCSF['phenotype'].isin(top_phe_dict[race_ethnicity])
    UCSF['annotate'+suffix] = is_top.astype(int)

Find the cutoffs between the top and bottom plots to figure out where to break the y-axis (line breaks)

In [None]:
# https://stackoverflow.com/questions/26454649/python-round-up-to-the-nearest-ten
def roundup(x):
    """Return x rounded up to the nearest multiple of ten, as an int."""
    tens = math.ceil(x / 10.0)
    return 10 * int(tens)

def rounddown(x):
    """Return x rounded down to the nearest multiple of ten, as an int."""
    tens = math.floor(x / 10.0)
    return 10 * int(tens)

Cutoff for top plot

In [None]:
# For each group, print the phenotype with the largest -log10 p-value, that
# value, and the value rounded up to the next ten.
# Only the suffix is needed here, so the group label is not iterated.
for suffix in suffixes:
    logp_col = '-log_pvalue' + suffix
    temp = UCSF.sort_values(by=logp_col, ascending=False)
    # Make sure top phenotype (with highest -log 10 p-value) is Alzheimer's disease
    # top phenotype for Black-identified patients is Dementias due to how p-values were calculated for infinity
    print(temp['phenotype'].iloc[0])
    print(logp_col + ' top cutoff for top graph')
    top_logp = temp[logp_col].iloc[0]
    print(top_logp)
    cutoff_temp = roundup(top_logp)
    print(cutoff_temp)
    print('\n')

In [None]:
# For each group, print the phenotype with the largest -log10 p-value, that
# value, and the value rounded down to the previous ten.
# Only the suffix is needed here, so the group label is not iterated.
for suffix in suffixes:
    logp_col = '-log_pvalue' + suffix
    temp = UCSF.sort_values(by=logp_col, ascending=False)
    # Make sure top phenotype (with highest -log 10 p-value) is Alzheimer's disease
    # top phenotype for Black-identified patients is Dementias due to how p-values were calculated for infinity
    print(temp['phenotype'].iloc[0])
    print(logp_col + ' bottom cutoff for top graph')
    top_logp = temp[logp_col].iloc[0]
    print(top_logp)
    cutoff_temp = rounddown(top_logp)
    print(cutoff_temp)
    print('\n')

Cutoff for bottom plot

In [None]:
# For each group, restrict to that group's top phenotypes and print the one
# with the largest -log10 p-value, that value, and the value rounded up to
# the next ten.
for race_ethnicity, suffix in zip(race_ethnicities, suffixes):
    logp_col = '-log_pvalue' + suffix
    in_top = UCSF['phenotype'].isin(top_phe_dict[race_ethnicity])
    temp = UCSF[in_top].sort_values(by=logp_col, ascending=False)
    print(temp['phenotype'].iloc[0])
    print(logp_col + ' cutoff for graph')
    top_logp = temp[logp_col].iloc[0]
    print(top_logp)
    cutoff_temp = roundup(top_logp)
    print(cutoff_temp)
    print('\n')

In [None]:
# When True, the plotting cells below write each figure under
# Figures/Manhattan/ and call mhat_RE with show=False (show=not save).
save = True

In [None]:
# Create the output folder for the figures. makedirs also creates the parent
# 'Figures' folder when it is missing, and exist_ok makes the cell safe to
# re-run.
os.makedirs('Figures/Manhattan', exist_ok=True)

In [None]:
# Figure 4A: Asian-identified patients (columns suffixed '_A').
dims = (40,10)
figtype = 'pdf'
fig, ax = marker.mhat_RE(
    # Data and column selection
    df=UCSF,
    logp='-log_pvalue_A',
    chromo='icd10_chapter',
    suffix='_A',
    icd10_mapping=codemap3,
    markernames=None,
    markeridcol='index',
    overlap=overlap,  # To bold common phenotypes
    # Subplot grid
    dim=dims,
    rows=9,
    columns=2,
    nrowstop=2,  # number of rows for top subplot
    nrowsmid=6,  # number of rows for middle subplot
    # Y-axis ranges
    topmin=250,  # min y-axis value for top subplot
    topmax=270,  # max y-axis value for top subplot
    mainmin=0,  # min y-axis value for middle subplot
    mainmax=60,  # max y-axis value for middle subplot
    yskip=50,
    # Significance line
    gwas_sign_line=True,
    gwasp=5.3705692803437166e-05,  # bc value (0.05/931)
    # Fonts, markers and labels
    axlabelfontsize=31,
    axtickfontsize=25,
    annotatefontsize=22.25,
    gfont=6,
    dotsize=25,  # for annotated, dotsize=18
    plotlabelrotation=60,
    textcolor='#66C2A5',
    axxlabel='Phecode Category',
    figtitle='UCSF \n AD vs. Control Manhattan Plot \n Asian-identified patients',
    # Other display options, passed through unchanged
    ar=90,
    autoalign=False,
    expand_text=(0.7, 0.5),
    expand_points=(0.5, 0.5),
    # Output
    show=not save,
    figtype=figtype,
    figname="Alz-Con_miami_"+n,
)

if save:
    out_path = 'Figures/Manhattan/Fig_4A_' + n + '.pdf'
    plt.savefig(out_path, bbox_inches='tight')
plt.show()

In [None]:
# Figure 4B: Black-identified patients (columns suffixed '_B').
dims = (40,10)
figtype = 'pdf'
fig, ax = marker.mhat_RE(
    # Data and column selection
    df=UCSF,
    logp='-log_pvalue_B',
    chromo='icd10_chapter',
    suffix='_B',
    icd10_mapping=codemap3,
    markernames=None,
    markeridcol='index',
    overlap=overlap,  # To bold common phenotypes
    # Subplot grid
    dim=dims,
    rows=10,
    columns=2,
    nrowstop=1,  # number of rows for top subplot
    nrowsmid=8,  # number of rows for middle subplot
    # Y-axis ranges
    topmin=240,  # min y-axis value for top subplot
    topmax=250,  # max y-axis value for top subplot
    mainmin=0,  # min y-axis value for middle subplot
    mainmax=80,  # max y-axis value for middle subplot
    yskip=50,
    # Significance line
    gwas_sign_line=True,
    gwasp=5.3705692803437166e-05,  # bc value (0.05/931)
    # Fonts, markers and labels
    axlabelfontsize=31,
    axtickfontsize=25,
    annotatefontsize=22.25,
    gfont=6,
    dotsize=25,  # for annotated, dotsize=18
    plotlabelrotation=60,
    textcolor='#FC8D62',
    axxlabel='Phecode Category',
    figtitle='UCSF \n AD vs. Control Manhattan Plot \n Black-identified patients',
    # Other display options, passed through unchanged
    ar=90,
    autoalign=False,
    expand_text=(0.7, 0.5),
    expand_points=(0.5, 0.5),
    # Output
    show=not save,
    figtype=figtype,
    figname="Alz-Con_miami_"+n,
)

if save:
    out_path = 'Figures/Manhattan/Fig_4B_' + n + '.pdf'
    plt.savefig(out_path, bbox_inches='tight')
plt.show()

In [None]:
# Figure 4C: Latine-identified patients (columns suffixed '_L').
dims = (40,10)
figtype = 'pdf'
fig, ax = marker.mhat_RE(
    # Data and column selection
    df=UCSF,
    logp='-log_pvalue_L',
    chromo='icd10_chapter',
    suffix='_L',
    icd10_mapping=codemap3,
    markernames=None,
    markeridcol='index',
    overlap=overlap,  # To bold common phenotypes
    # Subplot grid
    dim=dims,
    rows=10,
    columns=2,
    nrowstop=2,  # number of rows for top subplot
    nrowsmid=7,  # number of rows for middle subplot
    # Y-axis ranges
    topmin=290,  # min y-axis value for top subplot
    topmax=310,  # max y-axis value for top subplot
    mainmin=0,  # min y-axis value for middle subplot
    mainmax=70,  # max y-axis value for middle subplot
    yskip=50,
    # Significance line
    gwas_sign_line=True,
    gwasp=5.3705692803437166e-05,  # bc value (0.05/931)
    # Fonts, markers and labels
    axlabelfontsize=31,
    axtickfontsize=25,
    annotatefontsize=22.25,
    gfont=6,
    dotsize=25,  # for annotated, dotsize=18
    plotlabelrotation=60,
    textcolor='#8DA0CB',
    axxlabel='Phecode Category',
    figtitle='UCSF \n AD vs. Control Manhattan Plot \n Latine-identified patients',
    # Other display options, passed through unchanged
    ar=90,
    autoalign=False,
    expand_text=(0.7, 0.5),
    expand_points=(0.5, 0.5),
    # Output
    show=not save,
    figtype=figtype,
    figname="Alz-Con_miami_"+n,
)

if save:
    out_path = 'Figures/Manhattan/Fig_4C_' + n + '.pdf'
    plt.savefig(out_path, bbox_inches='tight')
plt.show()

In [None]:
# Figure 4D: White-identified patients (columns suffixed '_W').
dims = (40,10)
figtype = 'pdf'
fig, ax = marker.mhat_RE(
    # Data and column selection
    df=UCSF,
    logp='-log_pvalue_W',
    chromo='icd10_chapter',
    suffix='_W',
    icd10_mapping=codemap3,
    markernames=None,
    markeridcol='index',
    overlap=overlap,  # To bold common phenotypes
    # Subplot grid
    dim=dims,
    rows=9,
    columns=2,
    nrowstop=2,  # number of rows for top subplot
    nrowsmid=6,  # number of rows for middle subplot
    # Y-axis ranges
    topmin=300,  # min y-axis value for top subplot
    topmax=320,  # max y-axis value for top subplot
    mainmin=0,  # min y-axis value for middle subplot
    mainmax=60,  # max y-axis value for middle subplot
    yskip=50,
    # Significance line
    gwas_sign_line=True,
    gwasp=5.3705692803437166e-05,  # bc value (0.05/931)
    # Fonts, markers and labels
    axlabelfontsize=31,
    axtickfontsize=25,
    annotatefontsize=22.25,
    gfont=6,
    dotsize=25,  # for annotated, dotsize=18
    plotlabelrotation=60,
    textcolor='#E78AC3',
    axxlabel='Phecode Category',
    figtitle='UCSF \n AD vs. Control Manhattan Plot \n White-identified patients',
    # Other display options, passed through unchanged
    ar=90,
    autoalign=False,
    expand_text=(0.7, 0.5),
    expand_points=(0.5, 0.5),
    # Output
    show=not save,
    figtype=figtype,
    figname="Alz-Con_miami_"+n,
)

if save:
    out_path = 'Figures/Manhattan/Fig_4D_' + n + '.pdf'
    plt.savefig(out_path, bbox_inches='tight')
plt.show()