## Phenotype Differential Analysis

Each phecode corresponds to one phenotype only.

Comparing phenotypes across four racialized populations: patients who identify as Asian, (Non-Hispanic) Black or African American, Latine (recorded as "Latinx" in the UCSF database), and (Non-Hispanic) White or Caucasian.

In [None]:
%matplotlib inline
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import upsetplot # for comparing all four racialized populations
import numpy as np
import os
from scipy.stats import norm
from scipy.stats import chi2_contingency
import scipy.stats as stats
from math import log10, log2
from tqdm import tqdm
import warnings
# 20211102 Adding in statsmodels oddsratio_confint
import statsmodels.stats.contingency_tables 
import statsmodels.api as sm
# adding itertools
from itertools import combinations
warnings.filterwarnings("ignore", category=FutureWarning) 
# display dataframes
from IPython.display import display

In [None]:
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 50)
np.set_printoptions(threshold=50)

In [None]:
diagkeys = ['phenotype']

In [None]:
%run -i setup_functions.py

In [None]:
os.getcwd()

In [None]:
# Change default font to Arial
plt.rcParams.update({'font.family':'sans-serif'})
plt.rcParams.update({'font.sans-serif':'Arial'})

# Import data

In [None]:
total_ad = 7409 # Total patients with AD
total_con = 7409*2 # Total control patients

In [None]:
# Specifically Diagnoses Names
ad_diag_all = pd.read_csv('Diagnoses/phecode_diagnoses/ad_diagnoses.csv')

con_diag_all = pd.read_csv('Diagnoses/phecode_diagnoses/con_diagnoses.csv')

# demographics
ad_demo = pd.read_csv('Demographics/ad_demographics.csv')
con_demo = pd.read_csv('Demographics/con_demographics.csv')

In [None]:
# add column that indicates order icd10_chapter
# NOTE: icd10_chapter ROUGHLY corresponds to icd-10 chapters, and some chapters are not included
ad_diag_all['chp_order'] = ad_diag_all['icd10_chapter'].apply(ICDname_order)
con_diag_all['chp_order'] = con_diag_all['icd10_chapter'].apply(ICDname_order)

In [None]:
# Only keep diagnoses mapped to phecodes that are organized into ICD-10 inspired chapters
ad_diag = ad_diag_all[~ad_diag_all['icd10_chapter'].isnull()]
con_diag = con_diag_all[~con_diag_all['icd10_chapter'].isnull()]

In [None]:
ad_diag_null = ad_diag_all[ad_diag_all['icd10_chapter'].isnull()]

#### Check that phecodes correspond to one phenotype each

In [None]:
ad_diag['phecode'].drop_duplicates().shape

In [None]:
ad_diag['phenotype'].drop_duplicates().shape

In [None]:
con_diag['phecode'].drop_duplicates().shape

In [None]:
con_diag['phenotype'].drop_duplicates().shape

#### Comparing shapes before and after removal of NaN icd10_chapter rows:

In [None]:
ad_diag_all.shape

In [None]:
ad_diag.shape

In [None]:
# percent of rows retained:
round(((ad_diag.shape[0] / ad_diag_all.shape[0]) * 100), 2)

In [None]:
con_diag_all.shape

In [None]:
con_diag.shape

In [None]:
# percent of rows retained: (keeping in mind 7,208 rows are NaN; with this in mind, it's 93.79)
round(((con_diag.shape[0] / con_diag_all.shape[0]) * 100), 2)

#### Only keep the following columns: 'person_id', 'PatientDurableKey', 'phecode', 'phenotype', 'icd10_chapter'

In [None]:
ad_diag = ad_diag[['person_id', 
                   'PatientDurableKey',
                   'phecode',
                   'phenotype',
                   'icd10_chapter']].copy().drop_duplicates()
con_diag = con_diag[['person_id', 
                     'PatientDurableKey',
                     'phecode',
                     'phenotype',
                     'icd10_chapter']].copy().drop_duplicates()

**Only keep phenotypes found for UCSF patients with AD**

In [None]:
common_phenotypes_UCSF_AD_df = pd.read_csv('Tables/common_phenotypes_UCSF_AD.csv')
common_phenotypes_UCSF_AD = common_phenotypes_UCSF_AD_df['common_phenotypes_UCSF_AD']
ad_diag = ad_diag[ad_diag['phenotype'].isin(common_phenotypes_UCSF_AD)]
con_diag = con_diag[con_diag['phenotype'].isin(common_phenotypes_UCSF_AD)]

AD patients' information

In [None]:
ad_diag['phenotype'].value_counts()

In [None]:
ad_diag.info()

control patients' information

In [None]:
con_diag['phenotype'].value_counts()

In [None]:
con_diag.info()

In [None]:
# ad_diag retains all patients; con_diag does not

# Number of unique AD patients:
ad_diag['person_id'].drop_duplicates().shape[0]

In [None]:
# Number of unique control patients: over half were removed this way (50.3%)
con_diag['person_id'].drop_duplicates().shape[0]

In [None]:
# Merge con_demo info to retain the remaining patients:
con_diag = con_demo['person_id'].to_frame().merge(con_diag,
                                                  how='left',
                                                  on='person_id')

In [None]:
# Check that all patients now included
con_diag['person_id'].drop_duplicates().shape[0]

In [None]:
ad_demo.info()

In [None]:
con_demo.info()

<script type="text/javascript"
        src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/MathJax.js?config=TeX-AMS_CHTML"></script>
        
\\[ OddsRatio=\frac{dz_{Alzheimer} / notdz_{Alzheimer}}{dz_{Control} / notdz_{Control}} \\]



In [None]:
codemap3 = (['circulatory system', 'congenital anomalies', 'dermatologic',
             'digestive', 'endocrine/metabolic', 'genitourinary',
             'hematopoietic', 'infectious diseases', 'injuries & poisonings',
             'mental disorders', 'musculoskeletal', 'neoplasms', 'neurological',
             'pregnancy complications', 'respiratory', 'sense organs',
             'symptoms'])

rand_colors = ('#a7414a', '#282726', '#6a8a82', '#a37c27', '#563838', '#0584f2', '#f28a30', '#f05837',
               '#6465a5', '#00743f', '#be9063', '#de8cf0', '#888c46', '#c0334d', '#270101', '#8d2f23',
               '#ee6c81', '#65734b', '#14325c', '#704307', '#b5b3be', '#f67280', '#ffd082', '#ffd800',
               '#ad62aa', '#21bf73', '#a0855b', '#5edfff', '#08ffc8', '#ca3e47', '#c9753d', '#6c5ce7')

In [None]:
%run -i plotting_functions.py

## Diagnosis Differences in Racialized Populations

## Add the UCSFDerivedRaceEthnicity_Clean column to the ad_diag and con_diag dataframes

In [None]:
# Add race/ethnicity column to ad_diag df
ad_diag = ad_diag.merge(ad_demo[['person_id', 'UCSFDerivedRaceEthnicity_Clean']], 
                        how='left', 
                        left_on='person_id', 
                        right_on='person_id')

In [None]:
# Add race/ethnicity column to con_diag df
con_diag = con_diag.merge(con_demo[['person_id', 'UCSFDerivedRaceEthnicity_Clean']], 
                          how='left', 
                          left_on='person_id', 
                          right_on='person_id')

In [None]:
ad_diag['UCSFDerivedRaceEthnicity_Clean'].unique()

### Only keep MatchIt person_ids for ad_diag and con_diag

In [None]:
# Get person_ids for MatchIt Alzheimer's and Control patients
ad_MatchIt = pd.read_csv('Demographics/RE_MI_ad_demo.csv')
con_MatchIt = pd.read_csv('Demographics/RE_MI_con_demo.csv')

In [None]:
# 422 patients each
ad_MatchIt.shape

In [None]:
# 844 patients each
con_MatchIt.shape

In [None]:
ad_diag = ad_diag[ad_diag['person_id'].isin(ad_MatchIt['person_id'])]
con_diag = con_diag[con_diag['person_id'].isin(con_MatchIt['person_id'])]

### Stratify diagnosis dataframes by race/ethnicity

In [None]:
# Racialized populations present in the AD cohort; the control cohort is
# stratified with this same list in the loop below.
UCSFDerivedRaceEthnicity_Clean = ad_diag['UCSFDerivedRaceEthnicity_Clean'].unique()
print('Racialized populations in analysis: ', UCSFDerivedRaceEthnicity_Clean)
print('\n')
equalize_num = False # Equalize number between all patients
randomstate = 40 # Seed handed to getDiagnosisCountsStratify as random_state

# Stratify diagnosis dataframes by self-reported race and ethnicity
# Keys are '<race/ethnicity>_AD' for AD patients and '<race/ethnicity>_con' for controls
ad_diag_re_dict = dict()
con_diag_re_dict = dict()

# Count number of patients in each stratified self-reported race and ethnicity
# (same keys as the dictionaries above)
num_ad_re_dict = dict()
num_con_re_dict = dict()

# NOTE(review): the loop variable `re` would shadow the standard-library `re`
# module if that module were ever imported in this notebook.
# NOTE(review): `re+'_AD'` assumes every value is a string; a missing (NaN)
# race/ethnicity value would raise a TypeError here -- TODO confirm none remain.
for re in UCSFDerivedRaceEthnicity_Clean:
    # Diagnosis dataframes
    ad_diag_re_dict[re+'_AD'] = ad_diag[ad_diag['UCSFDerivedRaceEthnicity_Clean'] == re]
    con_diag_re_dict[re+'_con'] = con_diag[con_diag['UCSFDerivedRaceEthnicity_Clean'] == re]
    
    # Number of patients for each self-reported race and ethnicity
    # Counted as unique person_id values within the stratum, so any patient with
    # at least one row in the stratum dataframe is counted once.
    num_ad_re_dict[re+'_AD'] = ad_diag_re_dict[re+'_AD'][['person_id',
                                                           'UCSFDerivedRaceEthnicity_Clean']].drop_duplicates().shape[0]
    num_con_re_dict[re+'_con'] = con_diag_re_dict[re+'_con'][['person_id',
                                                              'UCSFDerivedRaceEthnicity_Clean']].drop_duplicates().shape[0]

print('Number of AD patients stratified by self-reported race and ethnicity: \n',
      num_ad_re_dict)
print('\n')
print('Number of control patients stratified by self-reported race and ethnicity: \n', 
      num_con_re_dict)
print('\n')

# Get the lower number as our number to sample
# (size of the smallest AD stratum; not referenced again in the visible part of
# this notebook -- getDiagnosisCountsStratify recomputes its own minimum)
kNumPatientsSampled = min((num_ad_re_dict.values())) 

### Count diagnosis

In [None]:
# Count diagnosis
def getDiagnosisCountsStratify(stratvarname, 
                               stratvars, 
                               diagdf=None, 
                               diagdict=None, 
                               numptsvar=None, 
                               equalize_num=False, 
                               random_state=40, 
                               AD=None):
    
    """
    Count diagnoses within each stratum (e.g., each racialized population)
    of one cohort, optionally subsampling every stratum down to the size of
    the smallest one.

    Parameters
    __________
    stratvarname : string
        Name of the stratifying column (e.g., identified race and ethnicity)
    stratvars : list of strings
        Stratum names WITHOUT cohort suffix (e.g. ['Asian', 'Black', 'Latine', 'White']).
        The list is not modified.
    diagdf : pandas DataFrame (default None)
        Contains diagnoses with a 'person_id' column and the stratvarname column.
        Must be passed if diagdict is not passed
    diagdict : dictionary (default None)
        Already-stratified diagnosis dataframes keyed by stratum name plus
        cohort suffix ('_AD' or '_con'). Must be passed if diagdf is not passed
    numptsvar : dictionary
        Number of patients per stratum, keyed like diagdict.
        Required with diagdict; computed from diagdf otherwise
    equalize_num : bool (default False)
        Equalizes the number across stratified categories if set to True
    random_state : int (default 40)
        Seeds the random subsample of patients if equalize_num is set to True
    AD : bool
        Whether stratified dataframes are of patients with AD
        If True, assumes these are patients with AD (keys get suffix '_AD')
        If False, assumes these are control patients (keys get suffix '_con')

    Returns
    _______
    stratdiagcount : dictionary
        Dictionary keyed by suffixed stratum name, holding whatever
        countPtsDiagnosis_Dict returns for that stratum.

    Raises
    ______
    ValueError
        If AD is neither True nor False, if diagdict is given without
        numptsvar, or if neither diagdict nor diagdf is given.
    """

    # `in` compares with ==, so 1/0 and numpy booleans are accepted as before
    if AD not in (True, False):
        raise ValueError('Cannot tell whether patients are AD or control, please correct.')
    cohort_suffix = '_AD' if AD else '_con'

    # Build the suffixed keys in a new list so the caller's list stays untouched
    stratkeys = [var + cohort_suffix for var in stratvars]

    if diagdict:
        if numptsvar is None:
            raise ValueError('variable numptsvar is empty, pass in dictionary with number of patients in each category.')
        diagdictvar = diagdict
    elif diagdf is not None:
        # Stratify the full dataframe ourselves: filter on the ORIGINAL stratum
        # name (that is what the column holds) and store under the suffixed key.
        diagdictvar = dict()
        numptsvar = dict()
        for var, key in zip(stratvars, stratkeys):
            diagdictvar[key] = diagdf[diagdf[stratvarname] == var]
            numptsvar[key] = diagdictvar[key][['person_id', stratvarname]] \
                             .drop_duplicates().shape[0]
    else:
        raise ValueError('did not pass in full dataframe or dictionary of stratified dataframes.')

    # Size of the smallest stratum; the subsampling target when equalize_num is True
    kMinPatientsSampled = min(numptsvar.values())

    stratdiagcount = dict()
    for key in stratkeys:
        stratum = diagdictvar[key]
        if numptsvar[key] > kMinPatientsSampled and equalize_num:
            # Subsample patients (not rows) so every stratum has the same number of people
            subsampledPatientKeys = stratum['person_id'].drop_duplicates() \
                                    .sample(kMinPatientsSampled, random_state=random_state)
            stratum = stratum[stratum['person_id'].isin(subsampledPatientKeys)]
            stratdiagcount[key] = countPtsDiagnosis_Dict(stratum, kMinPatientsSampled)
        else:
            stratdiagcount[key] = countPtsDiagnosis_Dict(stratum, numptsvar[key])
    return stratdiagcount

# Get counts for each diagnosis by race/ethnicity for alzheimers cohort
ad_diagcountre = getDiagnosisCountsStratify('UCSFDerivedRaceEthnicity_Clean', 
                                            stratvars=['Asian', 
                                                       'Black or African American', 
                                                       'Latinx', 
                                                       'White or Caucasian'],
                                            diagdict=ad_diag_re_dict, 
                                            numptsvar=num_ad_re_dict, 
                                            equalize_num=equalize_num, 
                                            random_state=randomstate,
                                            AD=True)        

# Get counts for each diagnosis by race/ethnicity for control cohorts
con_diagcountre = getDiagnosisCountsStratify('UCSFDerivedRaceEthnicity_Clean', 
                                             stratvars=['Asian', 
                                                       'Black or African American', 
                                                       'Latinx', 
                                                       'White or Caucasian'], 
                                             diagdict=con_diag_re_dict, 
                                             numptsvar=num_con_re_dict,
                                             equalize_num=equalize_num, 
                                             random_state=randomstate,
                                             AD=False)

### Make contingency tables

In [None]:
# create contingency table for various comparisons

# AD patients identified as Asian, Black, Latine, and White, each compared with
# the controls of the same racialized population
comparisons = ['ADCon_A', 'ADCon_B', 'ADCon_L', 'ADCon_W']
alldiagcountre = dict()
for comp in comparisons:
    alldiagcountre[comp] = dict()
jointype = 'outer' # keep phenotypes that appear in only one of the two cohorts
num_threshold = 0 # rows with any value below this are dropped (see filter below)

# NOTE: this assignment is immediately rebound by the `for n in diagkeys` loop below
n = 'phenotype' 


for n in diagkeys:
    # The three sequences are paired purely by position: this relies on
    # ad_diagcountre and con_diagcountre having been filled in the same
    # population order as `comparisons` (Asian, Black, Latinx, White).
    for ad, con, comp in zip(ad_diagcountre.keys(), con_diagcountre.keys(), comparisons):
        # Join AD and control counts on the phenotype column; overlapping column
        # names get '_AD_<letter>' / '_con_<letter>' suffixes, where <letter> is
        # the last character of the comparison name.
        diagtemp = ad_diagcountre[ad][n] \
                   .merge(con_diagcountre[con][n], 
                   how=jointype, 
                   on=n, 
                   suffixes=('_AD_'+comp[-1],'_con_'+comp[-1]))
        alldiagcountre[comp][n] = diagtemp.set_index(n)
        # Keep rows in which no value is below num_threshold. NaN compares as
        # False, so rows with missing counts from the outer join are kept.
        alldiagcountre[comp][n] = alldiagcountre[comp][n] \
                                  [(alldiagcountre[comp][n]<num_threshold).sum(axis=1) < 1 ]
        if jointype == 'outer':
            # Fill gaps left by the outer join, by column POSITION: columns 1 and 3
            # get 0, columns 2 and 4 get the stratum's total patient count.
            # NOTE(review): presumably the columns are [count, non-count] for AD
            # followed by [count, non-count] for controls -- confirm against
            # countPtsDiagnosis_Dict in setup_functions.py.
            nanreplace = dict(zip(list(alldiagcountre[comp][n].columns),
                              [0,num_ad_re_dict[ad],0,num_con_re_dict[con]]))
            alldiagcountre[comp][n] = alldiagcountre[comp][n].fillna(value=nanreplace)

### Do statistical testing (Fisher's Exact Test and Chi Squared)

In [None]:
def sigTestDiagCountRE2(alldiagcountre, comp, n, verbose=False): 
    """
    Test every phenotype for association with AD within one stratified population.

    Phenotypes whose table has any cell below 5 are tested with Fisher's exact
    test only (p-value stored in 'pvalue'). All other phenotypes get a
    chi-squared test (stored in 'pvalue') and, in addition, a Fisher's exact
    test (stored in 'fpvalue'). Odds ratios, their log2, -log10 p-values, the
    95% odds-ratio confidence interval and the icd10_chapter list of each
    phenotype are attached to the result.

    NOTE: besides its arguments, this function reads the notebook-level
    dataframes `ad_diag` and `con_diag` to map phenotypes to icd10_chapter.

    Parameters
    __________
    alldiagcountre : dictionary
        Nested dictionary; alldiagcountre[comp][n] is a pandas DataFrame
        indexed by phenotype whose columns hold the AD and control counts.
        Each row is reshaped to a 2x2 table, so four count columns are assumed
        (presumably AD count / non-count, control count / non-count -- confirm
        against the cell that builds the contingency tables).
    comp : string
        Specifies which stratified population is being compared
    n : string
        Corresponds to 'phenotype' in this study
    verbose : bool (default False)
        When True, prints the shapes of the intermediate tables

    Returns
    _______
    combined : pandas DataFrame
        Contains phenotype differential analysis results for a given stratified population
    """
   
    # First, for Fisher's exact test choose rows with less than 5 patients in a category
    print(comp, n,': Number Diagnosis: ', alldiagcountre[comp][n].shape[0])
    
    temp_less5 = alldiagcountre[comp][n][alldiagcountre[comp][n].min(axis=1)<5] # take all with counts less than 5
    fisher1 = pd.DataFrame()
    if temp_less5.shape[0]>0:
        print('\t Fisher Exact for <5 pts in a category, num diagnosis:', temp_less5.shape[0])
        fisher = temp_less5 \
                 .apply(lambda x: stats.fisher_exact(np.array(x).reshape(2,2)), axis=1) \
                 .apply(pd.Series)
        fisher.columns = ['OddsRatio', 'pvalue']
        if verbose: print('fisher:', fisher.shape)

        # Infinite / zero odds ratios cannot be log-transformed or plotted:
        # cap inf at (largest finite ratio + 1) and map 0 to half the smallest
        # positive ratio before taking log2.
        maxratio = fisher['OddsRatio'][fisher['OddsRatio']<np.inf].max()
        minratio = fisher['OddsRatio'][fisher['OddsRatio']>0].min()
        fisher = fisher.replace(np.inf, maxratio+1) 
        fisher['log2_oddsratio'] = fisher['OddsRatio']\
                                   .apply(lambda x: log2(minratio/2) if (x==0) else log2(x))

        # A p-value of exactly 0 would give an infinite -log10; use half the
        # smallest positive p-value instead.
        minpvalue = fisher['pvalue'][fisher['pvalue']>0].min();
        fisher['pvalue'] = fisher['pvalue'].replace(0,minpvalue/2)
        fisher['-log_pvalue'] = fisher['pvalue'].apply(lambda x: -log10(x))

        fisher1 = fisher.merge(temp_less5, how='right', left_index=True, right_index=True)
        if verbose: print('fisher1',fisher1.shape)

    # now take the rest of the phenotypes (every cell >= 5)
    temp_more5 = alldiagcountre[comp][n][alldiagcountre[comp][n].min(axis=1)>=5]
    print('\n \t Chi square for >=5 pts in a category, num diagnosis:', temp_more5.shape[0])

    # Fisher's exact test is still run here to obtain the odds ratio; its
    # p-value is kept separately as 'fpvalue'.
    fisher =  temp_more5 \
              .apply(lambda x: stats.fisher_exact(np.array(x).reshape(2,2)), axis = 1) \
              .apply(pd.Series)
    fisher.columns = ['OddsRatio', 'fpvalue']
    
    maxratio = fisher['OddsRatio'][fisher['OddsRatio']<np.inf].max();
    minratio = fisher['OddsRatio'][fisher['OddsRatio']>0].min();
    fisher = fisher.replace(np.inf, maxratio+1) 
    fisher['log2_oddsratio'] = fisher['OddsRatio']\
                               .apply(lambda x: log2(minratio) if (x==0) else log2(x))
    minpvalue = fisher['fpvalue'][fisher['fpvalue']>0].min();
    fisher['fpvalue'] = fisher['fpvalue'].replace(0,minpvalue/2)
    fisher['-log_fpvalue'] = fisher['fpvalue'].apply(lambda x: -log10(x))
    if verbose: print('fisher',fisher.shape)

    chisquare = temp_more5.apply(lambda x: \
                           chi2_contingency(np.array(x).reshape(2,2)), axis=1) \
                           .apply(pd.Series)
    chisquare.columns = ['chistat','pvalue','dof','expected']
    chisquare = chisquare.merge(temp_more5, how='right',left_index=True, right_index=True)
    minpvalue = chisquare['pvalue'][chisquare['pvalue']>0].min();
    chisquare['pvalue'] = chisquare['pvalue'].replace(0,minpvalue/2)
    chisquare['-log_pvalue'] = chisquare['pvalue'].apply(lambda x: -log10(x))
    if verbose: print('chisquare:', chisquare.shape)

    combined = chisquare.merge(fisher, left_index=True, right_index=True, how='left')
    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the
    # Fisher-only rows underneath in the same way.
    if not fisher1.empty:
        combined = pd.concat([combined, fisher1])
    if verbose: 
        print('combined 1:', combined.shape)
        
    # Odds ratio 95% confidence interval
    # The regex matches every column whose name contains 'Count'
    temp = combined.filter(regex=('Count_*'))
    print('temp columns (should only have counts): ', temp.columns)
    or_ci = temp \
            .apply(lambda x: statsmodels.stats.contingency_tables.Table2x2(np.array(x).reshape(2,2)).oddsratio_confint(), 
                   axis=1).apply(pd.Series)
    # OddsRatio_CI_lb - confidence interval lower bound; 
    # OddsRatio_CI_ub - confidence interval upper bound
    or_ci.columns = ['OddsRatio_CI_lb', 'OddsRatio_CI_ub']
    combined = combined.merge(or_ci, how='left', left_index=True, right_index=True)
    
    # get mapping between diagnosis and category (one list of chapters per phenotype)
    temp = pd.concat([ad_diag[[n, 'icd10_chapter']],
                      con_diag[[n, 'icd10_chapter']]]).drop_duplicates()
    # Drop real missing values as well as the literal string 'NaN'
    has_chapter = temp['icd10_chapter'].notna() & (temp['icd10_chapter'] != 'NaN')
    temp = temp[has_chapter].groupby(n)['icd10_chapter'].apply(list)
    combined = combined.merge(temp, how='left', left_index=True, right_index=True, suffixes=(False, False))
    if verbose: print('combined 2:', combined.shape, combined.columns)

    print('\t Final diagnosis num: ', combined.shape[0])
    
    print('\n \n')
    
    return combined

sigtestcountre = dict()      
for comp in comparisons:
    sigtestcountre[comp]=dict()
    for n in diagkeys:
        sigtestcountre[comp][n] = sigTestDiagCountRE2(alldiagcountre, comp, n)

In [None]:
suffixes = ['_A', '_B', '_L', '_W']

In [None]:
re_dfs = [sigtestcountre['ADCon_A'][n].sort_index(),
          sigtestcountre['ADCon_B'][n].sort_index(),
          sigtestcountre['ADCon_L'][n].sort_index(),
          sigtestcountre['ADCon_W'][n].sort_index()]

# Add self-reported race and ethnicity specific suffixes to dataframes
for i, suffix in enumerate(suffixes):
    mod_columns = dict()
    for column in re_dfs[i].columns:
        if column[-2:] != suffix:
            mod_columns[column] = column+suffix
    re_dfs[i] = re_dfs[i].rename(columns=mod_columns).copy()

### Combine race/ethnicity stratified results and encode significance

In [None]:
# columns without suffixes:
cols_general = list()

for column in re_dfs[0].columns:
    cols_general.append(column[:-2])

In [None]:
for i, suffix in enumerate(suffixes):
    # keep the following columns for each dataframe:
    cols_keep = ['pvalue', 
                 '-log_pvalue',
                 'Count_AD', 
                 'Count_con',  
                 'OddsRatio', 
                 'OddsRatio_CI_lb', 
                 'OddsRatio_CI_ub',
                 'log2_oddsratio']
    for j, column in enumerate(cols_keep):
        cols_keep[j] = column+suffix
    # _W dataframe will also add the icd10_chapter column to be kept and renamed from
    # icd10_chapter_W to icd10_chapter
    if suffix == '_W':
        re_dfs[i] = re_dfs[i][cols_keep + ['icd10_chapter_W']]
        re_dfs[i] = re_dfs[i].rename(columns={'icd10_chapter_W' : 'icd10_chapter'})
    else:
        re_dfs[i] = re_dfs[i][cols_keep]

In [None]:
chitestRE = dict()
chitestRE[n] = pd.concat(re_dfs, axis=1)

In [None]:
sig_cols = ['Sig_A', 'Sig_B', 'Sig_L', 'Sig_W']
for k in range(5):
    sig_combo_temp = list(combinations(sig_cols, k))
    print(sig_combo_temp)
    print('\n')

In [None]:
def stratified_significance(df, suffixes):
    """
    Flag, per racialized population, which phenotypes pass the
    Bonferroni-corrected significance threshold and which of those are
    enriched in patients with AD.

    The threshold is 0.05 divided by the number of rows (phenotypes) in df.
    Missing p-values or odds ratios compare as False, i.e. "not significant" /
    "not AD-associated". The columns are added to df in place and df is
    also returned.

    Parameters
    __________
    df : pandas DataFrame
        Contains phenotype differential analysis results, with columns
        'pvalue<suffix>' and 'OddsRatio<suffix>' for every suffix
    suffixes : list
        Suffixes for each racialized population (_A, _B, _L, _W)

    Returns
    _______
    df : pandas DataFrame
        Contains 3 new columns per suffix (12 for the four populations):
        'Sig_A', 'Sig_B', 'Sig_L', 'Sig_W' : bool
            Whether phenotype is significantly associated with either
            1. patients with AD OR
            2. control patients
        'OddsRatio_A_AD', 'OddsRatio_B_AD', 'OddsRatio_L_AD', 'OddsRatio_W_AD' : bool
            Whether Odds Ratio is > 1
            If True, then phenotype is associated with patients with AD
        'Sig_A_AD', 'Sig_B_AD', 'Sig_L_AD', 'Sig_W_AD' : bool
            If True, then phenotype is significantly associated with patients with AD
    """

    # Bonferroni correction: one test per phenotype (row) of the table that was
    # passed in, rather than of a notebook-level global.
    num_tests = df.shape[0]
    bc = .05 / num_tests if num_tests else .05  # an empty table has nothing to correct
    print('Bonferroni-corrected significance threshold:', bc)

    # Encode stratified significance
    for suffix in suffixes:
        df['Sig'+suffix] = (df['pvalue'+suffix] < bc).to_numpy()
        df['OddsRatio'+suffix+'_AD'] = (df['OddsRatio'+suffix] > 1).to_numpy()
        df['Sig'+suffix+'_AD'] = np.logical_and(df['Sig'+suffix], df['OddsRatio'+suffix+'_AD'])

    return df

In [None]:
chitestRE[n] = stratified_significance(chitestRE[n], suffixes)

In [None]:
def overall_significance(df):
    """
    List the AD-significance flags that are set for a single phenotype.

    Parameters
    __________
    df : mapping-like row (e.g. one row of a pandas DataFrame)
        Must provide the entries 'Sig_A_AD', 'Sig_B_AD', 'Sig_L_AD', 'Sig_W_AD'

    Returns
    _______
    significance : tuple
        Names of the flags that equal True, in the order A, B, L, W;
        empty when none is set.
    """
    flag_names = ['Sig' + tag + '_AD' for tag in ('_A', '_B', '_L', '_W')]
    # Compare with True explicitly: plain truthiness would treat NaN as set
    return tuple(flag for flag in flag_names if df[flag] == True)

In [None]:
chitestRE[n]['Significance'] = chitestRE[n].apply(overall_significance, axis=1)

In [None]:
chitestRE[n].columns

In [None]:
# Save
# Build the paths with os.path.join so the separator matches the host OS;
# hard-coded backslashes only resolve to the Tables folder on Windows.
base = os.getcwd()
chitestRE[n].to_csv(os.path.join(base, 'Tables', 'PheDiff_RE.csv'))
chitestRE[n].to_csv(os.path.join(base, 'Tables', 'SuppData_1.csv'))

### Upset Plots

In [None]:
chitest_upset = chitestRE[n].copy()

In [None]:
def abv(series):
    """
    Abbreviate the significance labels of one phenotype for the upset plot.

    Parameters
    __________
    series : iterable of strings
        One entry of the Significance column (this function is applied
        element-wise), i.e. labels such as 'Sig_A_AD'

    Returns
    _______
    sig_upset : list
        One letter per non-empty label (A, B, L or W for the labels used in
        this notebook), naming the racialized populations with AD for which
        the phenotype is significantly associated relative to matched controls.
        Empty when no label is present.
    """
    # The population letter is the fourth character from the end: 'Sig_A_AD' -> 'A'
    return [label[-4] for label in series if len(label) > 0]

In [None]:
chitest_upset['Sig_Upset'] = chitest_upset['Significance'].apply(abv)

In [None]:
chitest_upset['Sig_Upset'].value_counts().sort_index()

In [None]:
sig_overlaps = chitest_upset['Sig_Upset'].value_counts().sort_index()
sig_values = list(sig_overlaps)
# Add 0 to sig_values to indicate that no phenotypes found in common
# for Sig_A and Sig_W only
sig_values.append(0)
sig_values

In [None]:
# Add ('Sig_A', 'Sig_W') to sig_overlaps (0 overlap found)
sig_overlaps_list = list(sig_overlaps.index)

In [None]:
sig_overlaps_list.append(['A', 'W'],)
sig_overlaps_list

In [None]:
# Draw the upset plot of AD-enriched phenotypes shared between racialized
# populations and optionally save it as Figures/Fig_3.pdf.
save = True # set to False to only display the figure
# [1:] skips the first entry of both lists.
# NOTE(review): presumably that entry is the empty membership (phenotypes not
# significant in any population), which sorts first -- TODO confirm.
upset = upsetplot.from_memberships(sig_overlaps_list[1:],
                                   data=sig_values[1:])

upsetplot.plot(upset, show_counts=True)

# https://stackoverflow.com/questions/45148704/how-to-hide-axes-and-gridlines-in-matplotlib-python
plt.grid(False)

plt.ylabel('Significant Comorbidity \n Intersections', fontweight='bold')
plt.title('Number of Enriched Comorbidities \n Across Racialized Populations With AD \n', 
          fontweight='bold')

# Save before plt.show() so the saved figure is the one just drawn
if save:
    plt.savefig('Figures/Fig_3.pdf', bbox_inches='tight')
plt.show()