## Table of phenotypes for AD patients that overlap for all racialized populations

In [1]:
import pandas as pd
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import os
from scipy.stats import norm
from scipy.stats import chi2_contingency
import scipy.stats as stats
from math import log10, log2
from tqdm import tqdm
import warnings
# 20211102 Adding in statsmodels oddsratio_confint
import statsmodels.stats.contingency_tables 
import statsmodels.api as sm
warnings.filterwarnings("ignore", category=FutureWarning) 

In [2]:
# Move the working directory up one level; the relative paths used below are resolved from there.
# NOTE: the move is relative to the current directory, so running this cell again goes up another level.
os.chdir(os.pardir)

In [3]:
# Display settings: numpy abbreviates arrays with more than 50 elements,
# pandas prints at most 200 rows before truncating.
np.set_printoptions(threshold=50)
pd.options.display.max_rows = 200

In [4]:
# Column(s) identifying a diagnosis; presumably read by the helpers loaded from
# setup_functions.py (run with -i, so it shares this namespace) - confirm.
diagkeys = ["phenotype"]

In [5]:
# Execute the helper script inside this notebook's namespace (-i): it can see the
# names defined above, and whatever it defines becomes available to later cells
# (ICDname_order, used below, is presumably defined there - confirm).
# The relative path is resolved against the current working directory.
%run -i setup_functions.py

# Import data

In [6]:
# Cohort sizes; the control count is defined as twice the AD count.
total_ad = 7409  # total Alzheimer's disease (AD) patients
total_con = 2 * total_ad  # total control patients

In [7]:
# Diagnosis tables (AD first, then control), read in the same order as before
ad_diag_all, con_diag_all = map(pd.read_csv, (
    'Diagnoses/phecode_diagnoses/ad_diagnoses.csv',
    'Diagnoses/phecode_diagnoses/con_diagnoses.csv',
))

# Demographics tables (AD first, then control)
ad_demo, con_demo = map(pd.read_csv, (
    'Demographics/ad_demographics.csv',
    'Demographics/con_demographics.csv',
))

In [8]:
# Attach a 'chp_order' column to both diagnosis tables by passing every
# icd10_chapter value through ICDname_order.
# NOTE: icd10_chapter only ROUGHLY corresponds to the ICD-10 chapters, and some chapters are not included
for _diag_table in (ad_diag_all, con_diag_all):
    _diag_table['chp_order'] = _diag_table['icd10_chapter'].apply(ICDname_order)

In [9]:
# Keep only the diagnosis rows that carry an icd10_chapter value, i.e. those mapped
# to phecodes organized into the ICD-10 inspired chapters
ad_diag = ad_diag_all.loc[ad_diag_all['icd10_chapter'].notna()]
con_diag = con_diag_all.loc[con_diag_all['icd10_chapter'].notna()]

In [10]:
# AD diagnosis rows without an icd10_chapter value (the complement of the rows kept in ad_diag)
ad_diag_null = ad_diag_all.loc[ad_diag_all['icd10_chapter'].isna()]

In [11]:
# The chapter filter leaves every AD patient in ad_diag, but not every control in con_diag.

# Distinct AD patients remaining (a missing id, if any, counts as one value):
ad_diag['person_id'].nunique(dropna=False)

7409

In [12]:
# Distinct control patients remaining: almost half were lost to the chapter filter
con_diag['person_id'].nunique(dropna=False)

7610

In [13]:
# Left-join the filtered diagnoses onto the full list of control person_ids so that
# controls without any remaining diagnosis row are still represented
# (their diagnosis columns are left empty by the join).
con_diag = con_demo[['person_id']].merge(con_diag, on='person_id', how='left')

In [14]:
# Sanity check: every control patient should now be present again
con_diag['person_id'].nunique(dropna=False)

14818

#### Only keep the following columns: 'person_id', 'PatientDurableKey', 'phecode', 'phenotype', 'icd10_chapter'

In [15]:
# Reduce both diagnosis tables to the same five columns, then collapse the rows
# that became identical once the other columns were removed.
diag_columns = ['person_id', 'PatientDurableKey', 'phecode', 'phenotype', 'icd10_chapter']
ad_diag = ad_diag.loc[:, diag_columns].drop_duplicates()
con_diag = con_diag.loc[:, diag_columns].drop_duplicates()

## Add the UCSFDerivedRaceEthnicity_Clean column to the ad_diag and con_diag dataframes

In [16]:
# Attach each AD patient's race/ethnicity label to their diagnosis rows
ad_race = ad_demo[['person_id', 'UCSFDerivedRaceEthnicity_Clean']]
ad_diag = ad_diag.merge(ad_race, how='left', on='person_id')

In [17]:
# Attach each control patient's race/ethnicity label to their diagnosis rows
con_race = con_demo[['person_id', 'UCSFDerivedRaceEthnicity_Clean']]
con_diag = con_diag.merge(con_race, how='left', on='person_id')

In [18]:
# Distinct race/ethnicity labels among the AD diagnosis rows, in order of first appearance
pd.unique(ad_diag['UCSFDerivedRaceEthnicity_Clean'])

array(['White or Caucasian', 'Unknown/Declined', 'Other', 'Asian',
       'Black or African American', 'Latinx',
       'Native Hawaiian or Other Pacific Islander',
       'American Indian or Alaska Native', 'Multi-Race/Ethnicity'],
      dtype=object)

### Only keep MatchIt person_ids for ad_diag and con_diag

In [19]:
# Demographics of the MatchIt cohorts (AD first, then control); their person_id
# column is used below to restrict the diagnosis tables.
ad_MatchIt, con_MatchIt = map(pd.read_csv, (
    'Demographics/RE_MI_ad_demo.csv',
    'Demographics/RE_MI_con_demo.csv',
))

In [20]:
# (rows, columns) of the AD MatchIt table.
# Presumably 422 patients in each of the 4 racialized groups (4 x 422 = 1688 rows) - confirm.
ad_MatchIt.shape

(1688, 10)

In [21]:
# (rows, columns) of the control MatchIt table.
# Presumably 844 patients in each of the 4 racialized groups (4 x 844 = 3376 rows) - confirm.
con_MatchIt.shape

(3376, 10)

In [22]:
# Keep only the diagnosis rows belonging to patients in the MatchIt cohorts
ad_is_matched = ad_diag['person_id'].isin(ad_MatchIt['person_id'])
con_is_matched = con_diag['person_id'].isin(con_MatchIt['person_id'])
ad_diag = ad_diag.loc[ad_is_matched]
con_diag = con_diag.loc[con_is_matched]

#### Phenotypes in common for each racialized population

In [23]:
# Racialized groups present among the AD patients. Missing labels are dropped so
# that building the dictionary keys below (label + suffix) cannot fail on NaN.
r_and_e = ad_diag['UCSFDerivedRaceEthnicity_Clean'].dropna().unique()

re_phenotypes = dict()

# For every group, collect the set of phenotypes observed in AD patients
# ('<group>_AD') and in control patients ('<group>_con').
#
# Missing phenotypes are dropped before building the sets: con_diag was left-merged
# onto the full control list, so controls without a retained diagnosis carry NaN in
# 'phenotype'. Left in, that NaN can pass through the set intersections taken
# afterwards and end up as a bogus "common phenotype" in the saved tables.
#
# The loop variable is deliberately not named `re`: that would shadow the
# standard-library module of the same name in the notebook namespace, which is
# shared with the helpers loaded via `%run -i`.
for group in r_and_e:
    ad_in_group = ad_diag['UCSFDerivedRaceEthnicity_Clean'] == group
    con_in_group = con_diag['UCSFDerivedRaceEthnicity_Clean'] == group
    re_phenotypes[group + '_AD'] = set(ad_diag.loc[ad_in_group, 'phenotype'].dropna().unique())
    re_phenotypes[group + '_con'] = set(con_diag.loc[con_in_group, 'phenotype'].dropna().unique())

In [24]:
# Racialized groups found among the AD patients after restricting to the MatchIt cohort
r_and_e

array(['White or Caucasian', 'Black or African American', 'Latinx',
       'Asian'], dtype=object)

In [25]:
# Phenotypes observed in AD patients of every one of the four racialized groups
common_phenotypes_AD = set.intersection(
    re_phenotypes['Asian_AD'],
    re_phenotypes['Black or African American_AD'],
    re_phenotypes['Latinx_AD'],
    re_phenotypes['White or Caucasian_AD'],
)

In [26]:
# Phenotypes observed in control patients of every one of the four racialized groups
common_phenotypes_con = set.intersection(
    re_phenotypes['Asian_con'],
    re_phenotypes['Black or African American_con'],
    re_phenotypes['Latinx_con'],
    re_phenotypes['White or Caucasian_con'],
)

In [27]:
# Number of phenotypes shared by the AD patients of all four racialized groups
len(common_phenotypes_AD)

931

In [28]:
# Number of phenotypes shared by the control patients of all four racialized groups
len(common_phenotypes_con)

898

In [29]:
# Union: phenotypes shared across all groups among AD patients OR among control patients
common_phenotypes_UCSF = common_phenotypes_AD.union(common_phenotypes_con)

In [30]:
# Size of the union of the AD and control common-phenotype sets
len(common_phenotypes_UCSF)

1056

#### Phenotypes overlapping across all racialized populations in AD or control patients (union of the AD and control sets)

In [31]:
# One-column table holding the combined (AD or control) common phenotypes.
# Row order follows the set's iteration order, i.e. it is arbitrary at this point.
common_phenotypes_UCSF_dict = dict(common_phenotypes_UCSF=list(common_phenotypes_UCSF))
common_phenotypes_UCSF_pd = pd.DataFrame(common_phenotypes_UCSF_dict)

In [32]:
# Save, sorted alphabetically by phenotype.
# The index is reset after sorting so the index column written to the CSV runs
# 0..n-1 in sorted order. Without the reset it would carry the arbitrary positions
# the phenotypes had when the set was converted to a list, which can differ from
# one run to the next. The file layout (index column + phenotype column) is unchanged.
(
    common_phenotypes_UCSF_pd
    .sort_values(by='common_phenotypes_UCSF')
    .reset_index(drop=True)
    .to_csv('Tables/common_phenotypes_UCSF.csv')
)

#### Phenotypes overlapping across all racialized populations, AD patients only

In [33]:
# One-column table holding the phenotypes common to all groups among AD patients.
# Row order follows the set's iteration order, i.e. it is arbitrary at this point.
common_phenotypes_UCSF_AD_dict = dict(common_phenotypes_UCSF_AD=list(common_phenotypes_AD))
common_phenotypes_UCSF_AD_pd = pd.DataFrame(common_phenotypes_UCSF_AD_dict)

In [34]:
# Save, sorted alphabetically by phenotype.
# The index is reset after sorting so the index column written to the CSV runs
# 0..n-1 in sorted order. Without the reset it would carry the arbitrary positions
# the phenotypes had when the set was converted to a list, which can differ from
# one run to the next. The file layout (index column + phenotype column) is unchanged.
(
    common_phenotypes_UCSF_AD_pd
    .sort_values(by='common_phenotypes_UCSF_AD')
    .reset_index(drop=True)
    .to_csv('Tables/common_phenotypes_UCSF_AD.csv')
)