### Phecodes consolidated
Consolidates the ICD-9- and ICD-10-mapped phecodes into a single CSV file. Where an ICD-9-mapped phecode also appears in the ICD-10 map, this version uses the phenotype from the ICD-10 map for that phecode.

In [None]:
import pandas as pd
import os

In [None]:
# Read in phecode files: the ICD-9 map (icd9) and the ICD-10-CM map (icd10).
# latin-1 is passed explicitly for both files — presumably they contain
# characters that are not valid UTF-8; confirm before changing the encoding.
icd9 = pd.read_csv("phecode_icd9_rolled.csv",  encoding='latin-1')
icd10 = pd.read_csv("Phecode_map_v1_2_icd10cm_beta.csv",  encoding='latin-1')

In [None]:
# Number of distinct phecodes in the ICD-10 map (a missing value, if any, counts once).
icd10['phecode'].nunique(dropna=False)

In [None]:
# Spot check: the ICD-9 rows whose description is exactly 'Cholera'.
icd9.loc[icd9['ICD9 String'].eq('Cholera')]

In [None]:
# Spot check: the ICD-10 rows whose description is exactly 'Cholera'.
icd10.loc[icd10['icd10cm_str'].eq('Cholera')]

In [None]:
# Discard the columns that are not carried into the consolidated table.
icd9 = icd9.drop(columns=['Rollup', 'Leaf', 'Ignore Bool'])
icd10 = icd10.drop(columns=['leaf', 'rollup'])

In [None]:
# Change column names so that they are the same in the two dataframes:
# icd
# icd_string
# phecode
# phenotype
# excl_phecodes
# excl_phenotypes

In [None]:
# Harmonise both frames to the shared column names:
# icd, icd_string, phecode, phenotype, excl_phecodes, excl_phenotypes.
icd9 = icd9.rename(columns={
    'ICD9': 'icd',
    'ICD9 String': 'icd_string',
    'PheCode': 'phecode',
    'Phenotype': 'phenotype',
    'Excl. Phecodes': 'excl_phecodes',
    'Excl. Phenotypes': 'excl_phenotypes',
})

# The ICD-10 frame's phecode column is already named 'phecode', so it is not listed.
icd10 = icd10.rename(columns={
    'icd10cm': 'icd',
    'icd10cm_str': 'icd_string',
    'phecode_str': 'phenotype',
    'exclude_range': 'excl_phecodes',
    'exclude_name': 'excl_phenotypes',
})

In [None]:
# Check that both frames have the same column names in the same order.
# Index.equals yields a single True/False and, unlike an element-wise `==`
# between the two column indexes, does not raise when the frames have a
# different number of columns.
icd9.columns.equals(icd10.columns)

In [None]:
# Number of distinct phecodes in the ICD-9 map (a missing value, if any, counts once).
icd9['phecode'].nunique(dropna=False)

In [None]:
# Number of distinct phecodes in the ICD-10 map (a missing value, if any, counts once).
icd10['phecode'].nunique(dropna=False)

#### For ICD-9-mapped phecodes that also appear in the ICD-10 map, use the ICD-10 phenotype for the same phecode

In [None]:
# Rows of the ICD-9 map whose phecode is also present in the ICD-10 map.
icd9_b = icd9.loc[icd9['phecode'].isin(icd10['phecode'])]

In [None]:
# (rows, columns) of the ICD-9 subset whose phecodes also occur in the ICD-10 map
icd9_b.shape

In [None]:
# Shape of the distinct (phecode, phenotype) pairs within the shared subset.
icd9_b.loc[:, ['phecode', 'phenotype']].drop_duplicates().shape

In [None]:
# Rows of the ICD-9 map whose phecode does NOT occur in the ICD-10 map.
# These keep their ICD-9 phenotype untouched and are concatenated back with
# icd9_b once that frame has received the ICD-10 phenotypes.
icd9_only = icd9.loc[~icd9['phecode'].isin(icd10['phecode'])]

In [None]:
# Keep just the ICD code, its description and the phecode in icd9_b;
# the remaining columns are re-attached from the ICD-10 map.
icd9_b = icd9_b.loc[:, ['icd', 'icd_string', 'phecode']]

In [None]:
# Merge ICD-10 phenotype, excl_phecodes, excl_phenotypes onto the ICD-9 rows by phecode.
# A phecode can appear on many icd10 rows, so merging against the raw frame
# multiplies every ICD-9 row by that count before drop_duplicates() collapses
# it again. De-duplicating the lookup columns first yields the same distinct
# rows in the same order without building the many-to-many intermediate.
# The trailing drop_duplicates() is still needed for repeated ICD-9 rows.
# Row labels differ from the un-deduplicated merge; the index is reset afterwards.
icd9_b = icd9_b.merge(
    icd10[['phecode', 'phenotype', 'excl_phecodes', 'excl_phenotypes']].drop_duplicates(),
    how='inner',
    on='phecode',
).drop_duplicates()

In [None]:
# Renumber the rows 0..n-1 and discard the old row labels left over from the merge.
icd9_b = icd9_b.reset_index(drop=True)

In [None]:
# Concatenate icd9_b and icd9_only, replacing icd9.
# Rows are stacked vertically and each part keeps its own row labels
# (ignore_index is not set), so labels can repeat in the result.
icd9 = pd.concat([icd9_b, icd9_only])

In [None]:
# Concatenate the ICD-9 rows and the ICD-10 rows into one combined frame.
# Row labels of both inputs are kept as-is, so labels can repeat.
icd = pd.concat([icd9, icd10])

In [None]:
# (rows, columns) of the combined ICD-9 + ICD-10 frame
icd.shape

In [None]:
# Sanity check: the combined frame must have exactly as many rows
# as the two frames it was built from.
len(icd9) + len(icd10) == len(icd)

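Check that there is only one phenotype per phecode: the number of distinct (phecode, phenotype) pairs should equal the number of distinct phecodes.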

In [None]:
# Shape of the distinct phecode values in the combined frame.
icd['phecode'].unique().shape

In [None]:
# Shape of the distinct phenotype values in the combined frame.
icd['phenotype'].unique().shape

In [None]:
# Shape of the distinct (phecode, phenotype) pairs in the combined frame.
icd.loc[:, ['phecode', 'phenotype']].drop_duplicates().shape

In [None]:
# Save
# Database name: phecodes_v2
#icd.to_csv('phecodes/all_phecodes_v2.csv', index=False)

In [None]:
# Read the saved combined table back in for inspection.
# NOTE(review): the only save call for this table is commented out and targets
# 'phecodes/all_phecodes_v2.csv', while this reads 'all_phecodes_v2.csv' from
# the working directory — confirm which path is intended.
pd.read_csv('all_phecodes_v2.csv')

#### Make modified icd dataframe with excl_phenotypes corresponding to values in the icd10 dataframe


In [None]:
# Excluded phenotypes in icd10 phecodes roughly correspond to ICD10 categories
# (number of rows per excl_phenotypes value; missing values are left out by default)
icd10['excl_phenotypes'].value_counts()

In [None]:
# Excluded phenotypes in icd dataframe
# (number of rows per excl_phenotypes value; missing values are left out by default)
icd['excl_phenotypes'].value_counts()

In [None]:
# Column names of the combined frame
icd.columns

In [None]:
# Copy of the combined frame without excl_phenotypes; that column is
# re-attached from the icd10 frame in the following merge.
icd_trunc = icd.loc[:, ['icd', 'icd_string', 'phecode', 'phenotype', 'excl_phecodes']]

In [None]:
# Attach the icd10 excl_phenotypes value to every row by phecode (inner join,
# so rows whose phecode does not occur in icd10 are dropped).
# A phecode repeats on many icd10 rows (e.g. 30 rows for phecode == 8.0), so
# merging against the raw frame would emit one output row per repetition.
# The lookup is therefore de-duplicated BEFORE the merge, which gives the same
# distinct rows in the same order without the many-to-many intermediate.
# The trailing drop_duplicates() still removes rows repeated in icd_trunc itself.
icd_trunc = icd_trunc.merge(
    right=icd10[['phecode', 'excl_phenotypes']].drop_duplicates(),
    on='phecode',
).drop_duplicates()

In [None]:
# (rows, columns) after attaching the icd10 excl_phenotypes
icd_trunc.shape

In [None]:
# (rows, columns) of the full combined frame, to compare with icd_trunc
icd.shape

In [None]:
# Rows of the combined frame whose phecode did not survive the merge with
# icd10, i.e. phecodes that are absent from the icd10 frame.
icd.loc[~icd['phecode'].isin(icd_trunc['phecode'])]

In [None]:
# excl_phenotypes values of the rows missing from icd_trunc, most frequent first.
icd.loc[
    ~icd['phecode'].isin(icd_trunc['phecode']),
    'excl_phenotypes',
].value_counts().index.tolist()

In [None]:
# Number of rows per excl_phenotypes value in icd_trunc (missing values are left out by default)
icd_trunc['excl_phenotypes'].value_counts()

In [None]:
# Number of distinct, non-missing excl_phenotypes values in icd_trunc.
icd_trunc['excl_phenotypes'].nunique()

In [None]:
# Save
# Database name: phecodes_cat_v2
# index=False keeps the DataFrame row labels out of the CSV.
icd_trunc.to_csv('cat_phecodes_v2.csv', index=False)

In [None]:
# Number of distinct phecodes in icd_trunc (a missing value, if any, counts once).
icd_trunc['phecode'].nunique(dropna=False)