In [2]:
import pandas as pd
import numpy as np
from collections import Counter
import os
from umls_utils import get_paths, CUIbase64

In [4]:
# Get paths from config file
config_path = '/Users/stearb/Dropbox/CHOP/R03/code/neo4j_build_CFDIKG/build_scripts/'
data_dir,helper_data_dir,output_dir,LOCAL_CPU, umls_dir, umls_out_dir = get_paths(config_path)


if not  os.path.isdir(output_dir+'kf_phenotypes'):
    os.mkdir(output_dir+'kf_phenotypes')
    print('Creating kf_phenotypes directory...')



In [5]:
output_dir

'/Users/stearb/Desktop/R03_local/data/use_config/OUTPUT_FILES/'

In [11]:
!ls '/Users/stearb/Desktop/R03_local/data/use_config/OUTPUT_FILES/'

[34mGTEx[m[m           [34mgenopheno[m[m      [34mhpo_mp_mapping[m[m [34morthologs[m[m
[34mMPO[m[m            [34mhgnc_annos[m[m     [34mkf_phenotypes[m[m


In [3]:

kf = pd.read_excel(data_dir'KF_April2020_PosPhenos_HPO_Taylor.xlsx',
                   sheet_name='PositivePhenotypes')
print(kf.shape)
kf.head(3)

(12031, 8)


Unnamed: 0,Project Name,participant_id,source_text_phenotype,hpo_id_phenotype,observed,Label,Synonyms,Description
0,"National Heart, Lung, and Blood Institute (NHL...",PT_GD9D1VNT,neurogenic bladder,HP:0000011,Positive,Neurogenic bladder,Lack of bladder control due to nervous system ...,A type of bladder dysfunction caused by neurol...
1,Genomic Analysis of Congenital Diaphragmatic H...,PT_VG1P6Z4G,inguinal hernia,HP:0000023,Positive,Inguinal hernia,,Protrusion of the contents of the abdominal ca...
2,"National Heart, Lung, and Blood Institute (NHL...",PT_0YBD5Z93,Inguinal Hernia,HP:0000023,Positive,Inguinal hernia,,Protrusion of the contents of the abdominal ca...


In [4]:
kf.nunique()

Project Name               13
participant_id           4266
source_text_phenotype    1338
hpo_id_phenotype          787
observed                    3
Label                     787
Synonyms                  590
Description               722
dtype: int64

In [5]:
df = kf[['participant_id','hpo_id_phenotype']].rename(
                    columns={'participant_id':'KF_CODE','hpo_id_phenotype':'HPO_CODE'})

df['KF_CodeID'] = ['KF ' + i for i in df['KF_CODE']]
df['HPO_CodeID'] = ['HPO ' + i for i in df['HPO_CODE']]


In [7]:
umls_dir #'/Users/stearb/Desktop/R03_local/data/use_config/HELPER_FILES/'

'/Users/stearb/Desktop/hubmap-kg/new_build_csv_data/'

In [6]:

# From CSVs
UMLS_CUI_CODE = pd.read_csv(umls_dir+'CUI-CODEs.csv')

umls_hpo = UMLS_CUI_CODE[UMLS_CUI_CODE[':END_ID'].str.startswith('HPO')].rename(
                    columns={':END_ID':'HPO_CODE',':START_ID':'HPO_CUI'})

umls_hpo.rename(columns={'HPO_CODE':'HPO_CodeID','HPO_CONCEPT':'CUI_HPO'},inplace=True)

#umls_hpo['CODE_HPO'] = [i.split(' ')[1] for i in umls_hpo['CODE_HPO']]

umls_hpo.head(3)

Unnamed: 0,HPO_CUI,HPO_CodeID
16,C0000778,HPO HP:0032224
107,C0002447,HPO HP:0009827
188,C0003463,HPO HP:0032186


In [7]:
df['KF_CUI'] = CUIbase64(df['KF_CodeID'])

df.head(3)

Unnamed: 0,KF_CODE,HPO_CODE,KF_CodeID,HPO_CodeID,KF_CUI
0,PT_GD9D1VNT,HP:0000011,KF PT_GD9D1VNT,HPO HP:0000011,S0YgUFRfR0Q5RDFWTlQ=
1,PT_VG1P6Z4G,HP:0000023,KF PT_VG1P6Z4G,HPO HP:0000023,S0YgUFRfVkcxUDZaNEc=
2,PT_0YBD5Z93,HP:0000023,KF PT_0YBD5Z93,HPO HP:0000023,S0YgUFRfMFlCRDVaOTM=


In [19]:
df_merge = pd.merge(df,umls_hpo)
df_merge.nunique()

KF_CODE       4265
HPO_CODE       784
KF_CodeID     4265
HPO_CodeID     784
KF_CUI        4265
HPO_CUI       1002
dtype: int64

CUIs

In [56]:
CUIs = df_merge['KF_CUI'].rename('CUI:ID')
CUIs.to_csv(output_dir+'kf_phenotypes/CUIs_kf.csv',index=False)

CUI-CUIs

In [22]:
CUI_CUIs = df_merge[['KF_CUI','HPO_CUI']].rename(columns={'KF_CUI':':START_ID','HPO_CUI':':END_ID'})

CUI_CUIs[':TYPE'] = 'patient_has_phenotype'
CUI_CUIs['SAB'] = 'KF_HPO'

CUI_CUIs_inverse = CUI_CUIs.rename(columns={':END_ID':':START_ID',':START_ID':':END_ID'})
CUI_CUIs_inverse = CUI_CUIs_inverse[[':START_ID',':END_ID',':TYPE','SAB']]
CUI_CUIs_inverse[':TYPE'] = 'phenotype_of_patient'
CUI_CUIs_inverse['SAB'] = 'KF_HPO'

CUI_CUIs_all = pd.concat([CUI_CUIs,CUI_CUIs_inverse])

CUI_CUIs_all.to_csv(output_dir+'kf_phenotypes/CUI_CUIs_kf.csv',index=False)

In [23]:
df_merge.head(3)

Unnamed: 0,KF_CODE,HPO_CODE,KF_CodeID,HPO_CodeID,KF_CUI,HPO_CUI
0,PT_GD9D1VNT,HP:0000011,KF PT_GD9D1VNT,HPO HP:0000011,S0YgUFRfR0Q5RDFWTlQ=,C0005697
1,PT_VG1P6Z4G,HP:0000023,KF PT_VG1P6Z4G,HPO HP:0000023,S0YgUFRfVkcxUDZaNEc=,C0019294
2,PT_0YBD5Z93,HP:0000023,KF PT_0YBD5Z93,HPO HP:0000023,S0YgUFRfMFlCRDVaOTM=,C0019294


CODEs

In [24]:
CODEs = df_merge[['KF_CodeID','KF_CODE']]

CODEs['SAB'] = 'KF'
CODEs = CODEs[['KF_CodeID','SAB','KF_CODE']]

CODEs.to_csv(output_dir+'kf_phenotypes/CODEs_kf.csv',index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


CUI-CODEs

In [81]:
CUI_CODEs = df_merge[['KF_CUI','KF_CodeID']].rename(columns={'KF_CUI':':START_ID','KF_CODE':':END_ID'})

CUI_CODEs.to_csv(outpath+'kf_phenotypes/CUI_CODEs_kf.csv',index=False)

In [33]:

a=pd.concat([CUI_CUIs[':START_ID'],CUI_CUIs[':END_ID']])
a= pd.DataFrame(a,columns=['CUI'])

In [61]:
b= pd.DataFrame(CUIs.values,columns=['CUI'])

In [57]:
CUIs.values

array(['S0YgUFRfR0Q5RDFWTlQ=', 'S0YgUFRfVkcxUDZaNEc=',
       'S0YgUFRfMFlCRDVaOTM=', ..., 'S0YgUFRfWVZOR0I3UzE=',
       'S0YgUFRfMldKUTRYWFY=', 'S0YgUFRfSDEzUTdTNjQ='], dtype=object)

In [58]:
cuis_in = a[~a['CUI'].isin(b['CUI'])]

In [59]:
sum(cuis_in['CUI'].str.startswith('C')) == len(cuis_in)

True

In [63]:
b.nunique()

CUI    4265
dtype: int64

In [78]:

a_noC = [i if not i.startswith('C')==True else np.nan for i  in a['CUI']]
#Counter(a_noC).most_common()
a_noC = pd.DataFrame(a_noC,columns=['CUI']).dropna()
a_noC.nunique()

CUI    4265
dtype: int64