In [87]:
import pandas as pd
import numpy as np
import os
from collections import Counter
from umls_utils import get_paths

In [86]:
# Shell escape: export this notebook (HGNC-HPO.ipynb) to a plain Python script
# with nbconvert; --no-prompt omits the In[]/Out[] prompts from the export.
!jupyter nbconvert --to script --no-prompt HGNC-HPO.ipynb

[NbConvertApp] Converting notebook HGNC-HPO.ipynb to script
[NbConvertApp] Writing 3171 bytes to HGNC-HPO.py


In [88]:
# Get paths from config file.
# NOTE(review): this is a machine-specific absolute path; anyone else running
# the notebook has to edit it. Consider moving it to a single config cell or
# an environment variable.
config_path = '/Users/stearb/Dropbox/CHOP/R03/code/neo4j_build_CFDIKG/build_scripts/'

# get_paths (from umls_utils) returns the six directory/config values unpacked below.
data_dir,helper_data_dir,output_dir,LOCAL_CPU,umls_dir,umls_out_dir = get_paths(config_path)

# Make sure the HGNC_HPO output folder exists before anything is written to it.
# os.makedirs also creates missing parent directories (os.mkdir would raise
# FileNotFoundError if output_dir itself did not exist), and exist_ok=True
# keeps the call safe if the directory appears between the check and the call.
hgnc_hpo_out_dir = output_dir+'HGNC_HPO'
if not os.path.isdir(hgnc_hpo_out_dir):
    os.makedirs(hgnc_hpo_out_dir, exist_ok=True)
    print('Creating HGNC_HPO directory...')

In [89]:
# Column labels for the raw HPO/gene mapping file. The file is read without a
# header (its first line is skipped), so the labels are supplied here.
hpo_gene_columns = ['HPO-id','HPO label','entrez-gene-id','entrez-gene-symbol',
                    'Additional Info from G-D source','G-D source','disease-ID for link']

# Load the tab-separated mapping file, label the columns, keep only the HPO id
# and the gene symbol, and call the symbol column 'symbol' so it can be joined
# against other tables on that key.
hgnc_hpo = (
    pd.read_csv(data_dir+'hgnc_hpo_mappings.txt', sep='\t', skiprows=1, header=None)
      .set_axis(hpo_gene_columns, axis=1)
      [['HPO-id','entrez-gene-symbol']]
      .rename(columns={'entrez-gene-symbol':'symbol'})
)
hgnc_hpo.head(5)

Unnamed: 0,HPO-id,symbol
0,HP:0000002,COX1
1,HP:0000002,AFF2
2,HP:0000002,ANAPC1
3,HP:0000002,MAP2K1
4,HP:0000002,KDM6A


### Load hgnc_master, which maps each gene symbol (`symbol`) to its HGNC ID (`hgnc_id`)

In [90]:
# Read the two-column HGNC lookup (hgnc_id, symbol). The CSV also carries an
# 'Unnamed: 0' column, which is discarded right after loading.
hgnc_master = (
    pd.read_csv(helper_data_dir+'hgnc_master_2cols.txt')
      .drop(columns=['Unnamed: 0'])
)
hgnc_master.head(3)

Unnamed: 0,hgnc_id,symbol
0,HGNC:5,A1BG
1,HGNC:37133,A1BG-AS1
2,HGNC:24086,A1CF


### Merge in HGNC Codes

In [91]:
# Attach the HGNC identifier to every (HPO id, gene symbol) row by joining on
# 'symbol'. This is an inner join, so rows whose symbol has no entry in
# hgnc_master are dropped. The symbol column is no longer needed afterwards,
# and exact duplicate (HPO-id, HGNC_ID) pairs are removed.
hgnc_hpo_mappings = (
    pd.merge(left=hgnc_hpo, right=hgnc_master, on='symbol', how='inner')
      .drop(columns=['symbol'])
      .rename(columns={'hgnc_id':'HGNC_ID'})
      .drop_duplicates()
)
hgnc_hpo_mappings.head(3)

Unnamed: 0,HPO-id,HGNC_ID
0,HP:0000002,HGNC:3776
1,HP:0000152,HGNC:3776
3,HP:0000153,HGNC:3776


### Merge in HGNC CUIs

In [92]:
# Load the UMLS CUI -> CODE table from its pickled form.
# (An earlier version of this cell read umls_dir+'CUI-CODEs.csv' with pd.read_csv.)
# NOTE: unpickling can execute arbitrary code; only load pickles you trust.
UMLS_CUI_CODEs = pd.read_pickle(umls_dir+'CUI-CODEs.pickle')

# Keep only the rows whose code starts with 'HGNC' and give the two columns
# gene-specific names.
is_hgnc_code = UMLS_CUI_CODEs[':END_ID'].str.startswith('HGNC')

# The code is split on a single space and the second token is kept
# (presumably values look like '<SAB> <code>' -- verify against the pickle).
umls_genes = (
    UMLS_CUI_CODEs[is_hgnc_code]
      .rename(columns={':START_ID':'CUI_hgnc',':END_ID':'HGNC_ID'})
      .assign(HGNC_ID=lambda genes: [code.split(' ')[1] for code in genes['HGNC_ID']])
)
umls_genes.head(3)

Unnamed: 0,CUI_hgnc,HGNC_ID
14067,C0694879,HGNC:3513
24230,C1332096,HGNC:546
24231,C1332123,HGNC:896


In [93]:
# Add the gene CUI (CUI_hgnc) to each HPO/HGNC pair via an inner join on the
# HGNC id; pairs whose HGNC id has no CUI in umls_genes are dropped.
df = pd.merge(left=hgnc_hpo_mappings, right=umls_genes, on='HGNC_ID', how='inner')
df.head(3)

Unnamed: 0,HPO-id,HGNC_ID,CUI_hgnc
0,HP:0000002,HGNC:3776,C1826605
1,HP:0000152,HGNC:3776,C1826605
2,HP:0000153,HGNC:3776,C1826605


### Merge in HPO CUIs

In [94]:
# Select the UMLS rows whose code starts with 'HPO' and rename the columns so
# the code column matches the 'HPO-id' join key used in df.
is_hpo_code = UMLS_CUI_CODEs[':END_ID'].str.startswith('HPO')

# As with the gene codes, the value is split on a space and the second token
# is kept as the HPO id.
hpo_cuis = (
    UMLS_CUI_CODEs[is_hpo_code]
      .rename(columns={':START_ID':'CUI_hpo',':END_ID':'HPO-id'})
      .assign(**{'HPO-id': lambda phenos: [code.split(' ')[1] for code in phenos['HPO-id']]})
)

# Inner-join on the HPO id, then reduce to the distinct (gene CUI, HPO CUI)
# pairs; the source-vocabulary ids are not needed past this point.
final = (
    pd.merge(left=df, right=hpo_cuis, on='HPO-id', how='inner')
      .drop(columns=['HPO-id','HGNC_ID'])
      .drop_duplicates()
)
final.head(3)

Unnamed: 0,CUI_hgnc,CUI_hpo
0,C1826605,C4025901
1,C1426585,C4025901
2,C1334474,C4025901


### CUI-CUIs (create forward and reverse relationships)

In [95]:
# Forward edges: gene CUI -> phenotype CUI.
forwards = (
    final.rename(columns={'CUI_hgnc':':START_ID','CUI_hpo':':END_ID'})
         .assign(**{':TYPE': 'gene_associated_with_phenotype'})
)

# Reverse edges: phenotype CUI -> gene CUI. The columns are reordered so both
# tables share the same (:START_ID, :END_ID, :TYPE) layout before stacking.
reverse = (
    final.rename(columns={'CUI_hgnc':':END_ID','CUI_hpo':':START_ID'})
         [[':START_ID',':END_ID']]
         .assign(**{':TYPE': 'phenotype_associated_with_gene'})
)

# Stack both directions, tag every row with the source label, and persist the
# result. The original row indices are kept, so index labels repeat between
# the forward and reverse halves.
CUI_CUIs = pd.concat([forwards,reverse]).assign(SAB='HGNC__HPO')

CUI_CUIs.to_pickle(output_dir+'HGNC_HPO/CUI_CUIs_hgnc_hpo.pickle')


In [98]:
# Sanity check: number of distinct values in each column of the forward edge
# table (distinct start CUIs, distinct end CUIs, and relationship types).
forwards.nunique()

:START_ID     4545
:END_ID      10896
:TYPE            1
dtype: int64