In [1]:
import pandas as pd
import numpy as np
import os
from collections import Counter
from umls_utils import get_paths

# This Notebook creates relationships between HPO and HGNC Concept nodes in the UMLS graph

# The end of this workflow is different from the original HGNC-HPO.ipynb notebook located in /Users/stearb/Dropbox/CHOP/R03/code/HGNC-HPO, we are using Jonathan Silversteins workflow for the Neo4j CSV creation (meaning the files produced by this workflow will be the inputs into JS's workflow) ...so we only need to create 2 files, a nodes.tsv and an edges.tsv (instead of the ~6 files, CUIs, CUI-CUIs, Code-CUIs, Terms, etc.)

## The guide for how to create these new nodes and edges files can be found in the Data Distillerys [github](https://github.com/dbmi-pitt/UBKG/tree/main/user%20guide)

In [2]:
# !jupyter nbconvert --to script --no-prompt HGNC-HPO.ipynb

In [2]:
def fill_missing_cols(df):
    
    if 'node_id' not in df.columns: 
        raise ValueError('Must have at least a "node_id" column.')
        
    all_cols = set([ 'node_label', 'node_synonyms', 'node_dbxrefs',
            'node_definition','node_namespace','value','lowerbound','upperbound','unit'])
    
    missing_cols = list(all_cols - set(df.columns))
    nan_cols_df = pd.DataFrame(np.full([len(df), len(missing_cols)], np.nan),columns=missing_cols)

    if isinstance(df, pd.DataFrame):
        nan_cols_df.index = df.index
        return pd.concat([df,nan_cols_df],axis=1)
    elif isinstance(df, pl.DataFrame):
        # no index for polars
        return pl.concat([df,pl.from_pandas(nan_cols_df)],how='horizontal')
    else:
        raise ValueError(f'Must Pass either a pandas DataFrame or a polars DataFrame but recieved "{type(df)}".')


In [3]:
# Get paths from config file
config_path = '/Users/stearb/Dropbox/CHOP/R03/code/neo4j_build_CFDIKG/build_scripts/'

data_dir,helper_data_dir,output_dir,LOCAL_CPU,umls_dir,umls_out_dir = get_paths(config_path)

if not  os.path.isdir(output_dir+'HGNC_HPO'):
    os.mkdir(output_dir+'HGNC_HPO')
    print('Creating HGNC_HPO directory...')

In [4]:
hgnc_hpo = pd.read_csv(data_dir+'hgnc_hpo_mappings.txt',
                       sep='\t',skiprows=1,header=None)

hgnc_hpo.columns = ['HPO-id','HPO label','entrez-gene-id','entrez-gene-symbol',
                            'Additional Info from G-D source','G-D source','disease-ID for link']

hgnc_hpo = hgnc_hpo[['HPO-id','entrez-gene-symbol']]
hgnc_hpo.rename(columns={'entrez-gene-symbol':'symbol'},inplace=True)
hgnc_hpo.head(5)

Unnamed: 0,HPO-id,symbol
0,HP:0000002,COX1
1,HP:0000002,AFF2
2,HP:0000002,ANAPC1
3,HP:0000002,MAP2K1
4,HP:0000002,KDM6A


### Load in hgnc_master, which contains the gene_symbol - hgnc_code mappings

In [4]:
hgnc_master = pd.read_csv(helper_data_dir+'hgnc_master_2cols.txt')
hgnc_master.drop('Unnamed: 0',axis=1,inplace=True)
hgnc_master.head(3)

Unnamed: 0,hgnc_id,symbol
0,HGNC:5,A1BG
1,HGNC:37133,A1BG-AS1
2,HGNC:24086,A1CF


### Merge in HGNC Codes

In [5]:
hgnc_hpo.head(1)

hgnc_hpo_mappings = pd.merge(hgnc_hpo,hgnc_master,on='symbol')

hgnc_hpo_mappings.drop('symbol',axis=1,inplace=True)
hgnc_hpo_mappings.rename(columns={'hgnc_id':'HGNC_ID'},inplace=True)

hgnc_hpo_mappings.drop_duplicates(inplace=True)
hgnc_hpo_mappings.head(3)

Unnamed: 0,HPO-id,HGNC_ID
0,HP:0000002,HGNC:3776
1,HP:0000152,HGNC:3776
3,HP:0000153,HGNC:3776


In [13]:
df = hgnc_hpo_mappings

In [15]:
df['predicate'] = 'associated_with'
df

Unnamed: 0,HPO-id,HGNC_ID,predicate
0,HP:0000002,HGNC:3776,associated_with
1,HP:0000152,HGNC:3776,associated_with
3,HP:0000153,HGNC:3776,associated_with
4,HP:0000159,HGNC:3776,associated_with
5,HP:0000163,HGNC:3776,associated_with
...,...,...,...
969752,HP:0011024,HGNC:29597,associated_with
969753,HP:0012647,HGNC:29597,associated_with
969754,HP:0012649,HGNC:29597,associated_with
969755,HP:0012718,HGNC:29597,associated_with


In [16]:
edges = df[['HGNC_ID','predicate','HPO-id']]
edges.columns = ['subject','predicate','object']
edges

Unnamed: 0,subject,predicate,object
0,HGNC:3776,associated_with,HP:0000002
1,HGNC:3776,associated_with,HP:0000152
3,HGNC:3776,associated_with,HP:0000153
4,HGNC:3776,associated_with,HP:0000159
5,HGNC:3776,associated_with,HP:0000163
...,...,...,...
969752,HGNC:29597,associated_with,HP:0011024
969753,HGNC:29597,associated_with,HP:0012647
969754,HGNC:29597,associated_with,HP:0012649
969755,HGNC:29597,associated_with,HP:0012718


# Can we include these nodes without including there Terms? If they are already in the graph, they won't be overwritten with no Terms right?

In [18]:
nodes = pd.DataFrame(pd.concat([edges['subject'].drop_duplicates(),edges['object'].drop_duplicates()]))
nodes.columns = ['node_id']

'''nodes['node_label'] = np.nan
nodes['node_synonyms'] = np.nan
nodes['node_namespace'] = np.nan
nodes['node_dbxrefs'] = np.nan
nodes['node_definition'] = np.nan'''

"nodes['node_label'] = np.nan\nnodes['node_synonyms'] = np.nan\nnodes['node_namespace'] = np.nan\nnodes['node_dbxrefs'] = np.nan\nnodes['node_definition'] = np.nan"

In [20]:
nodes = nodes.reset_index(drop=True)

In [21]:
nodes = fill_missing_cols(nodes)
nodes

Unnamed: 0,node_id,node_definition,node_label,node_namespace,value,node_dbxrefs,node_synonyms,upperbound,unit,lowerbound
0,HGNC:3776,,,,,,,,,
1,HGNC:19988,,,,,,,,,
2,HGNC:6840,,,,,,,,,
3,HGNC:12637,,,,,,,,,
4,HGNC:15455,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
14333,HP:0032037,,,,,,,,,
14334,HP:0100757,,,,,,,,,
14335,HP:0005423,,,,,,,,,
14336,HP:0012570,,,,,,,,,


In [22]:
nodes = nodes.dropna(subset=['node_id']).reset_index(drop=True)

In [24]:
nodes = nodes.reset_index(drop=True)

In [25]:
nodes.to_csv('/Users/stearb/Desktop/DESKTOP_TRANSFER/DataDistilleryFiles/hgnc_hpo/OWLNETS_node_metadata.txt',
             sep='\t',index=False)

edges.to_csv('/Users/stearb/Desktop/DESKTOP_TRANSFER/DataDistilleryFiles/hgnc_hpo/OWLNETS_edgelist.txt',
             sep='\t',index=False)