In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
#from matplotlib_venn import venn2,venn3
from collections import Counter

pd.options.display.max_colwidth = 100

In [2]:
# !jupyter nbconvert --to script orthologs_JS.ipynb

In [2]:
def fill_missing_cols(df):
    """Return a copy of ``df`` padded with the node-metadata columns it lacks.

    The frame must already contain a ``node_id`` column. Any of the columns
    ``node_label``, ``node_synonyms``, ``node_dbxrefs``, ``node_definition``,
    ``node_namespace``, ``value``, ``lowerbound``, ``upperbound`` and ``unit``
    that are absent are appended, filled with NaN, after the existing columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Node table with at least a ``node_id`` column.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the missing columns appended; the index of ``df`` is kept.

    Raises
    ------
    ValueError
        If ``df`` has no ``node_id`` column.
    """
    if 'node_id' not in df.columns:
        raise ValueError('Must have at least a "node_id" column.')

    # An ordered list (rather than a set) keeps the appended columns in the
    # same order on every run, so the exported file layout is reproducible.
    expected_cols = ['node_label', 'node_synonyms', 'node_dbxrefs',
                     'node_definition', 'node_namespace', 'value',
                     'lowerbound', 'upperbound', 'unit']

    missing_cols = [col for col in expected_cols if col not in df.columns]

    # Build the NaN block on df's own index so concat aligns row-for-row.
    nan_cols_df = pd.DataFrame(np.full((len(df), len(missing_cols)), np.nan),
                               index=df.index, columns=missing_cols)
    return pd.concat([df, nan_cols_df], axis=1)

## Notebook for preprocessing human-mouse ortholog genes from HGNC HCOP 
From HGNC website: https://www.genenames.org/tools/hcop/   (select bulk download at the bottom)
The SAB (source ontology) for the mouse gene concept nodes is HGNC Comparison of Orthology Predictions (HCOP), so I will make the SAB attribute on the mouse gene Code nodes 'HGNC HCOP'. (Historical note: as stated further down, MGI is now used as the SAB for mouse gene nodes.)

# The end of this workflow differs from the original orthologs.ipynb notebook (located at /Users/stearb/Dropbox/CHOP/R03/code/orthologs) because we are using Jonathan Silverstein's workflow for the Neo4j CSV creation, meaning the files produced by this workflow will be the inputs into JS's workflow. We therefore only need to create 2 files, a nodes.tsv and an edges.tsv, instead of the ~6 files (CUIs, CUI-CUIs, Code-CUIs, Terms, etc.).

## The guide for how to create these new nodes and edges files can be found in the [UBKG documentation](https://ubkg.docs.xconsortia.org/formats/)

# As of 1/6/24 we are using MGI as SAB for mouse gene nodes (previously HCOP)

In [3]:
# Read the tab-separated HCOP human-mouse bulk download into a DataFrame.
hgnc_bulk = pd.read_csv(
    '/Users/stearb/Desktop/DESKTOP_TRANSFER/R03_local/data 2/orthologs/human_mouse_hcop_fifteen_column.txt',
    sep='\t',
)

# Downstream string handling assumes every cell is populated, so stop here
# if the file contains any missing value.
assert not hgnc_bulk.isna().any().any()

# Show the dimensions and column names, then preview the first rows.
print(hgnc_bulk.shape, hgnc_bulk.columns, sep='\n')
hgnc_bulk.head(3)

(69937, 15)
Index(['human_entrez_gene', 'human_ensembl_gene', 'hgnc_id', 'human_name',
       'human_symbol', 'human_chr', 'human_assert_ids', 'mouse_entrez_gene',
       'mouse_ensembl_gene', 'mgi_id', 'mouse_name', 'mouse_symbol',
       'mouse_chr', 'mouse_assert_ids', 'support'],
      dtype='object')


Unnamed: 0,human_entrez_gene,human_ensembl_gene,hgnc_id,human_name,human_symbol,human_chr,human_assert_ids,mouse_entrez_gene,mouse_ensembl_gene,mgi_id,mouse_name,mouse_symbol,mouse_chr,mouse_assert_ids,support
0,1,ENSG00000121410,HGNC:5,alpha-1-B glycoprotein,A1BG,19q13.43,"ENOG5035G3W,P04217,11167,Phy00089XY_HUMAN,1,HGNC:5,ENSP00000263100,141638at9347,ENSG00000121410,...",117586,ENSMUSG00000022347,MGI:2152878,alpha-1-B glycoprotein,A1bg,15,"ENOG5035G3W,Q19LI2,11167,Phy001S3S6_MOUSE,117586,MGI:2152878,ENSMUSP00000094151,141638at9347,ENS...","EggNOG,Inparanoid,HomoloGene,PhylomeDB,NCBI,HGNC,Treefam,OrthoDB,Ensembl,OMA,Panther,OrthoMCL"
1,29974,ENSG00000148584,HGNC:24086,APOBEC1 complementation factor,A1CF,10q11.23,"Q9NQ94,ENOG5035F4P,16363,HGNC:24086,29974,ENSP00000363105,ENSG00000148584,67756at9347,HUMAN|HGNC...",69865,ENSMUSG00000052595,MGI:1917115,APOBEC1 complementation factor,A1cf,19,"Q5YD48,ENOG5035F4P,16363,MGI:1917115,69865,ENSMUSP00000075235,ENSMUSG00000052595,67756at9347,MOU...","Inparanoid,EggNOG,HomoloGene,HGNC,NCBI,Treefam,Ensembl,OrthoDB,Panther,OrthoMCL"
2,2,ENSG00000175899,HGNC:7,alpha-2-macroglobulin,A2M,12p13.31,"P01023,ENOG5035HJM,37248,HGNC:7,ENSP00000323929,13407at9347,ENSG00000175899,HUMAN|HGNC=7|UniProt...",232345,ENSMUSG00000030111,MGI:2449119,alpha-2-macroglobulin,A2m,6,"Q6GQT1,ENOG5035HJM,37248,MGI:2449119,ENSMUSP00000032203,13407at9347,ENSMUSG00000030111,MOUSE|MGI...","Inparanoid,EggNOG,HomoloGene,HGNC,Treefam,OrthoDB,Ensembl,Panther,OMA,OrthoMCL"


In [4]:
# Keep only the identifier/name columns needed to build the node and edge files.
# .copy() makes `df` an independent frame: later cells add columns to it, and
# assigning into a bare column selection is the pattern behind pandas'
# SettingWithCopyWarning (not shown here because warnings are globally ignored).
df = hgnc_bulk[['hgnc_id','mouse_symbol','mouse_name','mgi_id']].copy()

In [5]:

# Derive HCOP-style identifiers from each mouse gene symbol:
#   CODE_mouse   -> 'HCOP:<symbol>'
#   CodeID_mouse -> 'HCOP ' followed by the CODE_mouse value
df['CODE_mouse'] = df['mouse_symbol'].map(lambda symbol: 'HCOP:' + symbol)
df['CodeID_mouse'] = df['CODE_mouse'].map(lambda code: 'HCOP ' + code)

In [6]:
# Save the HCOP CodeID -> MGI id lookup table (two columns, no index column).
df.to_csv('mgi_name_map.csv', columns=['CodeID_mouse', 'mgi_id'], index=False)

In [7]:
# Prefix every HGNC id with 'HGNC ' and store the result on hgnc_bulk.
# NOTE(review): this writes into hgnc_bulk, not df. The edge table below is
# built from df['hgnc_id'], so the prefix never reaches the edge list, and the
# values stored here look like 'HGNC HGNC:5'. Confirm whether this cell is
# still needed.
hgnc_bulk['hgnc_id'] = df['hgnc_id'].map(lambda hgnc_id: 'HGNC ' + hgnc_id)

In [8]:


# Node table: MGI id, gene name and gene symbol renamed to the node-metadata
# column names, de-duplicated on the full row, then padded with the remaining
# node-metadata columns.
nodes = (
    df[['mgi_id', 'mouse_name', 'mouse_symbol']]
    .rename(columns={'mgi_id': 'node_id',
                     'mouse_name': 'node_label',
                     'mouse_symbol': 'node_synonyms'})
    .drop_duplicates()
    .pipe(fill_missing_cols)
)

# RO_HOM0000020 = "in 1 to 1 orthology relationship with" (old rel: 'has_ortholog')
df['predicate'] = 'RO_HOM0000020'

# Edge table: mouse gene (subject) --predicate--> human gene (object).
edges = df[['mgi_id', 'predicate', 'hgnc_id']].rename(
    columns={'mgi_id': 'subject', 'hgnc_id': 'object'}
)
edges.head(3)

Unnamed: 0,subject,predicate,object
0,MGI:2152878,RO_HOM0000020,HGNC:5
1,MGI:1917115,RO_HOM0000020,HGNC:24086
2,MGI:2449119,RO_HOM0000020,HGNC:7


In [9]:
# Keep only the first occurrence of each (subject, predicate, object) row.
edges = edges[~edges.duplicated()]

In [10]:
# Sanity check: every edge subject must be present in the node table.
assert edges['subject'].isin(nodes['node_id']).all()

In [11]:
# Rewrite identifiers from 'SAB:code' to 'SAB code' (colon -> space) in the
# node ids and in both ends of every edge.
nodes['node_id'] = nodes['node_id'].map(lambda identifier: identifier.replace(':', ' '))
edges['subject'] = edges['subject'].map(lambda identifier: identifier.replace(':', ' '))
edges['object'] = edges['object'].map(lambda identifier: identifier.replace(':', ' '))

In [14]:
# Write the node metadata as a tab-separated file without the index column.
nodes.to_csv(
    '/Users/stearb/Desktop/DESKTOP_TRANSFER/DataDistilleryFiles/orthologs/OWLNETS_node_metadata_MGI.txt',
    index=False,
    sep='\t',
)

# Write the edge list in the same tab-separated, index-free layout.
edges.to_csv(
    '/Users/stearb/Desktop/DESKTOP_TRANSFER/DataDistilleryFiles/orthologs/OWLNETS_edgelist_MGI.txt',
    index=False,
    sep='\t',
)