In [1]:
import numpy as np
import os
import pandas as pd
#from matplotlib_venn import venn2,venn3
from collections import Counter
from umls_utils import get_paths, CUIbase64
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Export this notebook to a plain Python script, then strip lines from the
# exported script with a series of in-place sed deletions.
# NOTE(review): `sed -i ''` is the BSD/macOS in-place form; GNU sed expects
# `sed -i` without the empty argument — confirm the platform this runs on.
!jupyter nbconvert --to script orthologs-Copy1.ipynb
# Delete every line matching the regex `.head(` (the leading `.` matches any character).
!sed -i '' '/.head(/d' orthologs-Copy1.py
# Delete every line that starts with `#`.
!sed -i '' '/^#/d' orthologs-Copy1.py
# Delete every line containing `get_ipython()`.
!sed -i '' '/get_ipython()/d' orthologs-Copy1.py
# Delete every line containing `print`.
!sed -i '' '/print/d' orthologs-Copy1.py

[NbConvertApp] Converting notebook orthologs-Copy1.ipynb to script
[NbConvertApp] Writing 11004 bytes to orthologs-Copy1.py


In [6]:
# Get paths from config file.
# NOTE(review): this is a machine-specific absolute path; anyone else running
# the notebook has to edit it here (it is now the single place to change).
config_path = '/Users/stearb/Dropbox/CHOP/R03/code/neo4j_build_CFDIKG/build_scripts/'

# Pass config_path instead of repeating the literal, so the two can never drift apart.
data_dir,helper_data_dir,output_dir,LOCAL_CPU, umls_dir,\
            umls_out_dir = get_paths(config_path)

# All ortholog output files in this notebook are written below this directory.
# output_dir is concatenated directly (as everywhere else in this notebook),
# so it is presumably expected to end with a path separator — TODO confirm.
orthologs_dir = output_dir+'orthologs'
if not os.path.isdir(orthologs_dir):
    # makedirs also creates missing parent directories, where os.mkdir would raise.
    os.makedirs(orthologs_dir)
    print('Creating orthologs directory...')
   

## Notebook for downloading/cleaning human-mouse ortholog genes from HGNC HCOP for ingest into a Neo4j instance
From the HGNC website: https://www.genenames.org/tools/hcop/ (select "bulk download" at the bottom of the page).
The SAB (source ontology) for the mouse gene concept nodes is HGNC Comparison of Orthology Predictions (HCOP), so the SAB attribute on the mouse gene Code nodes is set to 'HGNC_HCOP'.

In [7]:

# Choose where the HCOP human-mouse bulk download lives: a local downloads
# folder when LOCAL_CPU is set, otherwise the configured data directory.
hcop_path = ('/Users/stearb/downloads/human_mouse_hcop_fifteen_column.txt'
             if LOCAL_CPU
             else data_dir+'human_mouse_hcop_fifteen_column.txt')
hgnc_bulk = pd.read_csv(hcop_path, sep='\t')

# The table must contain no missing values at all.
assert not hgnc_bulk.isna().to_numpy().any()

# Quick look at the size, the column names and the first rows.
for summary in (hgnc_bulk.shape, hgnc_bulk.columns):
    print(summary)
hgnc_bulk.head(3)

(69937, 15)
Index(['human_entrez_gene', 'human_ensembl_gene', 'hgnc_id', 'human_name',
       'human_symbol', 'human_chr', 'human_assert_ids', 'mouse_entrez_gene',
       'mouse_ensembl_gene', 'mgi_id', 'mouse_name', 'mouse_symbol',
       'mouse_chr', 'mouse_assert_ids', 'support'],
      dtype='object')


Unnamed: 0,human_entrez_gene,human_ensembl_gene,hgnc_id,human_name,human_symbol,human_chr,human_assert_ids,mouse_entrez_gene,mouse_ensembl_gene,mgi_id,mouse_name,mouse_symbol,mouse_chr,mouse_assert_ids,support
0,1,ENSG00000121410,HGNC:5,alpha-1-B glycoprotein,A1BG,19q13.43,"ENOG5035G3W,P04217,11167,Phy00089XY_HUMAN,1,HG...",117586,ENSMUSG00000022347,MGI:2152878,alpha-1-B glycoprotein,A1bg,15,"ENOG5035G3W,Q19LI2,11167,Phy001S3S6_MOUSE,1175...","EggNOG,Inparanoid,HomoloGene,PhylomeDB,NCBI,HG..."
1,29974,ENSG00000148584,HGNC:24086,APOBEC1 complementation factor,A1CF,10q11.23,"Q9NQ94,ENOG5035F4P,16363,HGNC:24086,29974,ENSP...",69865,ENSMUSG00000052595,MGI:1917115,APOBEC1 complementation factor,A1cf,19,"Q5YD48,ENOG5035F4P,16363,MGI:1917115,69865,ENS...","Inparanoid,EggNOG,HomoloGene,HGNC,NCBI,Treefam..."
2,2,ENSG00000175899,HGNC:7,alpha-2-macroglobulin,A2M,12p13.31,"P01023,ENOG5035HJM,37248,HGNC:7,ENSP0000032392...",232345,ENSMUSG00000030111,MGI:2449119,alpha-2-macroglobulin,A2m,6,"Q6GQT1,ENOG5035HJM,37248,MGI:2449119,ENSMUSP00...","Inparanoid,EggNOG,HomoloGene,HGNC,Treefam,Orth..."


## **** Need to decide what to do with the genes that don't have HGNC IDs.

In [8]:
# A missing HGNC ID is encoded as '-' in this table; keep only rows that have one
# and report how many rows were removed.
has_hgnc_id = hgnc_bulk['hgnc_id'].ne('-')
n_missing_hgnc_id = int((~has_hgnc_id).sum())

hgnc_bulk = hgnc_bulk.loc[has_hgnc_id]

print(f'{n_missing_hgnc_id} rows dropped because of no HGNC ID.')

1618 rows dropped because of no HGNC ID.


In [9]:
#hgnc_bulk[['human_ensembl_gene','hgnc_id','human_symbol']].to_csv('hgnc_ensembl_genes.csv')

In [10]:
# Number of distinct values in each column of hgnc_bulk.
nunique = hgnc_bulk.nunique()
# Display the count of distinct HGNC IDs; with this expression commented out
# the cell produced no output, which did not match the value recorded below it.
nunique['hgnc_id']

20988

In [11]:
# Count the distinct values of every column once, then report the three
# identifier columns we care about.
nunique = hgnc_bulk.nunique()

for column, label in (('hgnc_id', 'hgnc ids'),
                      ('human_symbol', 'human genes'),
                      ('mouse_symbol', 'mouse genes')):
    print(f"There are {nunique[column]} unique {label} in the df.")

There are 20988 unique hgnc ids in the df.
There are 20988 unique human genes in the df.
There are 22448 unique mouse genes in the df.


In [12]:
# Drop duplicate rows only if hgnc_id, human_symbol and mouse_symbol are all the same.
# The result is reassigned rather than mutated with inplace=True: hgnc_bulk is
# itself derived from an earlier row filter, and in-place mutation of such a
# derived frame is the pattern pandas flags with SettingWithCopyWarning
# (warnings are silenced in this notebook, so it would go unnoticed).
pre_drop_len = len(hgnc_bulk)
hgnc_bulk = hgnc_bulk.drop_duplicates(subset=['hgnc_id','human_symbol','mouse_symbol'])
print(str(pre_drop_len-len(hgnc_bulk)) + ' duplicate rows dropped.')

126 duplicate rows dropped.


In [13]:
# For reference, the Cypher query
#   match(n:Code {SAB:'HGNC'}) return count(n)
# reported 41,638 HGNC nodes in UMLS.

# Keep only the HGNC ID plus the mouse-gene columns needed downstream.
needed_columns = ['hgnc_id','mouse_symbol','mouse_ensembl_gene','mgi_id','mouse_name']
df = hgnc_bulk.loc[:, needed_columns]

In [14]:
# A row is discarded as soon as any of its selected columns holds the '-' placeholder.
has_placeholder = df.eq('-').any(axis=1)

df = df.loc[~has_placeholder]

print(f'{int(has_placeholder.sum())} rows dropped because of "-".')

1999 rows dropped because of "-".


### Create CodeID and CUI properties for the mouse gene nodes, using the same format as UMLS 

In [15]:

# Derive the mouse identifiers in three steps, each built from the previous one:
#   CODE_mouse   = 'HCOP:' + mouse symbol
#   CodeID_mouse = 'HCOP ' + CODE_mouse
#   CUI_mouse    = CUIbase64(CodeID_mouse)
# (An earlier sha256-based CUI scheme was replaced by CUIbase64.)
df = df.assign(
    CODE_mouse=lambda frame: frame['mouse_symbol'].map(lambda symbol: 'HCOP:' + symbol),
    CodeID_mouse=lambda frame: frame['CODE_mouse'].map(lambda code: 'HCOP ' + code),
    CUI_mouse=lambda frame: CUIbase64(frame['CodeID_mouse']),
)

# Every distinct mouse symbol must map to exactly one distinct CUI and one distinct CODE.
n_mouse_symbols = df['mouse_symbol'].nunique(dropna=False)
assert n_mouse_symbols == df['CUI_mouse'].nunique(dropna=False)
assert n_mouse_symbols == df['CODE_mouse'].nunique(dropna=False)

# Put the identifier columns first.
df = df[['CUI_mouse', 'CODE_mouse', 'CodeID_mouse', 'hgnc_id', 'mouse_symbol',
         'mouse_ensembl_gene', 'mgi_id', 'mouse_name']]

### Load HGNC CUI/CodeIDs 

We need to connect the gene nodes at the concept level, not at the code level, 
so we can simply line up the 'HGNC' CUIs with the 'HCOP HGNC' CUIs.
To do this we need the 'HGNC' CUIs from UMLS, together with their corresponding 'HGNC' Codes 
(the HGNC CUIs have no information attached to them, so we need to bring the HGNC code along with each one).

Cypher Query used: match (n:Code)--(m:Concept) where n.SAB = 'HGNC' return n.CODE as HGNC_CODE,m.CUI AS HGNC_CONCEPT 

In [16]:
# The UMLS HGNC CUI-code mappings used to come from a pre-exported helper CSV
# ('umls-genes-concepts-codes.csv'); they are now read straight from the
# CUI-CODEs table in umls_dir.
UMLS_CUI_CODEs = pd.read_pickle(umls_dir+'CUI-CODEs.pickle')

# Keep only the rows whose CodeID starts with 'HGNC' and give the two columns
# the names used in the rest of this notebook.
is_hgnc_code = UMLS_CUI_CODEs[':END_ID'].str.startswith('HGNC')
umls_genes = UMLS_CUI_CODEs.loc[is_hgnc_code].rename(
    columns={':START_ID':'CUI_human',':END_ID':'hgnc_id'})

# Keep the second space-separated token of each CodeID (e.g. 'HGNC:3513').
umls_genes['hgnc_id'] = umls_genes['hgnc_id'].map(lambda code_id: code_id.split(' ')[1])

umls_genes.head(3)

Unnamed: 0,CUI_human,hgnc_id
14900,C0694879,HGNC:3513
25633,C1332096,HGNC:546
25636,C1332123,HGNC:896


In [2]:
#venn2([set(umls_genes['hgnc_id']),set(df['hgnc_id'])],set_labels=('umls hgnc ids','hgnc-hcop coverage'))

In [17]:
# Restrict the UMLS frame to HGNC IDs that also occur in the HCOP frame.
in_hcop = umls_genes['hgnc_id'].isin(df['hgnc_id'])
umls_genes_shared = umls_genes.loc[in_hcop]

# Attach the human (HGNC) CUIs to the mouse rows via the shared HGNC ID;
# after the join the HGNC ID serves as the human CODE.
CUI2CUI_genes = (
    umls_genes_shared
    .merge(df, on='hgnc_id')
    .rename(columns={'hgnc_id':'CODE_human'})
)

print(CUI2CUI_genes.nunique())


CUI_human             20739
CODE_human            20741
CUI_mouse             21973
CODE_mouse            21973
CodeID_mouse          21973
mouse_symbol          21973
mouse_ensembl_gene    21963
mgi_id                21973
mouse_name            21962
dtype: int64


In [18]:
#CUI2CUI_genes[['CODE_human', 'CUI_human', 'CUI_mouse', 'CODE_mouse', 'CodeID_mouse',
#       'mouse_symbol']].head(2)

Unnamed: 0,CODE_human,CUI_human,CUI_mouse,CODE_mouse,CodeID_mouse,mouse_symbol
0,HGNC:3513,C0694879,SENPUCBIQ09QOkV4dDI=,HCOP:Ext2,HCOP HCOP:Ext2,Ext2
1,HGNC:546,C1332096,SENPUCBIQ09QOkFueGE4,HCOP:Anxa8,HCOP HCOP:Anxa8,Anxa8


In [19]:
# Save this just once: the glygen step needs this data when ingesting glycans and glycotransferases for mouse genes.
# This data is not in the database yet when we're ingesting the glygen data, so we need to create a helper file.

#CUI2CUI_genes[['CUI_mouse','CODE_mouse','CodeID_mouse','mouse_symbol']
#             ].to_csv('mouse_symbol_cui_codes.csv',index=False)

### Add ortholog relationships in both directions: has_mouse_ortholog and its inverse has_human_ortholog

#### Save CUIs

In [17]:
# One row per distinct mouse CUI, written out as a single-column table.
distinct_mouse_cuis = CUI2CUI_genes['CUI_mouse'].unique()
CUIs_mouse = pd.DataFrame({'CUI_mouse': distinct_mouse_cuis})

CUIs_mouse.to_pickle(output_dir+'orthologs/CUI_mouse_ortho.pickle')

#### Save CUI-CUIs

In [20]:
# Forward edges: human CUI -> mouse CUI ('has_mouse_ortholog').
CUI2CUI = (
    CUI2CUI_genes[['CUI_human','CUI_mouse']]
    .rename(columns={'CUI_human':':START_ID','CUI_mouse':':END_ID'})
    .assign(**{':TYPE': 'has_mouse_ortholog'})
)

# Inverse edges: mouse CUI -> human CUI ('has_human_ortholog').
CUI2CUI_inverse = (
    CUI2CUI_genes[['CUI_mouse','CUI_human']]
    .rename(columns={'CUI_mouse':':START_ID','CUI_human':':END_ID'})
    .assign(**{':TYPE': 'has_human_ortholog'})
)

# Stack both directions, tag every edge with the SAB, and write the table out.
CUI2CUI_all = pd.concat([CUI2CUI, CUI2CUI_inverse]).assign(SAB='HGNC__HGNC_HCOP')
CUI2CUI_all.to_pickle(output_dir+'orthologs/CUI_CUI_ortho.pickle')

#### Save CODEs   

In [21]:
# One row per distinct mouse Code, with columns ordered CodeID, SAB, CODE.
# NOTE(review): the CodeID values are prefixed with 'HCOP ' while the SAB
# written here is 'HGNC_HCOP' — confirm this mismatch is intended.
CODEs = (
    CUI2CUI_genes[['CodeID_mouse','CODE_mouse']]
    .drop_duplicates()
    .assign(SAB='HGNC_HCOP')
    .loc[:, ['CodeID_mouse','SAB','CODE_mouse']]
)
CODEs.to_pickle(output_dir+'orthologs/CODE_mouse_ortho.pickle')

#### Save (mouse) CUI-CODE

In [22]:
# Distinct (mouse CUI, mouse CodeID) pairs.
cui_code_columns = ['CUI_mouse','CodeID_mouse']
CUI_CODEs = CUI2CUI_genes.loc[:, cui_code_columns].drop_duplicates()
CUI_CODEs.to_pickle(output_dir+'orthologs/CUI_CODE_ortho.pickle')

### Create SUIs and Save CODE-SUI   (Terms for mouse_name, mouse_symbol and mgi_id columns)

### Need mgi ID or no?

In [23]:

# Each of these term columns gets a matching 'SUI_<column>' column computed by CUIbase64.
term_columns = ('mouse_symbol', 'mouse_name', 'mgi_id')

for term_column in term_columns:
    CUI2CUI_genes['SUI_' + term_column] = CUIbase64(CUI2CUI_genes[term_column])

# Within each column, the number of distinct SUIs must equal the number of distinct terms.
for term_column in term_columns:
    sui_values = CUI2CUI_genes['SUI_' + term_column]
    assert len(sui_values.unique()) == len(CUI2CUI_genes[term_column].unique())

Save SUIs

In [24]:
def _sui_nodes(term_column):
    """Return the (SUI:ID, name) table for one term column of CUI2CUI_genes."""
    return CUI2CUI_genes[['SUI_' + term_column, term_column]].rename(
        columns={'SUI_' + term_column: 'SUI:ID', term_column: 'name'})


SUI_mouse_symbol = _sui_nodes('mouse_symbol')

SUI_mouse_name = _sui_nodes('mouse_name')

SUI_mgi_id = _sui_nodes('mgi_id')

# CUI2CUI_genes can hold several rows for the same mouse gene, and a gene's name
# can equal its symbol, so the stacked table contains repeated (SUI:ID, name)
# rows. Drop them so each term is written once, consistent with the CODE,
# CUI-CODE and CODE-SUI tables, which are all de-duplicated before saving.
SUIs_all = pd.concat([SUI_mgi_id,SUI_mouse_name,SUI_mouse_symbol]).drop_duplicates()


SUIs_all.to_pickle(output_dir+'orthologs/SUIs_ortho.pickle')

###  Create and Save CODE-SUIs

columns = :START_ID, :END_ID, :TYPE, CUI  
:START_ID = CodeID  
:END_ID = SUI

In [25]:
def _code_sui_edges(sui_column, relationship):
    """Return CodeID -> SUI edges for one SUI column of CUI2CUI_genes.

    The result has the columns :START_ID (mouse CodeID), :END_ID (the SUI),
    CUI (mouse CUI) and :TYPE (set to `relationship` on every row).
    """
    return (
        CUI2CUI_genes[['CodeID_mouse', sui_column, 'CUI_mouse']]
        .rename(columns={'CodeID_mouse':':START_ID', sui_column:':END_ID', 'CUI_mouse':'CUI'})
        .assign(**{':TYPE': relationship})
    )


CODE_SUI_mouse_symbol = _code_sui_edges('SUI_mouse_symbol', 'gene_symbol')

CODE_SUI_mouse_name = _code_sui_edges('SUI_mouse_name', 'gene_name')

CODE_SUI_mgi_id = _code_sui_edges('SUI_mgi_id', 'mgi_id')


CODE_SUI = pd.concat([CODE_SUI_mgi_id,CODE_SUI_mouse_name,CODE_SUI_mouse_symbol])


# Sanity check: the number of distinct SUIs must equal the number of distinct
# term strings across the three term columns. Counting the strings directly
# replaces a hard-coded "- 1" correction for the single known case where a gene
# name equals its symbol (Taf7l2), which would break as soon as the input data
# gained or lost such an overlap.
# Assumes CUIbase64 maps distinct strings to distinct SUIs — TODO confirm.
n_distinct_terms = pd.concat(
    [CUI2CUI_genes[term_column] for term_column in ('mouse_name', 'mouse_symbol', 'mgi_id')]
).nunique()
assert n_distinct_terms == CODE_SUI[':END_ID'].nunique()


CODE_SUI = CODE_SUI.drop_duplicates()


CODE_SUI.to_pickle(output_dir+'orthologs/CODE_SUI_ortho.pickle')

#### Save whole df so we can check that there are no collisions with these CUIs and CUIs from the other steps

In [32]:
#CUI2CUI_genes.to_csv('/Users/stearb/desktop/R03_local/data/UI_check/orthologs_uicheck.csv',index=False)