In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import os
from collections import Counter
import hashlib
import base64
from umls_utils import get_paths, CUIbase64

pd.set_option('display.max_columns', None)

In [67]:
!jupyter nbconvert --to script Glygen.ipynb 
!sed -i '' '/.head(/d' Glygen.py
!sed -i '' '/^#/d' Glygen.py
!sed -i '' '/get_ipython()/d' Glygen.py
!sed -i '' '/print/d' Glygen.py

[NbConvertApp] Converting notebook Glygen.ipynb to script
[NbConvertApp] Writing 8851 bytes to Glygen.py


# Glygen annotations workflow notebook

# Add 3 Glygen datasets, glycosyltransferases for human and mouse (2 datasets), and the glycans for human and mouse (1 dataset)

### This data relies on the HCOP HGNC data (mouse genes) which is not in the base UMLS graph database, its added in by me. I created a 'helper' file that contains the CUI-CODE-Term mappings for the mouse gene data so that if this glygen data is ingested before the mouse gene data it will not break the script

In [2]:
# Get paths from config file
config_path = '/Users/stearb/Dropbox/CHOP/R03/code/neo4j_build_CFDIKG/build_scripts/'

data_dir,helper_data_dir,output_dir,LOCAL_CPU,umls_dir,umls_out_dir = get_paths(config_path)


if not  os.path.isdir(output_dir+'glygen_annos'):
    os.mkdir(output_dir+'glygen_annos')
    print('Creating glygen_annos directory...')


### Load in some of the graph CSVs

In [3]:
UMLS_CUI_CODEs = pd.read_csv(umls_dir+'CUI-CODEs.csv')

umls_genes = UMLS_CUI_CODEs[UMLS_CUI_CODEs[':END_ID'].str.startswith('HGNC')].rename(
                                    columns={':START_ID':'CUI_hgnc',':END_ID':'HGNC_ID'})

UMLS_CODE_SUIs = pd.read_csv(umls_dir+'CODE-SUIs.csv')

# Filter for just the HGNC CodeID rows
umls_hgnc = UMLS_CODE_SUIs[UMLS_CODE_SUIs[':START_ID'].isin(umls_genes['HGNC_ID'])]


# Get just the rows that contain relationship type 'ACR', this is where the gene symbol/name is.
umls_hgnc_acr = umls_hgnc[umls_hgnc[':TYPE'] == 'ACR']

umls_hgnc_acr.rename(columns={':START_ID':'CodeID',':END_ID':'SUI:ID'},inplace=True) 

# Read in UMLS SUIs so we can map the name property (where the gene symbol/name is) to the HGNC CodeID, merge on SUI:ID
UMLS_SUIs = pd.read_csv(umls_dir+'SUIs.csv')

# Merge in the names of the HGNC IDs, merge on SUI:ID
umls_hgnc_ids2codes = pd.merge(umls_hgnc_acr,UMLS_SUIs,on='SUI:ID')

# Get just the HGNC CODE, not CodeID
umls_hgnc_ids2codes['HGNC_ID'] = [i.split(' ')[1] for i in umls_hgnc_ids2codes['CodeID']]


# Drop cols we dont need and change col names to match ucsc dataframe so we can merge in the HGNC gene names
umls_hgnc_ids2codes = umls_hgnc_ids2codes.drop(['SUI:ID',':TYPE'],axis=1).rename(
                                                                    columns={'name':'symbol'})
umls_hgnc_ids2codes.head(3)

Unnamed: 0,CodeID,CUI,symbol,HGNC_ID
0,HGNC HGNC:7436,C0919427,MTHFR,HGNC:7436
1,HGNC HGNC:277,C1332026,ADRA1A,HGNC:277
2,HGNC HGNC:2432,C1332789,CSF1,HGNC:2432


# Add in Human Glycosyltransferases data
### https://data.glygen.org/GLY_000004

In [4]:
df_GT = pd.read_csv(data_dir+'glycosyltransferase_and_glycans/human/human_protein_glycosyltransferase.csv')
df_GT.head(3)

Unnamed: 0,uniprotkb_canonical_ac,species,status,gene_symbol,uniprotkb_protein_name,ec_number,brenda_ec_number,cazy_gt_family,interpro_id,pfam_ac,go_molecular_function
0,A0PJZ3-1,Homo sapiens (Human),reviewed,GXYLT2,Glucoside xylosyltransferase 2 (EC 2.4.2.42) (...,2.4.2.42,,GT8,IPR002495|IPR029044,PF01501,UDP-xylosyltransferase activity [GO:0035252]
1,A6NG13-1,Homo sapiens (Human),reviewed,MGAT4D,"Alpha-1,3-mannosyl-glycoprotein 4-beta-N-acety...",,,,IPR006759,,acetylglucosaminyltransferase activity [GO:000...
2,A8MXE2-1,Homo sapiens (Human),reviewed,,"Putative UDP-GlcNAc:betaGal beta-1,3-N-acetylg...",2.4.1.-,,,IPR002659,PF01762,acetylgalactosaminyltransferase activity [GO:0...


In [5]:
cols = ['gene_symbol']#,'uniprotkb_canonical_ac','uniprotkb_protein_name','go_molecular_function']

df_GT_select = df_GT[cols].drop_duplicates('gene_symbol').rename(columns={'gene_symbol':'symbol'})

df_GT_select.head(3)

Unnamed: 0,symbol
0,GXYLT2
1,MGAT4D
2,


In [6]:
# we only lose 1 gene in the merge, its the 1 NaN gene_symbol entry in the df_GT df
glyco_df = pd.merge(df_GT_select,umls_hgnc_ids2codes)#.isna().sum()
glyco_df.head(2)

Unnamed: 0,symbol,CodeID,CUI,HGNC_ID
0,GXYLT2,HGNC HGNC:33383,C2239497,HGNC:33383
1,MGAT4D,HGNC HGNC:43619,C1853199,HGNC:43619


In [7]:
#umls_genes['HGNC_ID'] = [i.split(' ')[1] for i in umls_genes['HGNC_ID']]

In [8]:
# Create the HGNC CodeIDs manually, just add 'HGNC ' to the front of the HGNC ID
#glyco_df['CodeID_hgnc'] = ['HGNC ' + i for i in glyco_df['HGNC_ID']]

#glyco_df.head(2)

In [9]:
# Create the actual Term 'name'
glyco_df['name'] =  'Glycosyltransferase'    # pd.Series(glyco_term_name*len(glyco_df))
glyco_df[':TYPE'] = 'is_protein_type'

# Create the Tem unique ID, aka SUI
glyco_df['SUI'] = CUIbase64(glyco_df['name'] )

In [10]:
glyco_CODE_SUIs = glyco_df[['CodeID','SUI','CUI',':TYPE']].rename(columns={'CodeID':':START_ID','SUI':':END_ID'}) 
glyco_CODE_SUIs.head(2)

Unnamed: 0,:START_ID,:END_ID,CUI,:TYPE
0,HGNC HGNC:33383,R2x5Y29zeWx0cmFuc2ZlcmFzZQ==,C2239497,is_protein_type
1,HGNC HGNC:43619,R2x5Y29zeWx0cmFuc2ZlcmFzZQ==,C1853199,is_protein_type


In [11]:
glyco_SUIs = glyco_df[['SUI','name']].rename(columns={'SUI':'SUI:ID'})

# dropping dups will leave us with just the single term
glyco_SUIs.drop_duplicates(inplace=True)

glyco_SUIs.head(2)

Unnamed: 0,SUI:ID,name
0,R2x5Y29zeWx0cmFuc2ZlcmFzZQ==,Glycosyltransferase


# Add in Mouse Glycosyltransferases data
### https://data.glygen.org/GLY_000030

### Get mouse mapping/helper files

In [12]:
mouse_mapping_codes = pd.read_csv(helper_data_dir+'mouse_symbol_cui_codes.csv')
mouse_mapping_codes.rename(columns={'mouse_symbol':'gene_symbol'},inplace=True)
mouse_mapping_codes.head(3)

Unnamed: 0,CUI_mouse,CODE_mouse,CodeID_mouse,gene_symbol
0,SENPUCBIQ09QOkV4dDI=,HCOP:Ext2,HCOP HCOP:Ext2,Ext2
1,SENPUCBIQ09QOkFueGE4,HCOP:Anxa8,HCOP HCOP:Anxa8,Anxa8
2,SENPUCBIQ09QOkF2cHIxYg==,HCOP:Avpr1b,HCOP HCOP:Avpr1b,Avpr1b


In [13]:
mouse_mapping_codes

Unnamed: 0,CUI_mouse,CODE_mouse,CodeID_mouse,gene_symbol
0,SENPUCBIQ09QOkV4dDI=,HCOP:Ext2,HCOP HCOP:Ext2,Ext2
1,SENPUCBIQ09QOkFueGE4,HCOP:Anxa8,HCOP HCOP:Anxa8,Anxa8
2,SENPUCBIQ09QOkF2cHIxYg==,HCOP:Avpr1b,HCOP HCOP:Avpr1b,Avpr1b
3,SENPUCBIQ09QOkNjbDE5,HCOP:Ccl19,HCOP HCOP:Ccl19,Ccl19
4,SENPUCBIQ09QOkdtMjU2NA==,HCOP:Gm2564,HCOP HCOP:Gm2564,Gm2564
...,...,...,...,...
66151,SENPUCBIQ09QOkgyYWwxZw==,HCOP:H2al1g,HCOP HCOP:H2al1g,H2al1g
66152,SENPUCBIQ09QOkgyYWwxYw==,HCOP:H2al1c,HCOP HCOP:H2al1c,H2al1c
66153,SENPUCBIQ09QOkgyYWwxaA==,HCOP:H2al1h,HCOP HCOP:H2al1h,H2al1h
66154,SENPUCBIQ09QOkgyYWwxYQ==,HCOP:H2al1a,HCOP HCOP:H2al1a,H2al1a


In [14]:
# Load mouse Glycosyltransferases
df_GT_mouse = pd.read_csv(data_dir+'glycosyltransferase_and_glycans/mouse/mouse_protein_glycosyltransferase.csv')
df_GT_mouse = df_GT_mouse['gene_symbol']
df_GT_mouse.head(3)

0    Chsy1
1     Ext2
2     Ext1
Name: gene_symbol, dtype: object

In [15]:
df_mouse_merged = pd.merge(mouse_mapping_codes,df_GT_mouse)

In [16]:
df_mouse_merged['name'] =  'Glycosyltransferase'    # pd.Series(glyco_term_name*len(glyco_df))
df_mouse_merged[':TYPE'] = 'is_protein_type'

# Create the Tem unique ID, aka SUI
df_mouse_merged['SUI'] = CUIbase64(df_mouse_merged['name'] )

df_mouse_merged.head(2)

Unnamed: 0,CUI_mouse,CODE_mouse,CodeID_mouse,gene_symbol,name,:TYPE,SUI
0,SENPUCBIQ09QOkV4dDI=,HCOP:Ext2,HCOP HCOP:Ext2,Ext2,Glycosyltransferase,is_protein_type,R2x5Y29zeWx0cmFuc2ZlcmFzZQ==
1,SENPUCBIQ09QOkI0Z2FsdDQ=,HCOP:B4galt4,HCOP HCOP:B4galt4,B4galt4,Glycosyltransferase,is_protein_type,R2x5Y29zeWx0cmFuc2ZlcmFzZQ==


In [17]:
glyco_CODE_SUIs_mouse = df_mouse_merged[['CodeID_mouse','SUI','CUI_mouse',':TYPE']]\
                            .rename(columns=
                                            {'CodeID_mouse':':START_ID',
                                             'SUI'         :':END_ID',
                                             'CUI_mouse'   :'CUI'})

In [68]:
glyco_CODE_SUIs.head(2)

Unnamed: 0,:START_ID,:END_ID,CUI,:TYPE
0,HGNC HGNC:33383,R2x5Y29zeWx0cmFuc2ZlcmFzZQ==,C2239497,is_protein_type
1,HGNC HGNC:43619,R2x5Y29zeWx0cmFuc2ZlcmFzZQ==,C1853199,is_protein_type


In [41]:
glyco_CODE_SUIs_mouse.head(2)

Unnamed: 0,:START_ID,:END_ID,CUI,:TYPE
0,HCOP HCOP:Ext2,R2x5Y29zeWx0cmFuc2ZlcmFzZQ==,SENPUCBIQ09QOkV4dDI=,is_protein_type
1,HCOP HCOP:B4galt4,R2x5Y29zeWx0cmFuc2ZlcmFzZQ==,SENPUCBIQ09QOkI0Z2FsdDQ=,is_protein_type


### Have the glycosyltransferase Term node be the same for humans and mouse, or shouuld there be 2 seperate nodes?

#### can just reuse the human sui here

In [18]:
glyco_SUIs_mouse = df_mouse_merged[['SUI','name']].rename(columns={'SUI':'SUI:ID'}).drop_duplicates()

In [69]:
glyco_SUIs_mouse

Unnamed: 0,SUI:ID,name
0,R2x5Y29zeWx0cmFuc2ZlcmFzZQ==,Glycosyltransferase


# Add Glycans annotations, just like the glycosyltransferase annotations above, this dataset will be used to annotate gene Code nodes
#### The glycan dataset contains both human and mouse annotations
#### https://data.glygen.org/GLY_000284

In [19]:
glycans = pd.read_csv(data_dir+'glycosyltransferase_and_glycans/glycan_enzyme.csv')

df_glycans = glycans[['gene_name','species']]


In [20]:
glycan_human = df_glycans[df_glycans['species'] == 'human'].drop_duplicates()
glycan_mouse = df_glycans[df_glycans['species'] == 'mouse'].drop_duplicates()
glycan_mouse.head(2)

Unnamed: 0,gene_name,species
0,Alg2,mouse
5,Fut8,mouse


In [49]:
glycan_mouse

In [21]:
glycan_human_merged = pd.merge(umls_hgnc_ids2codes.rename(columns={'symbol':'gene_name'}),
                                                                 glycan_human)

glycan_human_merged.head(2)

Unnamed: 0,CodeID,CUI,gene_name,HGNC_ID,species
0,HGNC HGNC:23162,C1427958,ALG10,HGNC:23162,human
1,HGNC HGNC:920,C1412715,B3GALT5,HGNC:920,human


In [22]:
glycan_mouse_merged = pd.merge(mouse_mapping_codes.rename(columns={'gene_symbol':'gene_name'})
         ,glycan_mouse)

glycan_mouse_merged.head(2)

Unnamed: 0,CUI_mouse,CODE_mouse,CodeID_mouse,gene_name,species
0,SENPUCBIQ09QOkFsZzY=,HCOP:Alg6,HCOP HCOP:Alg6,Alg6,mouse
1,SENPUCBIQ09QOkRwYWd0MQ==,HCOP:Dpagt1,HCOP HCOP:Dpagt1,Dpagt1,mouse


In [23]:
# change column names to match eachother

glycans_both = pd.concat([glycan_human_merged.rename(columns={'HGNC_ID':'Code'}),
                        glycan_mouse_merged.rename(columns={'CODE_mouse':'Code','CUI_mouse':'CUI',
                                   'CodeID_mouse':'CodeID'}) ])


# Drop unneccessary cols
glycans_both.drop(['gene_name','species'],axis=1,inplace=True)
glycans_both.head(2)

Unnamed: 0,CodeID,CUI,Code
0,HGNC HGNC:23162,C1427958,HGNC:23162
1,HGNC HGNC:920,C1412715,HGNC:920


In [24]:
glycans_both['name'] =  'Glycan'    
glycans_both[':TYPE'] = 'is_protein_type'

# 'Glycan' Term already exists in the UMLS graph db, just set the SUI column to this nodes SUI
# SUI: S20147466
# name: Glycan
glycans_both['SUI'] =   'S20147466'    #CUIbase64(glycans_both['name'] )
glycans_both.head(2)

Unnamed: 0,CodeID,CUI,Code,name,:TYPE,SUI
0,HGNC HGNC:23162,C1427958,HGNC:23162,Glycan,is_protein_type,S20147466
1,HGNC HGNC:920,C1412715,HGNC:920,Glycan,is_protein_type,S20147466


In [25]:
glycans_CODE_SUIs = glycans_both[['CodeID','SUI','CUI',':TYPE']].rename(columns=
                                            {'CodeID':':START_ID',
                                             'SUI'   :':END_ID'})
glycans_CODE_SUIs.head(2)

Unnamed: 0,:START_ID,:END_ID,CUI,:TYPE
0,HGNC HGNC:23162,S20147466,C1427958,is_protein_type
1,HGNC HGNC:920,S20147466,C1412715,is_protein_type


In [26]:
assert glyco_CODE_SUIs[':END_ID'][0] == glyco_CODE_SUIs_mouse[':END_ID'][0]

### Concatenate glycosyltransferase and glycan CODE_SUIs and SUIs df's together

In [27]:
CODE_SUIs_all = pd.concat([glyco_CODE_SUIs,glyco_CODE_SUIs_mouse, glycans_CODE_SUIs])
SUIs_all = pd.concat([glyco_SUIs])  #,glycans_SUIs]) # glyco_SUIs_mouse, same as the human glyco term

assert len(CODE_SUIs_all.columns) == 4
assert len(SUIs_all.columns) == 2

### Save

In [64]:
CODE_SUIs_all.to_csv(output_dir+'glygen_annos/CODE_SUIs_glygenAnnos.csv',index=False)

SUIs_all.to_csv(output_dir+'glygen_annos/SUIs_glygenAnnos.csv',index=False)

In [28]:
SUIs_all

Unnamed: 0,SUI:ID,name
0,R2x5Y29zeWx0cmFuc2ZlcmFzZQ==,Glycosyltransferase


In [30]:
CODE_SUIs_all[CODE_SUIs_all[':START_ID'].str.startswith('HCOP')]

Unnamed: 0,:START_ID,:END_ID,CUI,:TYPE
0,HCOP HCOP:Ext2,R2x5Y29zeWx0cmFuc2ZlcmFzZQ==,SENPUCBIQ09QOkV4dDI=,is_protein_type
1,HCOP HCOP:B4galt4,R2x5Y29zeWx0cmFuc2ZlcmFzZQ==,SENPUCBIQ09QOkI0Z2FsdDQ=,is_protein_type
2,HCOP HCOP:Galnt6,R2x5Y29zeWx0cmFuc2ZlcmFzZQ==,SENPUCBIQ09QOkdhbG50Ng==,is_protein_type
3,HCOP HCOP:Galnt6,R2x5Y29zeWx0cmFuc2ZlcmFzZQ==,SENPUCBIQ09QOkdhbG50Ng==,is_protein_type
4,HCOP HCOP:Galnt3,R2x5Y29zeWx0cmFuc2ZlcmFzZQ==,SENPUCBIQ09QOkdhbG50Mw==,is_protein_type
...,...,...,...,...
53,HCOP HCOP:B3gnt8,S20147466,SENPUCBIQ09QOkIzZ250OA==,is_protein_type
54,HCOP HCOP:Mgat3,S20147466,SENPUCBIQ09QOk1nYXQz,is_protein_type
55,HCOP HCOP:B4galt3,S20147466,SENPUCBIQ09QOkI0Z2FsdDM=,is_protein_type
56,HCOP HCOP:Alg9,S20147466,SENPUCBIQ09QOkFsZzk=,is_protein_type
