In [1]:
import warnings
# Silence every warning for the whole session. This also hides pandas warnings
# (e.g. SettingWithCopyWarning) that later cells would otherwise raise.
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import os
from collections import Counter
import hashlib
from gtfparse import read_gtf
import base64
from matplotlib_venn import venn2
from IPython.display import Image
import  matplotlib.pyplot as plt
# Project-local helpers: get_paths reads the config file, CUIbase64 is used below to derive SUIs.
from umls_utils import get_paths, CUIbase64

# Show all columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [1]:
# Export this notebook to hgnc_annos_GENCODE.py, then strip notebook-only lines from the script.
# NOTE(review): `sed -i ''` is the BSD/macOS in-place form; GNU sed likely needs plain `-i` - confirm the target platform.
!jupyter nbconvert --to script hgnc_annos_GENCODE.ipynb
# Delete every line matching ".head(" (the leading dot is an unescaped regex wildcard, so any character matches).
!sed -i '' '/.head(/d' hgnc_annos_GENCODE.py
# Delete lines that start with "# " (column-0 comments and nbconvert cell markers).
!sed -i '' '/^# /d' hgnc_annos_GENCODE.py
# Delete the get_ipython() calls that nbconvert generates for magics and shell commands.
!sed -i '' '/get_ipython()/d' hgnc_annos_GENCODE.py
# Delete every line containing the substring "print", including any non-print statement that merely contains it.
!sed -i '' '/print/d' hgnc_annos_GENCODE.py

[NbConvertApp] Converting notebook hgnc_annos_GENCODE.ipynb to script
[NbConvertApp] Writing 9851 bytes to hgnc_annos_GENCODE.py


# This notebook loads gene annotation data from GENCODE (chr/start/end positions and strand) 

In [2]:
# Get paths from config file
# NOTE(review): absolute, user-specific path; consider making it configurable.
config_path = '/Users/stearb/Dropbox/CHOP/R03/code/neo4j_build_CFDIKG/build_scripts/'

data_dir,helper_data_dir,output_dir,LOCAL_CPU,umls_dir,umls_out_dir = get_paths(config_path)

# Every CSV written by this notebook goes under <output_dir>hgnc_annos/.
# output_dir is concatenated as-is, so it presumably ends with a path separator.
hgnc_annos_dir = output_dir+'hgnc_annos'
if not os.path.isdir(hgnc_annos_dir):
    # makedirs also creates missing parent directories; exist_ok avoids a crash
    # if the directory appears between the check above and this call.
    os.makedirs(hgnc_annos_dir, exist_ok=True)
    print('Creating hgnc_annos directory...')



In [4]:
# Display the data directory returned by get_paths.
data_dir

'/Users/stearb/Desktop/R03_local/data/use_config/R03_DATA/'

# Load data from GENCODE (https://www.gencodegenes.org/human/)

In [3]:
# Parse the GENCODE v38 basic annotation GTF (chromosomes, patches, haplotypes, scaffolds) from data_dir.
# An earlier local copy lived under /Users/stearb/Desktop/R03_local/data/hgnc_annotations/.
gencode_gtf_path = data_dir + 'gencode.v38.chr_patch_hapl_scaff.basic.annotation.gtf'
df = read_gtf(gencode_gtf_path)
df.head(3)

INFO:root:Extracted GTF attributes: ['gene_id', 'gene_type', 'gene_name', 'level', 'hgnc_id', 'havana_gene', 'transcript_id', 'transcript_type', 'transcript_name', 'transcript_support_level', 'tag', 'havana_transcript', 'exon_number', 'exon_id', 'ont', 'protein_id', 'ccdsid']


Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,gene_id,gene_type,gene_name,level,hgnc_id,havana_gene,transcript_id,transcript_type,transcript_name,transcript_support_level,tag,havana_transcript,exon_number,exon_id,ont,protein_id,ccdsid
0,chr1,HAVANA,gene,11869,14409,,+,0,ENSG00000223972.5,transcribed_unprocessed_pseudogene,DDX11L1,2,HGNC:37102,OTTHUMG00000000961.2,,,,,,,,,,,
1,chr1,HAVANA,transcript,11869,14409,,+,0,ENSG00000223972.5,transcribed_unprocessed_pseudogene,DDX11L1,2,HGNC:37102,OTTHUMG00000000961.2,ENST00000456328.2,processed_transcript,DDX11L1-202,1.0,basic,OTTHUMT00000362751.1,,,,,
2,chr1,HAVANA,exon,11869,12227,,+,0,ENSG00000223972.5,transcribed_unprocessed_pseudogene,DDX11L1,2,HGNC:37102,OTTHUMG00000000961.2,ENST00000456328.2,processed_transcript,DDX11L1-202,1.0,basic,OTTHUMT00000362751.1,1.0,ENSE00002234944.1,,,


In [4]:
#a=df[df['gene_name'] == 'SNORA70']
#b = a[a['seqname'].str.startswith('chr')]
#c = b[b['feature'] == 'gene']

In [5]:
# Keep only the columns needed to describe a gene's location.
cols = ['seqname','feature','start','end','strand','gene_name']
df = df[cols]

# Gene-level records only; 'feature' is constant afterwards, so drop it.
is_gene_row = df['feature'] == 'gene'
genecode = df[is_gene_row].drop('feature', axis=1)

# Keep rows whose sequence name starts with 'chr', and call the gene name 'symbol'.
has_chr_prefix = genecode['seqname'].str.startswith('chr')
genecode = genecode[has_chr_prefix].rename(columns={'gene_name':'symbol'})

genecode.head(3)

Unnamed: 0,seqname,start,end,strand,symbol
0,chr1,11869,14409,+,DDX11L1
12,chr1,14404,29570,-,WASH7P
25,chr1,17369,17436,-,MIR6859-1


In [6]:
# Row/column count, then the number of distinct symbols. If there are fewer
# distinct symbols than rows, some symbols occur on more than one row.
print(genecode.shape)
genecode['symbol'].nunique()

(60649, 5)


59385

### Load UMLS_HGNC_CUIs -to- HGNC_ID  mappings
These mappings must be read from the UMLS CSVs, not from the graph.

In [7]:

# CUI -> CodeID edges exported from UMLS.
UMLS_CUI_CODEs = pd.read_csv(umls_dir+'CUI-CODEs.csv')

# Keep the edges whose CodeID starts with 'HGNC' and give the columns descriptive names.
points_to_hgnc = UMLS_CUI_CODEs[':END_ID'].str.startswith('HGNC')
umls_genes = UMLS_CUI_CODEs[points_to_hgnc].rename(
    columns={':START_ID':'CUI_hgnc', ':END_ID':'HGNC_ID'})
umls_genes.head(3)

Unnamed: 0,CUI_hgnc,HGNC_ID
14900,C0694879,HGNC HGNC:3513
25633,C1332096,HGNC HGNC:546
25636,C1332123,HGNC HGNC:896


### Load HGNC_ID--Gene_name mappings and then merge in HGNC_IDs 

We now get these mappings (HGNC CodeID to gene symbol/name) directly from the UMLS CSVs.

In [10]:
# Load the two UMLS tables used here: Code -> SUI edges and the SUI -> name lookup.
UMLS_CODE_SUIs = pd.read_csv(umls_dir+'CODE-SUIs.csv')
UMLS_SUIs = pd.read_csv(umls_dir+'SUIs.csv')

# Restrict the Code -> SUI edges to HGNC CodeIDs.
# This relies on umls_genes['HGNC_ID'] still holding full CodeIDs ('HGNC HGNC:<n>').
# A later cell strips that prefix in place, so re-running this cell afterwards matches nothing.
is_hgnc_edge = UMLS_CODE_SUIs[':START_ID'].isin(umls_genes['HGNC_ID'])
umls_hgnc = UMLS_CODE_SUIs[is_hgnc_edge]

# Edges of type 'ACR' are the ones that carry the gene symbol/name.
umls_hgnc_acr = umls_hgnc[umls_hgnc[':TYPE'] == 'ACR'].rename(
    columns={':START_ID':'CodeID', ':END_ID':'SUI:ID'})

# Attach the SUI name (the gene symbol) to each HGNC CodeID.
umls_hgnc_ids2codes = pd.merge(umls_hgnc_acr, UMLS_SUIs, on='SUI:ID')

# CodeID is '<SAB> <CODE>'; keep only the CODE part.
umls_hgnc_ids2codes['HGNC_ID'] = umls_hgnc_ids2codes['CodeID'].map(
    lambda code_id: code_id.split(' ')[1])

# Drop helper columns and rename 'name' to 'symbol' so it can be joined to the GENCODE frame.
umls_hgnc_ids2codes = (
    umls_hgnc_ids2codes
    .drop(columns=['CodeID','SUI:ID',':TYPE','CUI'])
    .rename(columns={'name':'symbol'})
)
umls_hgnc_ids2codes.head(3)

Unnamed: 0,symbol,HGNC_ID
0,MTHFR,HGNC:7436
1,ADRA1A,HGNC:277
2,CSF1,HGNC:2432


In [11]:
# Number of (symbol, HGNC_ID) rows obtained from UMLS.
umls_hgnc_ids2codes.shape

(41998, 2)

In [12]:
# Attach HGNC IDs to the GENCODE genes by symbol. Rows left with any NaN after the
# left join (e.g. a symbol with no HGNC match) are dropped.
# Note: despite the 'ucsc_' prefix, this frame holds the GENCODE rows built above.
ucsc_hgnc = genecode.merge(umls_hgnc_ids2codes, how='left', on='symbol').dropna()
ucsc_hgnc.head(3)

Unnamed: 0,seqname,start,end,strand,symbol,HGNC_ID
0,chr1,11869,14409,+,DDX11L1,HGNC:37102
1,chr1,14404,29570,-,WASH7P,HGNC:38034
2,chr1,17369,17436,-,MIR6859-1,HGNC:50039


In [13]:
# Rows that survived the symbol join.
ucsc_hgnc.shape

(39102, 6)

In [14]:
# Distinct values per column; fewer distinct symbols than rows means duplicated symbols.
ucsc_hgnc.nunique()

seqname       25
start      39028
end        38976
strand         2
symbol     38941
HGNC_ID    38941
dtype: int64

In [15]:
#ucsc_hgnc[ucsc_hgnc['symbol'].duplicated()]

In [16]:
#ucsc_hgnc[ucsc_hgnc['symbol'] == 'SNORA70']

In [17]:
#Counter(ucsc_hgnc['symbol']).most_common()
#dict(Counter(ucsc_hgnc['symbol']).most_common())

In [18]:
#Counter(ucsc_hgnc['symbol']).most_common()

# For now, just drop rows with a duplicated 'symbol' (the first occurrence of each symbol is kept)

In [19]:
# Keep the first row for each symbol and discard later repeats.
ucsc_hgnc = ucsc_hgnc.drop_duplicates(subset='symbol')

In [20]:
# Get just the hgnc code, not the code id ('HGNC HGNC:3513' -> 'HGNC:3513').
# Taking the LAST space-separated token (rather than index 1) keeps this cell safe to
# re-run: an already-stripped value has a single token and is left unchanged, whereas
# indexing [1] raised IndexError on the second run.
umls_genes['HGNC_ID'] = umls_genes['HGNC_ID'].str.split(' ').str[-1]

# Attach the HGNC concept CUI to every gene row; how='left' keeps all gene rows.
ucsc_hgnc_umls = pd.merge(left=ucsc_hgnc,right=umls_genes,how='left',on='HGNC_ID')

In [21]:
# Every gene row must have found a CUI: no cell anywhere in the frame may be missing.
assert not ucsc_hgnc_umls.isna().any().any()

In [22]:

# Rebuild the full HGNC CodeID ('HGNC <code>') next to the bare HGNC_ID column.
# The original cell first called .rename(columns={'HGNC_ID':'CodeID_hgnc'}) and discarded
# the result, which did nothing. It is removed: HGNC_ID must keep its name because the
# next line reads it.
ucsc_hgnc_umls['CodeID_hgnc'] = 'HGNC ' + ucsc_hgnc_umls['HGNC_ID']

ucsc_hgnc_umls.head(3)

Unnamed: 0,seqname,start,end,strand,symbol,HGNC_ID,CUI_hgnc,CodeID_hgnc
0,chr1,11869,14409,+,DDX11L1,HGNC:37102,C2239334,HGNC HGNC:37102
1,chr1,14404,29570,-,WASH7P,HGNC:38034,C2829144,HGNC HGNC:38034
2,chr1,17369,17436,-,MIR6859-1,HGNC:50039,C3815338,HGNC HGNC:50039


In [23]:
# Turn '+' / '-' into the Term strings '+ strand' / '- strand'.
# Not idempotent: re-running this cell appends ' strand' again.
ucsc_hgnc_umls['strand'] = ucsc_hgnc_umls['strand'].map(lambda sign: sign + ' strand')

# The sequence name column is called 'chrom' from here on.
ucsc_hgnc_umls = ucsc_hgnc_umls.rename(columns={'seqname':'chrom'})

### We need to create a 'GENE_LOCATION' Code node for every HGNC Concept node. Instead of having all these gene location Terms (and chromosome and strand Terms)  attached to the HGNC Code node, we want them to come off of this new 'GENE_LOCATION' Code node

In [24]:
# One GENE_LOCATION code per gene: CODE = 'GL_' + HGNC code, CodeID = 'GL ' + CODE.
# NOTE(review): the CodeID prefix is 'GL' while GL_SAB is 'GENE_LOCATION'; the HGNC CodeIDs
# in this notebook look like '<SAB> <CODE>' ('HGNC HGNC:37102') - confirm the mismatch is intended.
ucsc_hgnc_umls['GL_Code'] = ucsc_hgnc_umls['CodeID_hgnc'].map(
    lambda code_id: 'GL_' + code_id.split(' ')[1])
ucsc_hgnc_umls['GL_SAB'] = 'GENE_LOCATION'
ucsc_hgnc_umls['GL_CodeID'] = 'GL ' + ucsc_hgnc_umls['GL_Code']
ucsc_hgnc_umls.head(3)

Unnamed: 0,chrom,start,end,strand,symbol,HGNC_ID,CUI_hgnc,CodeID_hgnc,GL_Code,GL_SAB,GL_CodeID
0,chr1,11869,14409,+ strand,DDX11L1,HGNC:37102,C2239334,HGNC HGNC:37102,GL_HGNC:37102,GENE_LOCATION,GL GL_HGNC:37102
1,chr1,14404,29570,- strand,WASH7P,HGNC:38034,C2829144,HGNC HGNC:38034,GL_HGNC:38034,GENE_LOCATION,GL GL_HGNC:38034
2,chr1,17369,17436,- strand,MIR6859-1,HGNC:50039,C3815338,HGNC HGNC:50039,GL_HGNC:50039,GENE_LOCATION,GL GL_HGNC:50039


### CUI - CODE (CUI_HGNC - GL_Code)

In [25]:
# CUI -> Code edges linking each HGNC concept (CUI) to its GENE_LOCATION CodeID.
# The rename key must be 'GL_CodeID' (the column actually selected). The original used
# 'GL_Code', which is not in this frame, so the second CSV header stayed 'GL_CodeID'
# instead of ':END_ID'.
CUI_CODEs = ucsc_hgnc_umls[['CUI_hgnc','GL_CodeID']].rename(
                        columns={'CUI_hgnc':':START_ID','GL_CodeID':':END_ID'}).drop_duplicates()

CUI_CODEs.to_csv(output_dir+'hgnc_annos/CUI_CODEs_hgncAnno.csv',index=False)

# CODEs

In [26]:
# One row per GENE_LOCATION code: CodeID, SAB and CODE.
# NOTE(review): the CSV is written with the working column names
# (GL_CodeID, GL_SAB, GL_Code) - confirm the importer expects these headers.
code_columns = ['GL_CodeID','GL_SAB','GL_Code']
CODEs = ucsc_hgnc_umls.loc[:, code_columns].drop_duplicates()

CODEs.to_csv(output_dir+'hgnc_annos/CODEs_hgncAnno.csv', index=False)

### FORMAT CODE-SUIs
Get chromosome SUIs directly from UMLS CSVs

In [27]:
# Candidate chromosome Terms: names starting with 'chromosome' that are 11-14 characters long.
# NOTE: the next cell reassigns chrom_SUIs from a CSV file, so this result is overwritten.
sui_names = UMLS_SUIs['name']
sui_name_length = sui_names.str.len()
starts_with_chromosome = sui_names.str.startswith('chromosome').astype(bool)
chrom_SUIs = UMLS_SUIs.loc[starts_with_chromosome & (sui_name_length < 15) & (sui_name_length > 10)]

# Discard names containing 'q', 'p', 'chromosomes' or 'chromosome g'.
unwanted_name = chrom_SUIs['name'].str.contains('q|p|chromosomes|chromosome g')
chrom_SUIs = chrom_SUIs[~unwanted_name]

# The mitochondrial chromosome does not fit the pattern above; fetch it by exact name.
chrom_SUIs_mito = UMLS_SUIs[UMLS_SUIs['name'] == 'mitochondrial chromosome']

chrom_SUIs = (
    pd.concat([chrom_SUIs, chrom_SUIs_mito])
    .reset_index(drop=True)
    .rename(columns={'name':'chrom'})
)

# MUST get Chrom SUIs from CSVs not from file

In [28]:
#### There are already chromosome Terms in UMLS, import their names/SUIs here and merge.
#### MUST GET THIS DATA FROM THE CSVs and not the graph.
# NOTE(review): absolute, user-specific path, and this replaces the chrom_SUIs computed in
# the previous cell. The file is assumed to provide 'chrom' and 'SUI' columns (both are used below).
chrom_SUIs = pd.read_csv('/Users/stearb/Desktop/R03_local/data/gtex/UMLS_chromosome_SUIs.csv')

# Work on an explicit copy so the column assignments below never touch (or warn about)
# the parent frame.
chrom = ucsc_hgnc_umls[['chrom','GL_CodeID']].copy()

# Reformat our chromosome strings to match the ones from UMLS so we can merge the SUIs in
# ('chr1' -> 'chromosome 1', 'chrX' -> 'chromosome x').
chrom['chrom'] = 'chromosome ' + chrom['chrom'].str.lower().str.split('chr').str[1]
# Assign the result back: a chained `chrom['chrom'].replace(..., inplace=True)` does not
# update the frame under pandas Copy-on-Write.
chrom['chrom'] = chrom['chrom'].replace('chromosome m','mitochondrial chromosome')

# Inner join on the shared column(s); chromosomes with no SUI row are silently dropped.
chrom_terms = pd.merge(chrom,chrom_SUIs).rename(columns={'chrom':'Term'})

chrom_terms['rel'] = 'on_chromosome'

# Row-count check left disabled, as in the original.
#assert len(chrom) == len(chrom_terms)
# Each chromosome Term must map to exactly one SUI.
assert chrom_terms['Term'].nunique() == chrom_terms['SUI'].nunique()

In [29]:
# Positions become Term strings below, so store them as text.
ucsc_hgnc_umls = ucsc_hgnc_umls.astype({'start': str, 'end': str})

In [30]:
# Start-position Terms: the SUI comes from CUIbase64 applied to the position string.
chromstart = ucsc_hgnc_umls[['start','GL_CodeID']]
chromstart = chromstart.assign(SUI=CUIbase64(chromstart['start']),
                               rel='gene_start_position')

# Distinct positions and distinct SUIs must match one-to-one.
assert chromstart['start'].nunique() == chromstart['SUI'].nunique()
chromstart = chromstart.rename(columns={'start':'Term'})

In [31]:
# End-position Terms: the SUI comes from CUIbase64 applied to the position string.
chromend = ucsc_hgnc_umls[['end','GL_CodeID']]
chromend = chromend.assign(SUI=CUIbase64(chromend['end']),
                           rel='gene_end_position')

# Distinct positions and distinct SUIs must match one-to-one.
assert chromend['end'].nunique() == chromend['SUI'].nunique()
chromend = chromend.rename(columns={'end':'Term'})

In [32]:
# Strand Terms ('+ strand' / '- strand'): the SUI comes from CUIbase64 applied to the label.
strand = ucsc_hgnc_umls[['strand','GL_CodeID']]
strand = strand.assign(SUI=CUIbase64(strand['strand']),
                       rel='strand')

# Distinct labels and distinct SUIs must match one-to-one.
assert strand['strand'].nunique() == strand['SUI'].nunique()
strand = strand.rename(columns={'strand':'Term'})

In [33]:
# Stack the four Term tables (chromosome, start, end, strand) into one frame.
CODE_SUIs = pd.concat([chrom_terms, chromstart, chromend, strand], axis=0)

# Across all four tables, every Term must still correspond to exactly one SUI.
assert CODE_SUIs['Term'].nunique() == CODE_SUIs['SUI'].nunique()

# CODE-SUIs

In [34]:
# Attach each GENE_LOCATION code's HGNC CUI to its Term rows, then drop exact duplicate rows.
gl_code_to_cui = ucsc_hgnc_umls[['GL_CodeID','CUI_hgnc']]
CODE_SUIs_merge = CODE_SUIs.merge(gl_code_to_cui, how='inner', on='GL_CodeID').drop_duplicates()
CODE_SUIs_merge.head(3)

Unnamed: 0,Term,GL_CodeID,SUI,rel,CUI_hgnc
0,chromosome 1,GL GL_HGNC:37102,S1744963,on_chromosome,C2239334
1,11869,GL GL_HGNC:37102,MTE4Njk=,gene_start_position,C2239334
2,14409,GL GL_HGNC:37102,MTQ0MDk=,gene_end_position,C2239334


In [35]:
# Switch to the edge-file headers; the SUIs cell below reads these renamed columns.
edge_headers = {'GL_CodeID':':START_ID', 'SUI':':END_ID', 'rel':':TYPE', 'CUI_hgnc':'CUI'}
CODE_SUIs_merge = CODE_SUIs_merge.rename(columns=edge_headers)

# Write only the four edge columns, in this order.
CODE_SUIs_2 = CODE_SUIs_merge.loc[:, [':START_ID',':END_ID','CUI',':TYPE']]

CODE_SUIs_2.to_csv(output_dir+'hgnc_annos/CODE_SUIs_hgncAnnos.csv', index=False)

In [55]:
# Unique (SUI, name) pairs for the Term nodes referenced by the edges.
SUIs = (
    CODE_SUIs_merge[[':END_ID','Term']]
    .rename(columns={':END_ID':'SUI:ID', 'Term':'name'})
    .drop_duplicates()
)

# Chromosome Terms already exist in the UMLS SUIs file (SUI starts with 'S' and the
# name contains 'chrom'), so they are not written again.
already_in_umls = SUIs['SUI:ID'].str.startswith('S') & SUIs['name'].str.contains('chrom')
SUIs = SUIs[~already_in_umls]

SUIs.to_csv(output_dir+'hgnc_annos/SUIs_hgncAnnos.csv', index=False)