In [5]:
import os
import gzip
import re
import json

import pandas

In [None]:
# Download human Entrez Gene information (the commands below are left commented out)
# url = 'ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz'
# ! wget --timestamping --directory-prefix download/ $url
# Provenance: May 03, 2017

In [4]:
# Naive preview of the raw download. Parsed this way the header keeps its
# leading '#', so the first column is named '#tax_id'.
path = os.path.join('download', 'Homo_sapiens.gene_info.gz')

# Open through a context manager so the gzip handle is closed once parsing is
# done instead of staying open for the rest of the kernel session.
with gzip.open(path, 'rt') as read_file:
    jj = pandas.read_table(read_file)
jj.head(2)

Unnamed: 0,#tax_id,GeneID,Symbol,LocusTag,Synonyms,dbXrefs,chromosome,map_location,description,type_of_gene,Symbol_from_nomenclature_authority,Full_name_from_nomenclature_authority,Nomenclature_status,Other_designations,Modification_date
0,9606,1,A1BG,-,A1B|ABG|GAB|HYST2477,MIM:138670|HGNC:HGNC:5|Ensembl:ENSG00000121410...,19,19q13.43,alpha-1-B glycoprotein,protein-coding,A1BG,alpha-1-B glycoprotein,O,alpha-1B-glycoprotein|HEL-S-163pA|epididymis s...,20170422
1,9606,2,A2M,-,A2MD|CPAMD5|FWP007|S863-7,MIM:103950|HGNC:HGNC:7|Ensembl:ENSG00000175899...,12,12p13.31,alpha-2-macroglobulin,protein-coding,A2M,alpha-2-macroglobulin,O,alpha-2-macroglobulin|C3 and PZP-like alpha-2-...,20170423


In [17]:
# Read Entrez info dataset
path = os.path.join('download', 'Homo_sapiens.gene_info.gz')

with gzip.open(path, 'rt') as read_file:
    # The first line is the header: a '#' followed by the tab-separated column
    # names. Consume it by hand so the column names do not carry the '#'.
    header = next(read_file, '')
    matches = re.match(r'#(.+)', header)
    if matches is None:
        # Fail with a readable message rather than an AttributeError on None
        raise ValueError('Unexpected header line in {}: {!r}'.format(path, header))
    columns = matches.group(1).split('\t')
    # Only '-' (and an empty field) marks a missing value. The pandas default
    # sentinels are switched off so that literal text such as 'NA' or 'NULL'
    # in a symbol or synonym field is kept as text instead of becoming NaN.
    gene_df = pandas.read_table(
        read_file,
        names=columns,
        na_values=['-', ''],
        keep_default_na=False,
    )

# Restrict to homo sapiens
gene_df = gene_df.query('tax_id == 9606')

len(gene_df)

60304

In [19]:
# extract symbols and xrefs
xref_rows = list()
symbol_rows = list()

# Walk the four needed columns in lockstep instead of using iterrows(), which
# would construct a full Series for every gene.
gene_records = zip(
    gene_df['GeneID'],
    gene_df['Symbol'],
    gene_df['Synonyms'],
    gene_df['dbXrefs'],
)
for gene_id, symbol, synonyms, dbXrefs in gene_records:

    # symbols
    if pandas.notnull(symbol):
        symbol_rows.append((gene_id, 'symbol', symbol))

    # synonyms: several values packed into one field, separated by '|'
    if pandas.notnull(synonyms):
        for synonym in synonyms.split('|'):
            symbol_rows.append((gene_id, 'synonym', synonym))

    # xrefs: '|'-separated 'resource:identifier' pairs. Split on the first
    # colon only, since identifiers may contain colons (e.g. 'HGNC:HGNC:5').
    if pandas.notnull(dbXrefs):
        for xref in dbXrefs.split('|'):
            db, ref = xref.split(':', 1)
            xref_rows.append((gene_id, db, ref))

# Make sure the output directory exists before anything is written to it
os.makedirs('data', exist_ok=True)

xref_df = pandas.DataFrame(xref_rows, columns=['GeneID', 'resource', 'identifier'])
xref_df.to_csv('data/xrefs-human.tsv', sep='\t', index=False)

symbol_df = pandas.DataFrame(symbol_rows, columns=['GeneID', 'type', 'symbol'])
symbol_df.to_csv('data/symbols-human.tsv', sep='\t', index=False)

In [20]:
# Preview the first five cross-reference rows (GeneID, resource, identifier)
xref_df.head(n=5)

Unnamed: 0,GeneID,resource,identifier
0,1,MIM,138670
1,1,HGNC,HGNC:5
2,1,Ensembl,ENSG00000121410
3,1,Vega,OTTHUMG00000183507
4,2,MIM,103950


In [21]:
# Preview the first five symbol/synonym rows (GeneID, type, symbol)
symbol_df.head(n=5)

Unnamed: 0,GeneID,type,symbol
0,1,symbol,A1BG
1,1,synonym,A1B
2,1,synonym,ABG
3,1,synonym,GAB
4,1,synonym,HYST2477


In [22]:
# Export a trimmed gene table limited to a fixed set of descriptive columns
columns = [
    'tax_id',
    'GeneID',
    'Symbol',
    'chromosome',
    'map_location',
    'type_of_gene',
    'description',
]
select_df = gene_df[columns]
select_df.to_csv(
    'data/genes-human.tsv',
    index=False,
    sep='\t',
)
select_df.head()

Unnamed: 0,tax_id,GeneID,Symbol,chromosome,map_location,type_of_gene,description
0,9606,1,A1BG,19,19q13.43,protein-coding,alpha-1-B glycoprotein
1,9606,2,A2M,12,12p13.31,protein-coding,alpha-2-macroglobulin
2,9606,3,A2MP1,12,12p13.31,pseudo,alpha-2-macroglobulin pseudogene 1
3,9606,9,NAT1,8,8p22,protein-coding,N-acetyltransferase 1
4,9606,10,NAT2,8,8p22,protein-coding,N-acetyltransferase 2


In [23]:
# Compute dictionaries of symbols to GeneIDs and save as jsons
#
# The symbol table is parsed strictly as text. With pandas' default NA
# handling, symbols spelled like 'NA', 'NULL' or 'nan' are read back as missing
# values and would be silently left out of both mappings.
symbol_df = pandas.read_table(
    'data/symbols-human.tsv',
    dtype={'GeneID': int, 'type': str, 'symbol': str},
    keep_default_na=False,
)

symbol_to_id = dict()      # approved symbol -> GeneID (a later row overwrites an earlier one)
synonym_to_ids = dict()    # synonym -> list of every GeneID that uses it
symbol_records = zip(symbol_df['GeneID'], symbol_df['type'], symbol_df['symbol'])
for gene_id, symbol_type, symbol in symbol_records:
    if not symbol:
        # an empty field carries no symbol to map
        continue
    if symbol_type == 'symbol':
        symbol_to_id[symbol] = gene_id
    elif symbol_type == 'synonym':
        synonym_to_ids.setdefault(symbol, list()).append(gene_id)

with open('data/symbols-human.json', 'w') as write_file:
    json.dump(symbol_to_id, write_file, indent=2, sort_keys=True)
with open('data/synonyms-human.json', 'w') as write_file:
    json.dump(synonym_to_ids, write_file, indent=2, sort_keys=True)

### Create a symbol to ID mapping that includes both approved symbols and synonyms

In [24]:
# Keep only the synonyms that resolve to exactly one GeneID
symbol_map = dict(
    (synonym_name, gene_ids[0])
    for synonym_name, gene_ids in synonym_to_ids.items()
    if len(gene_ids) == 1
)

# Approved symbols win whenever a name is both a symbol and a synonym
symbol_map.update(symbol_to_id)

with open('data/symbol-map.json', 'w') as write_file:
    json.dump(
        symbol_map,
        write_file,
        sort_keys=True,
        indent=2,
    )

In [31]:
# Spot check: show the full gene_df record whose Symbol is DUX4L1
gene_df.query(expr='Symbol == "DUX4L1"')

Unnamed: 0,tax_id,GeneID,Symbol,LocusTag,Synonyms,dbXrefs,chromosome,map_location,description,type_of_gene,Symbol_from_nomenclature_authority,Full_name_from_nomenclature_authority,Nomenclature_status,Other_designations,Modification_date
9159,9606,22947,DUX4L1,,DUX10|DUX4,HGNC:HGNC:3082,4,4q35.2,double homeobox 4 like 1,pseudo,DUX4L1,double homeobox 4 like 1,O,double homeobox protein 10|double homeobox pro...,20170408


In [34]:
# Spot check: show the same DUX4L1 record in the reduced select_df table
select_df.query(expr='Symbol == "DUX4L1"')

Unnamed: 0,tax_id,GeneID,Symbol,chromosome,map_location,type_of_gene,description
9159,9606,22947,DUX4L1,4,4q35.2,pseudo,double homeobox 4 like 1
