In [12]:
import datetime
import itertools
import json
import pickle

import pandas as pd

In [2]:
# HGNC custom-download endpoint. The query string requests three columns
# (gd_app_sym, gd_prev_sym, gd_aliases), restricted to status=Approved,
# ordered by gd_app_sym_sort and returned as text.
url = 'https://www.genenames.org/cgi-bin/download/custom?col=gd_app_sym&col=gd_prev_sym&col=gd_aliases&status=Approved&hgnc_dbtag=on&order_by=gd_app_sym_sort&format=text&submit=submit'

In [3]:
import pandas as pd  # NOTE(review): pandas is already imported in the first cell
# Presumably the headers of the previous-symbol and alias columns in the
# download; this list is not referenced again in the visible cells.
cols = ['Previous symbols', 'Alias symbols']
# Fetch the HGNC export from `url` and parse it as tab-separated text.
hgnc = pd.read_csv(url, sep='\t')

In [4]:
# Index the table by approved symbol so that iterating rows yields
# (approved symbol, remaining columns).
# The guard keeps this cell re-runnable: once the column has been moved into
# the index, a second in-place set_index call would raise KeyError.
if hgnc.index.name != 'Approved symbol':
    hgnc.set_index('Approved symbol', inplace=True)

In [5]:
# Build two mappings keyed by approved symbol:
#   k_symbol_v_aliases        -> raw comma-split fragments from the row's cells
#   k_symbol_v_aliases_clean  -> the same fragments with whitespace trimmed
# A symbol with no usable fragments maps to a one-element list holding itself.
k_symbol_v_aliases = {}
k_symbol_v_aliases_clean = {}
approved_symbols = set(hgnc.index)
for gene, aliases in hgnc.iterrows():
    # Index values that are not strings (e.g. NaN) cannot be used as symbols.
    if not isinstance(gene, str):
        continue
    gene = gene.strip()
    if not gene:
        # Guard against a blank symbol instead of failing on gene[0].
        continue

    # Each string cell holds a comma-separated list; non-string cells
    # (missing values) are skipped.
    alias_strings = list(
        itertools.chain.from_iterable(
            cell.split(',') for cell in aliases if isinstance(cell, str)
        )
    )
    k_symbol_v_aliases[gene] = alias_strings or [gene]

    # Trim surrounding whitespace and drop empty fragments left by stray
    # commas, which would otherwise raise IndexError on x[0].
    trimmed = [fragment.strip() for fragment in alias_strings]
    cleaned = [fragment for fragment in trimmed if fragment]
    k_symbol_v_aliases_clean[gene] = cleaned or [gene]

# Persist the cleaned mapping; the context manager closes the file handle
# even if pickling fails.
with open('k_approvedSymbol_v_aliases.pkl', 'wb') as pkl_file:
    pickle.dump(k_symbol_v_aliases_clean, pkl_file)

In [9]:
# Invert the approved-symbol -> aliases mapping into
# symbol -> set of approved symbols it may refer to.
k_alias_v_official = {}
for gene, aliases in k_symbol_v_aliases_clean.items():
    # Every approved symbol maps to itself.
    k_alias_v_official.setdefault(gene, set()).add(gene)
    for a in aliases:
        if a in approved_symbols:
            # An alias that is itself an approved symbol resolves only to
            # itself, overriding anything collected under that key.
            k_alias_v_official[a] = {a}
        else:
            k_alias_v_official.setdefault(a, set()).add(gene)

# Sort each set so the exported lists have a stable order; list(set) order
# can differ between kernel sessions, which makes the output irreproducible.
final_k_alias_v_official = {
    symbol: sorted(officials) for symbol, officials in k_alias_v_official.items()
}

In [13]:
# Record when the export was produced and how many symbols it contains,
# then write the log together with the alias lookup to results.json.
now = datetime.datetime.now()  # naive local time; no timezone is recorded
access_log = {
    'date_accessed': now.strftime("%b %d %Y %H:%M:%S"),
    'num_approved_symbols': len(k_symbol_v_aliases),
    'num_all_symbols': len(final_k_alias_v_official),
}

results = {
    'access_log': access_log,
    'data': final_k_alias_v_official
}

# Plain string literal: the original f-string had no placeholders.
with open('results.json', 'w') as fp:
    json.dump(results, fp)

In [14]:
# Show the access log (timestamp and symbol counts) as the cell output.
results['access_log']

{'date_accessed': 'Feb 11 2021 10:54:45',
 'num_approved_symbols': 42423,
 'num_all_symbols': 97268}

**Background**

Seurat: an R toolkit for single-cell genomics

- https://satijalab.org/seurat/
- https://www.rdocumentation.org/packages/Seurat/versions/4.0.0/topics/UpdateSymbolList

"For each symbol passed, we query the HGNC gene names database for current symbols that have the provided symbol as either an alias (alias_symbol) or old (prev_symbol) symbol. All other queries are not supported."

Stuart*, Butler*, et al., Cell 2019 [Seurat V3]