In [38]:
import datetime
import itertools
import os
import pickle

import pandas as pd

In [39]:
# HGNC custom-download endpoint, written one query parameter per line so the
# request is easy to audit. Three `col=` fields are requested; the remaining
# parameters filter on status=Approved, order by gd_app_sym_sort, and ask for
# plain-text output.
url = (
    'https://www.genenames.org/cgi-bin/download/custom'
    '?col=gd_app_sym'
    '&col=gd_prev_sym'
    '&col=gd_aliases'
    '&status=Approved'
    '&hgnc_dbtag=on'
    '&order_by=gd_app_sym_sort'
    '&format=text'
    '&submit=submit'
)

In [40]:
# Names of the two columns that carry alternative symbols.
# NOTE(review): `cols` is not referenced by any other visible cell; it is kept
# so that code relying on the name keeps working.
cols = ['Previous symbols', 'Alias symbols']

# Download the tab-separated HGNC table straight from `url`.
# Every field is read as a string and only truly empty cells are treated as
# missing: with pandas' default NA handling, a symbol spelled like one of its
# NA tokens (e.g. "NA", "NULL", "None", "nan") would silently become NaN and
# be dropped by the isinstance(..., str) checks further down.
hgnc = pd.read_csv(
    url,
    sep='\t',
    dtype=str,
    keep_default_na=False,
    na_values=[''],
)

In [41]:
# Index the table by approved symbol so that iterrows() yields
# (approved symbol, remaining columns) pairs.
# The guard keeps this cell re-runnable: once the column has been moved into
# the index, a second set_index call would raise KeyError.
if hgnc.index.name != 'Approved symbol':
    hgnc = hgnc.set_index('Approved symbol')

In [49]:
# Build {approved symbol: [raw previous/alias symbols]} from the HGNC table.
k_symbol_v_aliases = {}
for gene, aliases in hgnc.iterrows():
    # Skip index values that are not usable symbols: non-strings (presumably
    # NaN from blank cells) and blank strings, which previously raised
    # IndexError on gene[0].
    if not isinstance(gene, str) or not gene.strip():
        continue
    gene = gene.strip()
    # Each string cell in the row is a comma-separated list of symbols;
    # non-string cells (missing values) contribute nothing.
    alias_strings = list(
        itertools.chain.from_iterable(
            cell.split(',') for cell in aliases if isinstance(cell, str)
        )
    )
    # A gene with no recorded alternative symbols maps to itself.
    k_symbol_v_aliases[gene] = alias_strings or [gene]

# Normalise the alias lists: trim surrounding whitespace left over from the
# ", " separators and drop empty fragments (e.g. from a doubled or trailing
# comma), which previously raised IndexError on x[0].
k_symbol_v_aliases_clean = {}
for gene, aliases in k_symbol_v_aliases.items():
    stripped = [alias.strip() for alias in aliases]
    non_empty = [alias for alias in stripped if alias]
    # Keep the "maps to itself" fallback if nothing survives cleaning.
    k_symbol_v_aliases_clean[gene] = non_empty or [gene]

# Persist the cleaned mapping; the context manager guarantees the file is
# flushed and closed.
with open('k_approvedSymbol_v_aliases.pkl', 'wb') as handle:
    pickle.dump(k_symbol_v_aliases_clean, handle)

In [53]:
# Invert the cleaned mapping into {any symbol: set of approved symbols}.
# A symbol can point at several approved symbols, hence the set values.
k_alias_v_official = {}
for gene, aliases in k_symbol_v_aliases_clean.items():
    # Every approved symbol resolves to itself.
    k_alias_v_official.setdefault(gene, set()).add(gene)
    for alias in aliases:
        k_alias_v_official.setdefault(alias, set()).add(gene)

# Convert the sets to sorted lists so the pickled output is identical from
# run to run (plain list(set) ordering varies with string hash randomisation).
final_k_alias_v_official = {
    symbol: sorted(official_symbols)
    for symbol, official_symbols in k_alias_v_official.items()
}

# Persist the lookup; the context manager guarantees the file is closed.
with open('k_alias_v_approvedSymbol.pkl', 'wb') as handle:
    pickle.dump(final_k_alias_v_official, handle)

In [62]:
# Record provenance for the pickles written above: when the HGNC table was
# fetched (local time) and how many symbols each mapping contains.
now = datetime.datetime.now()
readme_dict = {
    'date_accessed': now.strftime("%b %d %Y %H:%M:%S"),
    'num_approved_symbols': len(k_symbol_v_aliases),
    'num_all_symbols': len(final_k_alias_v_official),
}

# The README goes into an HGNC/ subdirectory, unlike the other pickles, which
# are written to the working directory. Create it first so a fresh checkout
# does not fail with FileNotFoundError.
os.makedirs('HGNC', exist_ok=True)
with open('HGNC/README.pkl', 'wb') as handle:
    pickle.dump(readme_dict, handle)

**Background**

Seurat is an R toolkit for single-cell genomics.

- Project home page: https://satijalab.org/seurat/
- `UpdateSymbolList` documentation: https://www.rdocumentation.org/packages/Seurat/versions/4.0.0/topics/UpdateSymbolList

From the `UpdateSymbolList` documentation:

> "For each symbol passed, we query the HGNC gene names database for current symbols that have the provided symbol as either an alias (alias_symbol) or old (prev_symbol) symbol. All other queries are not supported."

Stuart*, Butler*, et al., Cell 2019 [Seurat V3]