In [82]:
import csv
import datetime
import json
import os
import pickle

import pandas as pd

In [42]:
# Load the HGNC lookup tables produced earlier in the pipeline.
# NOTE: pickle can execute arbitrary code while loading; only read files
# generated by this project, never files from an untrusted source.
# `with` guarantees the file handles are closed once loading finishes.
with open('HGNC/k_alias_v_approvedSymbol.pkl', 'rb') as fp:
    # alias -> list of approved symbols (an alias can map to several symbols)
    k_alias_v_approvedSymbol = pickle.load(fp)
with open('HGNC/README.pkl', 'rb') as fp:
    # metadata about the HGNC download; 'date_accessed' is read later on
    hgnc_readme = pickle.load(fp)

**NOTE**

The tricky case is an alias that maps to multiple approved gene symbols (e.g. `ASP`):

ASP: ['ASIP', 'ATG5', 'TMPRSS11D', 'ROPN1L', 'ASPM', 'A1CF', 'ASPA']

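For now, I remove these aliases from the lookup, so genes that use them are left unchanged and show up as "not found" in the changelog for manual review. There are 2018 aliases for which this is the case.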

Would it make sense to just choose one of the approved symbols?

In [20]:
# An alias whose value lists more than one approved symbol is ambiguous:
# there is no way to pick the right symbol automatically.
issue_aliases = [
    alias
    for alias, approved_symbols in k_alias_v_approvedSymbol.items()
    if len(approved_symbols) > 1
]
print(f"There are {len(issue_aliases)} aliases mapped to multiple approved symbols")

print(f"Aliases before removal : {len(k_alias_v_approvedSymbol)}")
# Drop the ambiguous aliases from the lookup. A plain loop is used because the
# removal is a side effect; a list comprehension would only build a throwaway
# list of the popped values.
for alias in issue_aliases:
    del k_alias_v_approvedSymbol[alias]
print(f"Dictionary after removal of keys : {len(k_alias_v_approvedSymbol)}")

There are 2018 aliases mapped to multiple approved symbols
Aliases before removal : 97266
Dictionary after removal of keys : 95248


**MAIN FUNCTION**

In [109]:
def resolve_genes(db):
    """Map the genes of every pathway in ``db`` to HGNC approved symbols.

    Reads ``{db}/k_pw_v_genes_unmapped.pkl`` (pathway -> list of gene names)
    and ``{db}/README.pkl`` (must contain ``'date_accessed'``), and relies on
    the notebook-level ``k_alias_v_approvedSymbol`` and ``hgnc_readme``.

    A gene is replaced only when the lookup gives exactly one approved symbol
    for it. Genes that are missing from the lookup, or whose entry is empty or
    ambiguous (several approved symbols), are kept unchanged and listed under
    ``not_found`` so they can be reviewed manually.

    Files written to ``gene_mapped/{db}/`` (the folder is created if needed):

    * ``README.json`` / ``README.pkl`` -- provenance of this mapping run
    * ``dict.pkl``      -- pathway -> list of mapped gene symbols
    * ``changelog.tsv`` -- per-pathway counts and details of changed and
      unresolved genes
    * ``{db}.gmt``      -- tab-separated: pathway, source db, gene symbols

    Parameters
    ----------
    db : str
        Name of the pathway database; used as input folder name, output
        folder name and as the value of the ``Source`` column.

    Returns
    -------
    pandas.DataFrame
        One row per pathway (index), a leading ``Source`` column holding
        ``db``, then one column per gene position.
    """
    # NOTE: pickle can execute arbitrary code while loading; only read files
    # generated by this project.
    with open(f'{db}/k_pw_v_genes_unmapped.pkl', 'rb') as fp:
        k_pw_v_genes = pickle.load(fp)
    with open(f'{db}/README.pkl', 'rb') as fp:
        readme = pickle.load(fp)

    readme_logging = {
        'Pathway_db': db,
        'date_accessed': readme['date_accessed'],
        'Date_gene_mapped': datetime.datetime.now().strftime("%b %d %Y %H:%M:%S"),
        'HGNC_date_accessed': hgnc_readme['date_accessed'],
    }
    pathway_change_tracking = {}
    k_pw_v_approvedSymbols = {}

    for pw, genes in k_pw_v_genes.items():
        mapped = []
        not_found = []
        changed = {}
        for gene in genes:
            approved = k_alias_v_approvedSymbol.get(gene)
            # Only an unambiguous, single-symbol entry is trusted. This keeps
            # the result independent of whether the ambiguous aliases were
            # already removed from the lookup, and avoids an IndexError on an
            # empty entry.
            if not approved or len(approved) != 1:
                not_found.append(gene)
                mapped.append(gene)
                continue
            approved_symbol = approved[0]
            mapped.append(approved_symbol)
            if approved_symbol != gene:
                changed[gene] = approved_symbol
        # NOTE(review): two input genes can resolve to the same approved
        # symbol, so `mapped` may contain duplicates -- confirm whether
        # downstream consumers expect de-duplicated gene sets.
        k_pw_v_approvedSymbols[pw] = mapped
        pathway_change_tracking[pw] = {
            'len_changed': len(changed),
            'len_not_found': len(not_found),
            'not_found': not_found,
            'changed': changed,
        }

    # Make sure the output folder exists so a fresh checkout does not fail.
    out_dir = f'gene_mapped/{db}'
    os.makedirs(out_dir, exist_ok=True)

    with open(f'{out_dir}/README.json', 'w') as fp:
        json.dump(readme_logging, fp)
    with open(f'{out_dir}/README.pkl', 'wb') as fp:
        pickle.dump(readme_logging, fp)
    with open(f'{out_dir}/dict.pkl', 'wb') as fp:
        pickle.dump(k_pw_v_approvedSymbols, fp)

    # Pathways become rows after the transpose.
    pd.DataFrame.from_dict(pathway_change_tracking).T.to_csv(f'{out_dir}/changelog.tsv', sep='\t')

    # Wrapping each list in a Series lets pandas align lists of different
    # lengths; shorter pathways are padded with NaN, which to_csv writes as
    # empty trailing fields.
    pathway_df = pd.DataFrame({pw: pd.Series(symbols) for pw, symbols in k_pw_v_approvedSymbols.items()}).T
    pathway_df.insert(0, 'Source', db)
    pathway_df.to_csv(f'{out_dir}/{db}.gmt', sep='\t', header=False)
    return pathway_df

In [111]:
# Pathway databases to process. resolve_genes reads its inputs from a folder
# named after each entry and writes the results under gene_mapped/<name>/.
dbs = [
    'Hallmark',
    'KEGG',
    'Reactome',
]
for db in dbs:
    # The returned DataFrame is not needed here; only the written files are.
    resolve_genes(db)
