In [2]:
import csv
import datetime
import json
import os
import pickle

import pandas as pd

In [3]:
# Load the HGNC lookup produced upstream. The JSON carries two sections:
#   'data'       -> {alias or symbol: [approved symbol, ...]}
#   'access_log' -> provenance metadata (read later for 'date_accessed')
# A context manager is used so the file handle is closed deterministically
# instead of being left for the garbage collector.
with open('HGNC/results.json') as hgnc_file:
    hgnc = json.load(hgnc_file)
k_alias_v_approvedSymbol = hgnc['data']
hgnc_readme = hgnc['access_log']

In [4]:
# Inspect the alias -> approved-symbol lookup; as the cell's last (bare)
# expression, the notebook renders its repr as the cell output.
k_alias_v_approvedSymbol

{'A1BG': ['A1BG'],
 'A1BG-AS1': ['A1BG-AS1'],
 'NCRNA00181': ['A1BG-AS1'],
 'A1BGAS': ['A1BG-AS1'],
 'A1BG-AS': ['A1BG-AS1'],
 'FLJ23569': ['A1BG-AS1'],
 'A1CF': ['A1CF'],
 'ACF': ['A1CF'],
 'ASP': ['TMPRSS11D', 'ASPA', 'ASPM', 'ATG5', 'ASIP', 'ROPN1L', 'A1CF'],
 'ACF64': ['A1CF'],
 'ACF65': ['A1CF'],
 'APOBEC1CF': ['A1CF'],
 'A2M': ['A2M'],
 'FWP007': ['A2M'],
 'S863-7': ['A2M'],
 'CPAMD5': ['A2M'],
 'A2M-AS1': ['A2M-AS1'],
 'A2ML1': ['A2ML1'],
 'CPAMD9': ['A2ML1'],
 'FLJ25179': ['A2ML1'],
 'p170': ['A2ML1'],
 'A2ML1-AS1': ['A2ML1-AS1'],
 'A2ML1-AS2': ['A2ML1-AS2'],
 'A2MP1': ['A2MP1'],
 'A2MP': ['A2MP1'],
 'A3GALT2': ['A3GALT2'],
 'A3GALT2P': ['A3GALT2'],
 'IGBS3S': ['A3GALT2'],
 'IGB3S': ['A3GALT2'],
 'A4GALT': ['A4GALT'],
 'P1': ['PRF1', 'A4GALT', 'B3GALNT1'],
 'A14GALT': ['A4GALT'],
 'Gb3S': ['A4GALT'],
 'P(k)': ['A4GALT'],
 'A4GNT': ['A4GNT'],
 'alpha4GnT': ['A4GNT'],
 'AAAS': ['AAAS'],
 'AACS': ['AACS'],
 'FLJ12389': ['AACS'],
 'SUR-5': ['AACS'],
 'ACSF1': ['AACS'],
 'AACSP1': [

**NOTE**

The tricky case is an alias that maps to more than one approved gene symbol (e.g. `ASP`):

ASP: ['ASIP', 'ATG5', 'TMPRSS11D', 'ROPN1L', 'ASPM', 'A1CF', 'ASPA']

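For now, these ambiguous aliases are dropped from the lookup table, so any pathway gene that matches one is left unchanged and recorded as `not_found` in the changelog for manual review. The cell below reports 1343 such aliases; this note previously said 2018 — TODO confirm which count is current.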

Would it make sense to just choose one of the approved symbols?

In [5]:
# Aliases that resolve to more than one approved symbol are ambiguous, so they
# are removed from the lookup table. NOTE: this mutates
# `k_alias_v_approvedSymbol` in place; re-running the cell without reloading
# the HGNC data will report 0 ambiguous aliases.
issue_aliases = [
    alias
    for alias, approved_symbols in k_alias_v_approvedSymbol.items()
    if len(approved_symbols) > 1
]
print(f"There are {len(issue_aliases)} aliases mapped to multiple approved symbols")

print(f"Aliases before removal : {len(k_alias_v_approvedSymbol)}")
# A plain loop, not a comprehension: the removal is a side effect and the
# popped values are not needed, so no throwaway list should be built.
for alias in issue_aliases:
    del k_alias_v_approvedSymbol[alias]
print(f"Dictionary after removal of keys : {len(k_alias_v_approvedSymbol)}")

There are 1343 aliases mapped to multiple approved symbols
Aliases before removal : 97268
Dictionary after removal of keys : 95925


**MAIN FUNCTION**

In [6]:
def resolve_genes(db, output_dir='gene_mapped', alias_map=None, hgnc_access_log=None):
    """Map every gene of a pathway database to its HGNC approved symbol.

    Reads ``{db}/results.json``, which must contain ``data`` (pathway name ->
    list of gene symbols) and ``access_log`` (with a ``date_accessed`` entry),
    and writes three files into ``{output_dir}/{db}/`` (created if missing):

    * ``access_log.json`` -- provenance: database name, both access dates and
      the time of this mapping run.
    * ``changelog.tsv``   -- one row per pathway with the genes that were
      renamed (``changed``) and the genes absent from the alias table
      (``not_found``), plus their counts.
    * ``{db}.gmt``        -- tab-separated, no header row: pathway name,
      ``db`` as the source column, then the resolved genes. Shorter pathways
      are padded with empty fields.

    Genes that are not in the alias table are kept unchanged.

    Parameters
    ----------
    db : str
        Name of the pathway database; used both as the input directory and
        as the output sub-directory / file stem.
    output_dir : str, default 'gene_mapped'
        Root directory for all output files.
    alias_map : dict, optional
        Lookup ``{alias: [approved symbol, ...]}``. Defaults to the
        notebook-level ``k_alias_v_approvedSymbol``.
    hgnc_access_log : dict, optional
        HGNC provenance record with a ``date_accessed`` entry. Defaults to
        the notebook-level ``hgnc_readme``.

    Returns
    -------
    pandas.DataFrame
        One row per pathway: a ``Source`` column followed by the resolved
        genes in integer-labelled columns.
    """
    # Fall back to the notebook globals so existing calls keep working.
    if alias_map is None:
        alias_map = k_alias_v_approvedSymbol
    if hgnc_access_log is None:
        hgnc_access_log = hgnc_readme

    with open(f'{db}/results.json') as fp:
        db_results = json.load(fp)
    k_pw_v_genes = db_results['data']
    readme = db_results['access_log']
    readme_logging = {
        'Pathway_db': db,
        'db_date_accessed': readme['date_accessed'],
        'HGNC_date_accessed': hgnc_access_log['date_accessed'],
        'Date_gene_mapped': datetime.datetime.now().strftime("%b %d %Y %H:%M:%S"),
    }

    pathway_change_tracking = {}
    k_pw_v_approvedSymbols = {}
    for pw, genes in k_pw_v_genes.items():
        resolved = []
        not_found = []
        changed = {}
        for gene in genes:
            approved_symbols = alias_map.get(gene)
            if not approved_symbols:
                # Unknown gene (or an entry with no approved symbol): keep the
                # original name and report it for manual review.
                not_found.append(gene)
                resolved.append(gene)
                continue
            approved_symbol = approved_symbols[0]
            resolved.append(approved_symbol)
            if approved_symbol != gene:
                changed[gene] = approved_symbol
        k_pw_v_approvedSymbols[pw] = resolved
        pathway_change_tracking[pw] = {
            'len_changed': len(changed),
            'len_not_found': len(not_found),
            'not_found': not_found,
            'changed': changed,
        }

    # All three outputs go to the same directory; the changelog used to be
    # written under a hard-coded 'gene_mapped/' regardless of `output_dir`.
    db_output_dir = f'{output_dir}/{db}'
    os.makedirs(db_output_dir, exist_ok=True)

    with open(f'{db_output_dir}/access_log.json', 'w') as fp:
        json.dump(readme_logging, fp)

    changelog = pd.DataFrame.from_dict(pathway_change_tracking).T
    changelog.to_csv(f'{db_output_dir}/changelog.tsv', sep='\t')

    # Wrapping each gene list in a Series lets pandas align lists of unequal
    # length (missing positions become NaN); transposing gives one row per
    # pathway.
    pathway_df = pd.DataFrame(
        {pw: pd.Series(symbols) for pw, symbols in k_pw_v_approvedSymbols.items()}
    ).T
    pathway_df.insert(0, 'Source', db)
    pathway_df.to_csv(f'{db_output_dir}/{db}.gmt', sep='\t', header=False)
    print(f'finished: {db} -- {pathway_df.shape[0]} pathways')
    return pathway_df

In [22]:
# Run the gene-symbol resolution for each of these pathway databases in turn;
# each call writes its own output files and prints a one-line summary.
dbs = ['Hallmark', 'KEGG', 'Reactome']
for db in dbs:
    resolve_genes(db)


finished: Hallmark -- 50 pathways
finished: KEGG -- 322 pathways
finished: Reactome -- 2444 pathways


In [8]:
# Resolve the PanCancer gene sets; as the cell's last expression, the returned
# DataFrame is rendered as the cell output.
resolve_genes('PanCancer')

finished: PanCancer -- 17 pathways


Unnamed: 0,Source,0,1,2,3,4,5,6,7,8,...,224,225,226,227,228,229,230,231,232,233
CORE MITOTIC DDR,PanCancer,ADPRS,APEX1,APEX2,CENPS,APLF,APTX,ATAD5,ATM,ATMIN,...,XRCC1,XRCC2,XRCC3,XRCC4,XRCC5,XRCC6,ZGRF1,ZNF451,ZRANB3,ZSWIM7
Direct_repair,PanCancer,ALKBH2,ALKBH3,MGMT,ASCC3,,,,,,...,,,,,,,,,,
Telomere,PanCancer,ACD,CTC1,DCLRE1B,EXO1,OBFC1/STN1,OBFC2B/NABP2,POT1,RTEL1,TEN1,...,,,,,,,,,,
PARP/SSBR/BER,PanCancer,ADPRS,APEX1,APEX2,APLF,APTX,CHD1L,HPF1,LIG1,LIG3,...,,,,,,,,,,
NHEJ,PanCancer,APLF,APTX,ATMIN,C7orf49/MRI/CYREN,C9orf142/PAXX,DCLRE1C,DNTT,DYNLL1,ERCC6L2,...,,,,,,,,,,
MMEJ,PanCancer,POLQ,PARP1,LIG3,,,,,,,...,,,,,,,,,,
NER,PanCancer,CUL4B,CUL4A,DDB1,DDB2,ELOF1,ERCC1,ERCC2,ERCC3,ERCC4,...,,,,,,,,,,
ICL/FA,PanCancer,APITD1/CENPS,CENPX,DCLRE1A,EME1,EME2,ERCC1,ERCC4,FAAP100,FAAP20,...,,,,,,,,,,
HR,PanCancer,AUNIP,BARD1,BRCA2,BRCA1,BLM,BRIP1,RADX,DNA2,EID3,...,,,,,,,,,,
Fork QC/TLS,PanCancer,APEX2,CENPS,ATAD5,ATR,ATRIP,CENPX,CHEK1,CLSPN,DDX11,...,,,,,,,,,,


In [9]:
"""
transform pancancer
"""

# Keep the resolved PanCancer table in `pan_cancer` so a later cell can export it.
pan_cancer = resolve_genes('PanCancer')

finished: PanCancer -- 17 pathways


In [13]:
# Export the resolved PanCancer gene sets with one column per pathway: drop the
# constant 'Source' column, transpose so pathways become columns, and blank out
# the NaN padding left by pathways with fewer genes.
# NOTE(review): the destination is an absolute, machine-specific path --
# consider making it relative to the notebook directory.
pan_cancer_csv_path = '/Users/anna/Jupyter/pathway_db_gene_resolution/PanCancer/2021_0211_Pan Cancer DDR gene sets_genemapped215.csv'
pan_cancer_by_pathway = pan_cancer.drop(columns='Source').T
pan_cancer_by_pathway = pan_cancer_by_pathway.fillna('')
pan_cancer_by_pathway.to_csv(pan_cancer_csv_path)