# Notebook #2

Jeffrey Sutherland
4/19/2021

In [1]:
import pandas as pd
import pickle
from collections import defaultdict

In [2]:
# Data directories, relative to the folder this notebook is run from
PATH_DATA_INPUT = "../data/input/"                # raw input tables
PATH_DATA_INTERMEDIATE = "../data/intermediate/"  # pickled intermediate artefacts
PATH_DATA_OUTPUT = "../data/output/"              # summary tables
PATH_MODELS = "../data/models/"

# Multiplier applied to each FAERS row's own llr_threshold: a drug-AE pair is
# treated as a FAERS positive when its llr is at least llr_thres * llr_threshold
llr_thres = 2

In [3]:
# SIDER drug / adverse-event pairs, already mapped to DrugCentral structures
# (struct_id) and MedDRA preferred terms (meddra_PT)
sider_file = PATH_DATA_INPUT + 'final_sider_map_to_drugcentral_meddra.txt'
sider = pd.read_csv(sider_file, sep='\t')
sider.head(5)


Unnamed: 0,struct_id,drugcentral_name,meddra_PT,UniqueConcatenate(match_type),UniqueConcatenate(stereo_id),UniqueConcatenate(umls_meddra)
0,513,levocarnitine,10000081,ATC+graph match,CID000010917,"C0000729, C0000737"
1,513,levocarnitine,10017999,ATC+graph match,CID000010917,C0687713
2,513,levocarnitine,10001906,ATC+graph match,CID000010917,C0002418
3,513,levocarnitine,10002034,ATC+graph match,CID000010917,C0002871
4,513,levocarnitine,10061428,ATC+graph match,CID000010917,"C0003123, C0232462"


In [4]:
# Combined FAERS (from DrugCentral) + SIDER table. A drug/AE pair that is absent
# from this file is read as "AE not reported for the drug" rather than "no data",
# on the assumption that the drug has other AEs reported.
#
# Some rows exist only because the pair is listed in SIDER; those rows carry no
# LLR value, so keeping only rows with a non-null 'llr' leaves the FAERS-derived
# pairs.
faers = (
    pd.read_csv(PATH_DATA_INPUT + 'combined_faers_sider.txt', sep='\t')
    .loc[lambda frame: frame['llr'].notnull()]
)
faers.head(5)

Unnamed: 0,drugcentral_faers_table_id,struct_id,meddra_code,llr,llr_threshold,drug_ae,drug_no_ae,no_drug_ae,no_drug_no_ae,AEs_in_DrugCentral,AEs_in_SIDER,drug_AE_pair_in_SIDER,PRR,PRR-lower95%CI
0,119245.0,5165,10076309,3.137952,56.353728,10.0,4753.0,25026.0,4609203.0,1,0,,0.388781,0.209293
1,119246.0,1679,10038661,4.882797,37.338532,3.0,4288.0,1618.0,4633083.0,1,1,0.0,2.002654,0.645463
2,119247.0,1679,10039203,0.925238,37.338532,3.0,4288.0,8406.0,4626295.0,1,1,0.0,0.385474,0.124345
3,119248.0,4969,10002383,0.646111,36.959713,4.0,6609.0,9577.0,4622802.0,1,0,,0.292574,0.109816
4,119249.0,1679,10039424,15.916568,37.338532,8.0,4283.0,2929.0,4631772.0,1,1,0.0,2.95008,1.474865


In [5]:
# MedDRA hierarchy lookup built with the UMLS API: for each preferred term (PT)
# it lists the parent high level terms and high level group terms. UMLS labels
# these HT and HG respectively, instead of the more usual HLT and HLGT.
parents_file = PATH_DATA_INPUT + 'terms_vs_parents.txt'
parents = pd.read_csv(parents_file, sep='\t')
parents.head(5)

Unnamed: 0,input_code,input_type,parent_code,parent_name,parent_type,CUI
0,10000050,PT,10017927,Gastrointestinal and hepatobiliary procedural ...,HT,C0852238
1,10000050,PT,10034654,Peritoneal and retroperitoneal fibrosis and ad...,HT,C0851945
2,10000050,PT,10069888,Procedural related injuries and complications NEC,HG,C2748389
3,10000050,PT,10034652,Peritoneal and retroperitoneal conditions,HG,C0851466
4,10000050,PT,10022117,"Injury, poisoning and procedural complications",OS,C0947733


In [6]:
# Every MedDRA PT that appears in either SIDER or FAERS.
# There can be multiple HT or HG terms for a given PT.
all_pts = set(sider['meddra_PT']) | set(faers['meddra_code'])
print('Number of unique MedDRA PTs: {}'.format(len(all_pts)))


def _parents_of_type(parent_type):
    """Return {input_code: [unique parent_code, ...]} for one parent_type.

    Built with a single pass over `parents` so that the lookups below are
    dictionary accesses instead of a full boolean-mask scan of the frame for
    every term. Parent codes keep their order of first appearance, as
    Series.unique() does.
    """
    rows = parents.loc[parents['parent_type'] == parent_type]
    grouped = rows.groupby('input_code', sort=False)['parent_code']
    return {code: list(codes.unique()) for code, codes in grouped}


ht_lookup = _parents_of_type('HT')
hg_lookup = _parents_of_type('HG')

# term2parent[pt] -> {'HT': [...], 'HG': [...]};  term2parent[ht] -> {'HG': [...]}
term2parent = defaultdict(dict)

for pt in all_pts:
    hts = list(ht_lookup.get(pt, []))
    hgs = list(hg_lookup.get(pt, []))

    # PTs lacking either kind of parent are reported and left out of the map
    if not hts or not hgs:
        print('ERROR, no parent terms found for MedDRA PT {}'.format(pt))
        continue

    term2parent[pt]['HT'] = hts
    term2parent[pt]['HG'] = hgs

    # Map each HT to its HGs using the HT -> HG relationships returned directly
    # by UMLS, not the HGs listed for the PT above: a PT can reach HT/HG terms
    # through several contexts, so pairing every HT of the PT with every HG of
    # the PT would mix unrelated branches.
    for ht in hts:
        if ht not in term2parent:
            # may be an empty list when no HG rows exist for this HT
            term2parent[ht]['HG'] = list(hg_lookup.get(ht, []))

Number of unique MedDRA PTs: 12291


In [7]:
# Distinct struct_id values present in each source
sider_drugs = set(sider['struct_id'])
faers_drugs = set(faers['struct_id'])

print('Number of SIDER drugs {}'.format(len(sider_drugs)))
print('Number of FAERS drugs {}'.format(len(faers_drugs)))

Number of SIDER drugs 1281
Number of FAERS drugs 2220


In [8]:
# Spot check: the HT and HG parent codes recorded for one MedDRA PT.
# NOTE: term2parent is a defaultdict, so indexing a code that is not in the map
# would insert an empty entry rather than raise.
term2parent[10027011]

{'HT': [10039289, 10040797], 'HG': [10047438, 10040792]}

# build out the SIDER positives

In [9]:
# Container for all training sets. Each '<source>:<level>' entry maps a MedDRA
# code to a dict whose 'pos' key holds the set of positive drugs (struct_id);
# the two '*_DRUGS' entries hold the full drug set of each source.
data = {
    'SIDER:PT': defaultdict(dict),
    'SIDER:HT': defaultdict(dict),
    'SIDER:HG': defaultdict(dict),
    'FAERS:PT': defaultdict(dict),
    'FAERS:HT': defaultdict(dict),
    'FAERS:HG': defaultdict(dict),
    'SIDER_DRUGS': sider_drugs,
    'FAERS_DRUGS': faers_drugs,
}

sider_pt = data['SIDER:PT']
sider_ht = data['SIDER:HT']
sider_hg = data['SIDER:HG']

# Walk the SIDER pairs: a drug listed for a PT is a positive for that PT and
# for every HT and HG parent of that PT
for pt, drug in zip(sider['meddra_PT'], sider['struct_id']):
    sider_pt[pt].setdefault('pos', set()).add(drug)

    for ht in term2parent[pt]['HT']:
        sider_ht[ht].setdefault('pos', set()).add(drug)

    for hg in term2parent[pt]['HG']:
        sider_hg[hg].setdefault('pos', set()).add(drug)

# build out SIDER negatives
1) start with HG level terms - any SIDER drug that is not a positive for the term is a negative

2) move on to HT terms - a drug is a negative if it is not a positive for the term and not a positive for any of the term's parent HG terms

3) move on to PT terms - a drug is a negative if it is not a positive for the term and not a positive for any of the term's parent HT terms

In [10]:
# Collect one summary record per term; converted to a DataFrame at the end
summary = []

# HG level: every SIDER drug that is not a positive for the term is a negative
for hg in data['SIDER:HG']:
    subdata = data['SIDER:HG'][hg]
    subdata['neg'] = sider_drugs - subdata['pos']
    summary.append({'source': 'SIDER', 'MedDRA_code': hg, 'term_type': 'HG', 'n_pos': len(subdata['pos']), 'n_neg': len(subdata['neg'])})

# HT level: a negative is neither a positive for the HT nor for any parent HG
for ht in data['SIDER:HT']:
    subdata = data['SIDER:HT'][ht]

    # Remove the term's own positives first so 'pos' and 'neg' can never
    # overlap, even when the HG list below is empty or does not cover every
    # PT that contributed positives to this HT.
    negs = sider_drugs - subdata['pos']

    if not term2parent[ht]['HG']:
        print('ERROR as there are no given HG terms for this HT term {}'.format(ht))

    # remove any positives for the HG terms corresponding to this HT term
    for hg in term2parent[ht]['HG']:
        negs = negs - data['SIDER:HG'][hg]['pos']

    subdata['neg'] = negs
    summary.append({'source': 'SIDER', 'MedDRA_code': ht, 'term_type': 'HT', 'n_pos': len(subdata['pos']), 'n_neg': len(subdata['neg'])})

# PT level: a negative is neither a positive for the PT nor for any parent HT
for pt in data['SIDER:PT']:
    subdata = data['SIDER:PT'][pt]

    # Same disjointness guarantee as at the HT level
    negs = sider_drugs - subdata['pos']

    if not term2parent[pt]['HT']:
        print('ERROR as there are no given HT terms for this PT term {}'.format(pt))

    # remove any positives for the HT terms corresponding to this PT term
    for ht in term2parent[pt]['HT']:
        negs = negs - data['SIDER:HT'][ht]['pos']

    subdata['neg'] = negs
    summary.append({'source': 'SIDER', 'MedDRA_code': pt, 'term_type': 'PT', 'n_pos': len(subdata['pos']), 'n_neg': len(subdata['neg'])})

summary_sider  = pd.DataFrame(summary)

In [11]:
# Per-term SIDER positive / negative counts (HG rows first, then HT, then PT)
summary_sider

Unnamed: 0,source,MedDRA_code,term_type,n_pos,n_neg
0,SIDER,10018012,HG,1148,133
1,SIDER,10047518,HG,668,613
2,SIDER,10002086,HG,596,685
3,SIDER,10003018,HG,790,491
4,SIDER,10018073,HG,1195,86
...,...,...,...,...,...
5917,SIDER,10007766,PT,1,1208
5918,SIDER,10047661,PT,1,1203
5919,SIDER,10037532,PT,1,1138
5920,SIDER,10057426,PT,1,1138


# build out the FAERS positives and negatives (in the dataset)

In [12]:
# Walk the FAERS rows and define positives for PT, HT and HG level terms
for row in faers.itertuples():

    # A drug-AE pair is a positive only when its llr reaches llr_thres times
    # the row's own llr_threshold. The test is evaluated once, as a negated
    # '>=', so all three levels agree: a row whose llr_threshold is missing
    # (NaN) is skipped entirely instead of being added at PT level only.
    # NOTE(review): whether NaN llr_threshold values occur in the input is not
    # visible here - confirm against the data.
    if not (row.llr >= llr_thres * row.llr_threshold):
        continue

    pt = row.meddra_code
    drug = row.struct_id

    data['FAERS:PT'][pt].setdefault('pos', set()).add(drug)

    # propagate the positive to every HT and HG parent of the PT
    for ht in term2parent[pt]['HT']:
        data['FAERS:HT'][ht].setdefault('pos', set()).add(drug)

    for hg in term2parent[pt]['HG']:
        data['FAERS:HG'][hg].setdefault('pos', set()).add(drug)

# build out FAERS negatives - same approach as SIDER

In [13]:
# Collect one summary record per term; converted to a DataFrame at the end
summary = []

# HG level: every FAERS drug that is not a positive for the term is a negative
for hg in data['FAERS:HG']:
    subdata = data['FAERS:HG'][hg]
    subdata['neg'] = faers_drugs - subdata['pos']
    summary.append({'source': 'FAERS', 'MedDRA_code': hg, 'term_type': 'HG', 'n_pos': len(subdata['pos']), 'n_neg': len(subdata['neg'])})

# HT level: a negative is neither a positive for the HT nor for any parent HG
for ht in data['FAERS:HT']:
    subdata = data['FAERS:HT'][ht]

    # Remove the term's own positives first so 'pos' and 'neg' can never
    # overlap, even when the HG list below is empty or does not cover every
    # PT that contributed positives to this HT.
    negs = faers_drugs - subdata['pos']

    if not term2parent[ht]['HG']:
        print('ERROR as there are no given HG terms for this HT term {}'.format(ht))

    # remove any positives for the HG terms corresponding to this HT term
    for hg in term2parent[ht]['HG']:
        negs = negs - data['FAERS:HG'][hg]['pos']

    subdata['neg'] = negs
    summary.append({'source': 'FAERS', 'MedDRA_code': ht, 'term_type': 'HT', 'n_pos': len(subdata['pos']), 'n_neg': len(subdata['neg'])})

# PT level: a negative is neither a positive for the PT nor for any parent HT
for pt in data['FAERS:PT']:
    subdata = data['FAERS:PT'][pt]

    # Same disjointness guarantee as at the HT level
    negs = faers_drugs - subdata['pos']

    if not term2parent[pt]['HT']:
        print('ERROR as there are no given HT terms for this PT term {}'.format(pt))

    # remove any positives for the HT terms corresponding to this PT term
    for ht in term2parent[pt]['HT']:
        negs = negs - data['FAERS:HT'][ht]['pos']

    subdata['neg'] = negs
    summary.append({'source': 'FAERS', 'MedDRA_code': pt, 'term_type': 'PT', 'n_pos': len(subdata['pos']), 'n_neg': len(subdata['neg'])})

summary_faers = pd.DataFrame(summary)

In [14]:
# Per-term FAERS positive / negative counts (HG rows first, then HT, then PT)
summary_faers

Unnamed: 0,source,MedDRA_code,term_type,n_pos,n_neg
0,FAERS,10079145,HG,494,1726
1,FAERS,10029305,HG,548,1672
2,FAERS,10039911,HG,194,2026
3,FAERS,10007521,HG,342,1878
4,FAERS,10028593,HG,149,2071
...,...,...,...,...,...
7391,FAERS,10022841,PT,1,2072
7392,FAERS,10052594,PT,1,2097
7393,FAERS,10005600,PT,1,2201
7394,FAERS,10052894,PT,1,2135


In [15]:
# Stack the SIDER and FAERS per-term statistics into a single table
summary = pd.concat([summary_sider, summary_faers], axis=0, ignore_index=True)

# write out summary dataset
summary.to_csv(PATH_DATA_OUTPUT + 'SIDER_FAERS_statistics.csv', index=False)

# Save the data structure as a pickled object. The file name records the LLR
# multiplier used, so runs with different llr_thres values do not overwrite
# each other.
training_sets_path = PATH_DATA_INTERMEDIATE + f'AE_training_sets_llr{llr_thres}.pkl'
with open(training_sets_path, 'wb') as outf:
    pickle.dump(data, outf, pickle.HIGHEST_PROTOCOL)

# Report the path that was actually written (the previous message named a
# file, AE_training_sets.pkl, that this cell never creates)
print('Complete: Pickled training sets stored in {}'.format(training_sets_path))

Complete: Pickled training sets stored in AE_training_sets.pkl
