# Notebook #6

Jeffrey Sutherland
3/15/2022

In [1]:
import pandas as pd
import numpy as np
import pickle
import numpy
import os
from collections import defaultdict
from scipy import stats
from sklearn.metrics import auc, roc_curve, roc_auc_score

In [2]:
# All data directories live under one relative root; each constant keeps its trailing slash so file
# names can be appended directly.
_DATA_ROOT = "../data/"
PATH_DATA_INPUT = _DATA_ROOT + "input/"
PATH_DATA_INTERMEDIATE = _DATA_ROOT + "intermediate/"
PATH_DATA_OUTPUT = _DATA_ROOT + "output/"
PATH_MODELS = _DATA_ROOT + "models/"

read tab-delim file of literature-reported target-vs-AE pairs

In [3]:
# literature-reported target-vs-AE pairs (tab-delimited)
lit_pairs_path = PATH_DATA_INPUT + 'lit_target_AE_pairs.txt'
lit_pairs = pd.read_csv(lit_pairs_path, sep='\t')
lit_pairs.head()

Unnamed: 0,gene,literature_direction,meddra_code,meddra_name,meddra_type,combined_effect,Lit_association_ID
0,ACHE,inhibition,10013654,Drug abuse,PT,abuse potential,1
1,ACHE,inhibition,10002974,Apnoea,PT,apnea,2
2,ACHE,inhibition,10003591,Ataxia,PT,ataxia,3
3,ACHE,inhibition,10003840,Autonomic nervous system imbalance,PT,autonomic dysfunction,4
4,ACHE,inhibition,10005734,Blood pressure decreased,PT,blood pressure decreased,5


read file mapping assay groups to genes

In [4]:
# assay group id -> human gene symbol map (tab-delimited)
assay_gene_path = PATH_DATA_INPUT + 'assay_group_vs_gene_map.txt'
assay_gene = pd.read_csv(assay_gene_path, sep='\t')
assay_gene.head()

Unnamed: 0,assay_group_id,human_gene_symbol
0,31149,ABCB11
1,"31589, 41631",ACHE
2,3127,ADORA1
3,41607,ADORA1
4,3177,ADORA2A


read tab-delim file of meddra PTs mapping to HTs and HGLTs

In [5]:
# The PT -> parent mapping was obtained with the UMLS API, which maps MedDRA preferred terms to HTs
# (high level terms) and HGs (high level group terms).  UMLS uses HT rather than the more usual HLT
# abbreviation, and HG rather than HGLT.
parents_path = PATH_DATA_INPUT + 'terms_vs_parents.txt'
parents = pd.read_csv(parents_path, sep='\t')
parents.head()

Unnamed: 0,input_code,input_type,parent_code,parent_name,parent_type,CUI
0,10000050,PT,10017927,Gastrointestinal and hepatobiliary procedural ...,HT,C0852238
1,10000050,PT,10034654,Peritoneal and retroperitoneal fibrosis and ad...,HT,C0851945
2,10000050,PT,10069888,Procedural related injuries and complications NEC,HG,C2748389
3,10000050,PT,10034652,Peritoneal and retroperitoneal conditions,HG,C0851466
4,10000050,PT,10022117,"Injury, poisoning and procedural complications",OS,C0947733


read tab-delim file of meddra codes vs. names (all lookups on meddra code; only for making output easier to review)

In [6]:
# index the name table by MedDRA code, then turn it into {meddra_code: {column: value}} for lookups
meddra_names_path = PATH_DATA_INPUT + 'codes_vs_names.txt'
meddra_names_data = pd.read_csv(meddra_names_path, sep='\t', index_col='meddra_code')
meddra_names = meddra_names_data.to_dict(orient='index')

read tab-delim file providing targets of drugs in the database

In [7]:
# targets of the drugs in the database (tab-delimited)
drug_target_path = PATH_DATA_INPUT + 'drugs_for_lit_targets.txt'
drug_target_pairs = pd.read_csv(drug_target_path, sep='\t')
drug_target_pairs.head()

Unnamed: 0,struct_id,name,action_type,EntrezGeneSymbol (NSPD),action_type_simple,manual_add
0,927,distigmine,INHIBITOR,ACHE,inhibitor,no
1,2551,tacrine,INHIBITOR,ACHE,inhibitor,no
2,1897,neostigmine,INHIBITOR,ACHE,inhibitor,no
3,946,donepezil,INHIBITOR,ACHE,inhibitor,no
4,2231,pralidoxime,ACTIVATOR,ACHE,activation,no


read the adverse event (FAERS, SIDER) positives and negatives from the pickled output of make_AE_training_sets.ipynb

In [8]:
# NOTE: unpickling can execute arbitrary code; only load pickles written by this project's notebooks
ae_pickle_path = PATH_DATA_INTERMEDIATE + 'AE_training_sets.pkl'
with open(ae_pickle_path, mode='rb') as ae_pickle:
    AE_data = pickle.load(ae_pickle)

print(f'Read in AE pickle; top level keys are {AE_data.keys()}')

Read in AE pickle; top level keys are dict_keys(['SIDER:PT', 'SIDER:HT', 'SIDER:HG', 'FAERS:PT', 'FAERS:HT', 'FAERS:HG', 'SIDER_DRUGS', 'FAERS_DRUGS'])


In [9]:
# drugs that have AE information in at least one of the two sources
sider_drugs = AE_data['SIDER_DRUGS']
faers_drugs = AE_data['FAERS_DRUGS']
unique_ae_drugs = sider_drugs | faers_drugs
print(f'Number of unique drugs with AE info {len(unique_ae_drugs)}')

Number of unique drugs with AE info 2365


read the pickled activity dataset from make_AE_vs_activity_dataset.ipynb

In [10]:
# NOTE: unpickling can execute arbitrary code; only load pickles written by this project's notebooks
activity_pickle_path = PATH_DATA_INTERMEDIATE + 'activity_training_sets_pub.pkl'
with open(activity_pickle_path, mode='rb') as activity_pickle:
    assay_data = pickle.load(activity_pickle)

print(f'Read in assay pickle; number of assay groups at top level: {len(assay_data)}')

Read in assay pickle; number of assay groups at top level: 168


Function to perform statistical tests between AE labels and activity data.

When performing statistical associations between AEs and activity results, results with IC50 qualifier (and hence
Cmax and free Cmax margin) are censored as follows

- IC50 results: qualifier '>' or '=' and value >= 10 - use the value 10; qualifier '>' and value < 10: exclude 
- Cmax margin: qualifier '>' or '=' and value >= 2 - use the value 2; qualifier '>' and value < 2: exclude
- free Cmax margin: qualifier '>' or '=' and value >= 10 - use the value 10; qualifier '>' and value < 10: exclude

For Cmax and free Cmax margin - the values of 2 and 10 are approximately the 1st quartile among results with qualifier '>'

In [11]:
def run_stat_assoc(param, assay, pos, neg, low_level=False, cutoff_dict={"ic50": 30, "cmax_margin": 10, "free_cmax_margin": 100}):
    """Test whether censored activity values differ between AE-positive and AE-negative compounds.

    Activity records are read from the notebook-level ``assay_data`` mapping as
    ``assay_data[assay][compound]``; each record is indexed by ``param``, 'prefix' and 'ic50'.

    Args:
        param: key of the activity value to test; must also be a key of the cutoff dict.
        assay: key of the assay group in ``assay_data``.
        pos: set of compounds positive for the adverse event (combined with ``neg`` using ``|``).
        neg: set of compounds negative for the adverse event.
        low_level: when True, the cutoffs 10 (ic50), 2 (cmax_margin) and 10 (free_cmax_margin)
            replace the corresponding entries of ``cutoff_dict``.
        cutoff_dict: censoring cutoff per parameter.  It is copied, never modified.

    Returns:
        None when, after censoring, fewer than 10 positives, fewer than 50 negatives or fewer than 5
        retained values without a '>' qualifier remain.  Otherwise a dict holding the counts,
        percentiles of the censored values, Kruskal-Wallis statistic / p-value and ROC AUC.
    """

    # Work on a private copy.  The default dict object is shared by every call, so updating it in
    # place would make the low-level cutoffs leak into all later calls made with low_level=False
    # (and would also overwrite a dict passed in by the caller).
    cutoffs = dict(cutoff_dict)
    if low_level:
        cutoffs.update({"ic50": 10, "cmax_margin": 2, "free_cmax_margin": 10})
    cutoff = cutoffs[param]

    result = dict()
    simplified = dict()  # compound -> censored activity value

    nonqual_vals = 0
    ic50_lt_1um = 0
    ic50_lt_500nm = 0
    ic50_lt_100nm = 0

    for c in pos | neg:
        record = assay_data[assay].get(c, None)
        if record is None:
            # no data
            continue

        value = record[param]
        if value is None or np.isnan(value):
            # happens for Cmax or free Cmax where IC50 was avail but not the exposure, hence these are NaN
            continue
        if record['prefix'] == '>' and value < cutoff:
            # "greater than" a value below the cutoff cannot be placed relative to the cutoff: exclude
            continue

        # censor everything above the cutoff to the cutoff itself
        simplified[c] = cutoff if value > cutoff else value

        if record['prefix'] != '>':
            nonqual_vals += 1

            if record['ic50'] < 1:
                ic50_lt_1um += 1
            if record['ic50'] < 0.5:
                ic50_lt_500nm += 1
            if record['ic50'] < 0.1:
                ic50_lt_100nm += 1

    pos_vals = [simplified[x] for x in pos if x in simplified]
    neg_vals = [simplified[x] for x in neg if x in simplified]
    all_vals = pos_vals + neg_vals

    # check constraints on min # of pos, neg, after filtering on qualifier
    if len(pos_vals) < 10 or len(neg_vals) < 50 or nonqual_vals < 5:
        return None

    result['n_pos'] = len(pos_vals)
    result['n_neg'] = len(neg_vals)
    result['nonqual_vals'] = nonqual_vals
    for label, pct in (('median', 50), ('Q1', 25), ('P10', 10), ('P5', 5)):
        result[label + '_pos'] = np.percentile(pos_vals, pct)
        result[label + '_neg'] = np.percentile(neg_vals, pct)

    # extra quantities used for the Glmnet modelling
    for label, pct in (('median', 50), ('Q1', 25), ('P10', 10), ('P5', 5), ('P2.5', 2.5)):
        result[label + '_all'] = np.percentile(all_vals, pct)
    result['n_ic50_lt_1um'] = ic50_lt_1um
    result['n_ic50_lt_500nm'] = ic50_lt_500nm
    result['n_ic50_lt_100nm'] = ic50_lt_100nm

    # perform KW-test; a ValueError from scipy is reported as "no association"
    try:
        kw = stats.kruskal(pos_vals, neg_vals)
        result['kw_hstat'] = kw[0]
        result['kw_pvalue'] = kw[1]
    except ValueError:
        result['kw_hstat'] = 0
        result['kw_pvalue'] = 1

    # get ROC AUC; negatives are labelled 1, positives 2, and the score is the reciprocal of the
    # censored value so that smaller activity values give larger scores
    y_actual = [1] * len(neg_vals) + [2] * len(pos_vals)
    y_pred = [1/x for x in neg_vals + pos_vals]
    result['roc_auc'] = roc_auc_score(y_actual, y_pred)

    return result

In [12]:
# list accumulating one stats dict per (literature pair, MedDRA code, AE source, assay, level, param) test
allresults = list()

# drug -> set of target gene symbols, built once so the innermost loop does a dict lookup instead of
# re-scanning drug_target_pairs for every compound
targets_by_drug = {
    struct_id: set(symbols)
    for struct_id, symbols in drug_target_pairs.groupby('struct_id')['EntrezGeneSymbol (NSPD)']
}


def _meddra_name_or_blank(meddra_code):
    """Return the name stored in meddra_names for meddra_code, or '' when the code has no entry."""
    entry = meddra_names.get(meddra_code, None)
    return entry['meddra_name'] if entry is not None else ''


for row in lit_pairs.itertuples():

    gene = row.gene
    code = row.meddra_code
    litid = row.Lit_association_ID
    assays = list(assay_gene.loc[assay_gene['human_gene_symbol'] == gene, 'assay_group_id'].unique())

    codes = dict()  # meddra code -> term type ('HT', 'HG' or 'PT')
    mtype = row.meddra_type

    # find all parent HT or HG terms given the current meddra code for this literature pair
    hts = list()
    if mtype == 'PT':
        hts = list(parents.loc[(parents['parent_type'] == 'HT') & (parents['input_code'] == code), 'parent_code'].unique())
        for t in hts:
            codes[t] = 'HT'

    if mtype == 'PT' or mtype == 'HT':
        hgs = list(parents.loc[(parents['parent_type'] == 'HG') & (parents['input_code'] == code), 'parent_code'].unique())
        for t in hgs:
            codes[t] = 'HG'

    # add back any PTs shared by the HT and HG terms of this literature-reported meddra code
    hts_hgs = list(codes)
    for c in hts_hgs:
        pts = list(parents.loc[(parents['input_type'] == 'PT') & (parents['parent_code'] == c), 'input_code'].unique())
        for t in pts:
            codes[t] = 'PT'

    # Make sure the literature-reported code is tested first: best_pval must be known before related
    # terms are screened against it below.  Only codes already collected are tested, so the set of
    # tests is unchanged - just their order.
    ordered_codes = [c for c in codes if c == code] + [c for c in codes if c != code]
    best_pval = None

    for c in ordered_codes:

        # determine whether the literature code (code) and this code (c) share a HT (distance 1) or HG (distance 2)
        hts_c = list(parents.loc[(parents['parent_type'] == 'HT') & (parents['input_code'] == c), 'parent_code'].unique())
        intersect = list(set(hts) & set(hts_c))
        if code == c:
            dist = 0
        elif intersect:
            dist = 1
        else:
            dist = 2

        for source in ['FAERS', 'SIDER']:
            tp = codes[c]

            pair = source + ':' + tp
            if AE_data.get(pair, None) is None:
                print('ERROR - Failed to find AE info for pair {}; skipping'.format(pair))
                continue

            if AE_data[pair].get(c, None) is None:
                continue

            pos = AE_data[pair][c]['pos']
            neg = AE_data[pair][c]['neg']

            for a in assays:
                if assay_data.get(a, None) is None:
                    print('ERROR - did not find assay group {}; skipping'.format(a))
                    continue

                # pre-check on having at least 10 positives
                pos_w_data = [x for x in pos if assay_data[a].get(x, None) is not None]
                if len(pos_w_data) < 10:
                    continue

                for level in ['low', 'high']:

                    boollev = level == 'low'
                    for param in ['ic50', 'cmax_margin', 'free_cmax_margin']:
                        result = run_stat_assoc(param, a, pos, neg, boollev)
                        if result is None:
                            continue

                        # if this is a literature-reported code, i.e. the exact meddra PT for the AE in literature,
                        # record its p-value and require any other terms to exceed the significance
                        if c == code and (best_pval is None or result['kw_pvalue'] < best_pval):
                            best_pval = result['kw_pvalue']

                        # don't keep a result for a distant term if it's not significant
                        if dist == 2 and result['kw_pvalue'] > 0.05:
                            continue
                        # don't keep a result if the term given in literature reviews is more significant
                        elif c != code and best_pval is not None and result['kw_pvalue'] >= best_pval:
                            continue

                        on_target = 0
                        off_target = 0

                        # compile the count of active drugs that are on- vs. off- target
                        for cpd in pos:
                            record = assay_data[a].get(cpd, None)
                            if record is None:
                                # no data
                                continue

                            value = record[param]
                            if value is None or numpy.isnan(value):
                                # happens for Cmax or free Cmax where IC50 was avail but not the exposure, hence these are NaN
                                continue

                            # only exact ('=') values below the median of the positives are counted
                            if record['prefix'] == '=' and value < result['median_pos']:
                                drug_targets = targets_by_drug.get(cpd)
                                if drug_targets:
                                    if gene in drug_targets:
                                        on_target += 1
                                    else:
                                        off_target += 1

                        result.update({'source': source, 'type': tp, 'lit_code': code, 'actual_code': c, 'actual_type': tp,
                                       'dist': dist, 'assay': a, 'param': param, 'level': level, 'lit_id': litid, 'gene': gene,
                                       'on_target': on_target, 'off_target': off_target,
                                       'lit_name': _meddra_name_or_blank(code), 'actual_name': _meddra_name_or_blank(c)})

                        allresults.append(result)

In [13]:
# persist every association result that was kept, one row per test
complete_csv_path = PATH_DATA_OUTPUT + 'lit_AE_vs_assay_associations_complete.csv'
df1 = pd.DataFrame(allresults)
df1.to_csv(complete_csv_path, index=False)

This collection of related MedDRA codes was assembled by iteratively performing the following work (see next markdown block)

In [14]:
# manually reviewed literature-code vs. related-code pairs (tab-delimited)
meddra_syns_path = PATH_DATA_INPUT + 'meddra_pair_synonyms_reviewed.txt'
meddra_syns = pd.read_csv(meddra_syns_path, sep='\t')
meddra_syns.head()

Unnamed: 0,lit_code,related_code,lit_name,related_name
0,10001497,10001540,Agitation,Akathisia
1,10002034,10020969,Anemia,Hypochromic anemia
2,10002855,10001497,Anxiety,Agitation
3,10002855,10001540,Anxiety,Akathisia
4,10002974,10038669,Apnoea,Respiratory arrest


Go through results and list the most significant (by p-value) result for each Lit_association_ID

- among results with dist = 0 (i.e. one of the MedDRA codes from the literature reviews), report the most significant
by p-value among those with ROC AUC >= 0.6
- if no results at dist 0 have p-value <= 1e-03 and ROC AUC >= 0.6, check all MedDRA codes sharing a given HT (dist = 1).
- repeat with other codes sharing a given HG (dist = 2)
- the pairings of literature-reported AE vs. related AE must be reviewed manually as some codes under an HT or HG can
be irrelevant or opposite (e.g. heart rate increased and heart rate decreased share HT level terms)
- only reviewed (validated) distance 1 or 2 terms are reported out as the selected results for a given literature ID
- to cut down on manual review time, run code, review distance 1 terms and add to meddra_syns. Then run again,
review distance 2 terms, add to meddra_syns.  Then final run.  There can be many matches to review at level 2

In [15]:
# thresholds a result must meet to be selected outright
MIN_ROC_AUC = 0.6
MAX_KW_PVALUE = 0.001

new_pairs_review = defaultdict(dict)  # lit_code -> {unreviewed related code: distance}
selected_results = list()

# group the association results by literature ID once instead of re-scanning allresults for every ID;
# each group keeps the original order of allresults, so ties in the sorts below resolve as before
results_by_lit_id = defaultdict(list)
for res in allresults:
    results_by_lit_id[res['lit_id']].append(res)


def _by_pvalue(results):
    """Return the results as a list sorted by ascending Kruskal-Wallis p-value."""
    return sorted(results, key=lambda x: x['kw_pvalue'])


def _with_total_tests(result, total_tests):
    """Return a copy of result carrying total_tests, leaving the record held in allresults untouched."""
    return dict(result, total_tests=total_tests)


for lid in lit_pairs['Lit_association_ID'].unique():

    lid_results = results_by_lit_id.get(lid, [])

    results_dist0_no_auc = _by_pvalue(r for r in lid_results if r['dist'] == 0)
    total_tests = len(results_dist0_no_auc)
    best_pval = None
    selected_result = None
    results_dist0 = [r for r in results_dist0_no_auc if r['roc_auc'] >= MIN_ROC_AUC]

    if results_dist0:
        best_pval = results_dist0[0]['kw_pvalue']

        # already meets the significance threshold for Meddra term from literature reviews - no further action
        if best_pval <= MAX_KW_PVALUE:
            selected_results.append(_with_total_tests(results_dist0[0], total_tests))
            continue

    # setup the meddra codes related to the one from literature - these have been manually reviewed and need to be
    # updated iteratively
    lit_codes = list(lit_pairs.loc[lit_pairs['Lit_association_ID'] == lid, 'meddra_code'].unique())
    valid_related_codes = set(meddra_syns.loc[meddra_syns['lit_code'].isin(lit_codes), 'related_code'].unique())

    for dist in [1, 2]:

        results_distx_no_auc = _by_pvalue(
            r for r in lid_results if r['dist'] == dist and r['actual_code'] in valid_related_codes)
        # only counting the tests for related meddra terms that have been manually reviewed (in valid_related_codes)
        total_tests += len(results_distx_no_auc)

        results_distx = [r for r in results_distx_no_auc if r['roc_auc'] >= MIN_ROC_AUC]
        if results_distx and (best_pval is None or best_pval > results_distx[0]['kw_pvalue']):
            best_pval = results_distx[0]['kw_pvalue']

            if best_pval <= MAX_KW_PVALUE:
                selected_result = _with_total_tests(results_distx[0], total_tests)
                selected_results.append(selected_result)
                break
            else:
                # No validated term at this distance meets the threshold: queue the not-yet-reviewed terms
                # that do meet it for manual review.
                # NOTE(review): this branch is only reached when a validated term at this distance improved
                # best_pval; with no validated terms at all nothing is queued - confirm this is intended.
                results_distx_not_validated = _by_pvalue(
                    r for r in lid_results if r['dist'] == dist and r['actual_code'] not in valid_related_codes)

                for r in results_distx_not_validated:
                    if r['roc_auc'] >= MIN_ROC_AUC and r['kw_pvalue'] <= MAX_KW_PVALUE:
                        new_pairs_review[r['lit_code']][r['actual_code']] = dist

    if selected_result:
        pass
    # if we end up here, there were no distance 0 match or distance 1-2 matches with validated meddra code pairings
    # that satisfied the ROC AUC and KW p-value thresholds.  Take the best distance 0 result by p-value if it exists
    elif results_dist0_no_auc:
        selected_result = _with_total_tests(results_dist0_no_auc[0], total_tests)
        selected_results.append(selected_result)
    # There can be cases where there were no distance 0 results, i.e. the literature-reported meddra code didn't
    # have at least 10 positives with assay data
    else:
        results_distx_no_auc = _by_pvalue(
            r for r in lid_results if r['dist'] != 0 and r['actual_code'] in valid_related_codes)
        if results_distx_no_auc:
            selected_result = _with_total_tests(results_distx_no_auc[0], total_tests)
            selected_results.append(selected_result)

In [16]:
# write-out the selected result for each literature reported assay vs. ADR pair
name_fields = (('lit_name', 'lit_code'), ('actual_name', 'actual_code'))
for selected in selected_results:
    # add in the meddra names to facilitate review; a code without a name entry gets an empty string
    for name_key, code_key in name_fields:
        name_entry = meddra_names.get(selected[code_key], None)
        selected[name_key] = '' if name_entry is None else name_entry['meddra_name']

selected_csv_path = PATH_DATA_OUTPUT + 'lit_AE_vs_assay_associations_selected.csv'
df2 = pd.DataFrame(selected_results)
df2.to_csv(selected_csv_path, index=False)

In [17]:
# write-out pairs of literature vs. distance 1 or 2 meddra codes to review as being reasonably related
terms_review_list = list()
for code, related_codes in new_pairs_review.items():

    # names are only for readability of the review file; unknown codes get an empty name
    code_entry = meddra_names.get(code, None)
    name = code_entry['meddra_name'] if code_entry is not None else ''

    for relcode, distance in related_codes.items():

        related_entry = meddra_names.get(relcode, None)
        relname = related_entry['meddra_name'] if related_entry is not None else ''

        terms_review_list.append({
            'code': code,
            'name': name,
            'related_code': relcode,
            'related_name': relname,
            'dist': distance,
        })

review_csv_path = PATH_DATA_OUTPUT + 'lit_vs_related_meddra_code_review.csv'
df3 = pd.DataFrame(terms_review_list)
df3.to_csv(review_csv_path, index=False)