# Notebook #4

Jeffrey Sutherland
5/14/2021

##### Output of this notebook is "redundant" and is post-processed in Spotfire

In [1]:
import pandas as pd
import numpy as np
import pickle
import numpy
import os
import pprint
from scipy import stats
from sklearn.metrics import auc, roc_curve, roc_auc_score

In [2]:
# All data folders live under one root, given relative to the notebook location
PATH_DATA_ROOT = "../data/"

PATH_DATA_INPUT = PATH_DATA_ROOT + "input/"
PATH_DATA_INTERMEDIATE = PATH_DATA_ROOT + "intermediate/"
PATH_DATA_OUTPUT = PATH_DATA_ROOT + "output/"
PATH_MODELS = PATH_DATA_ROOT + "models/"

In [3]:
# Diagnostic flag: when True, later cells restrict the analysis to the test codes and
# write extra CSV files for investigation.
# NOTE(review): the analysis cell further down assigns testmode again, so that later
# assignment is the one actually in effect when the notebook is run top to bottom.
testmode = False

Read the adverse event (FAERS, SIDER) positives and negatives from the pickled output of make_AE_training_sets.ipynb

In [4]:
# Load the adverse-event label sets from the intermediate pickle.
# NOTE: pickle.load executes arbitrary code; only open files produced by this project.
ae_pickle_path = PATH_DATA_INTERMEDIATE + 'AE_training_sets.pkl'
with open(ae_pickle_path, 'rb') as inf:
    AE_data = pickle.load(inf)

ae_top_level_keys = AE_data.keys()
print('Read in AE pickle; top level keys are {}'.format(ae_top_level_keys))

Read in AE pickle; top level keys are dict_keys(['SIDER:PT', 'SIDER:HT', 'SIDER:HG', 'FAERS:PT', 'FAERS:HT', 'FAERS:HG', 'SIDER_DRUGS', 'FAERS_DRUGS'])


In [5]:
# Combine the drug collections of both AE sources into one collection of distinct drugs
sider_drugs = AE_data['SIDER_DRUGS']
faers_drugs = AE_data['FAERS_DRUGS']
unique_ae_drugs = sider_drugs | faers_drugs

n_unique_ae_drugs = len(unique_ae_drugs)
print('Number of unique drugs with AE info', n_unique_ae_drugs)

Number of unique drugs with AE info 2365


Read the pickled activity dataset from make_AE_vs_activity_dataset.ipynb

In [6]:
# Load the per-assay activity records from the intermediate pickle.
# NOTE: pickle.load executes arbitrary code; only open files produced by this project.
assay_pickle_path = PATH_DATA_INTERMEDIATE + 'activity_training_sets_pub.pkl'
with open(assay_pickle_path, 'rb') as inf:
    assay_data = pickle.load(inf)

n_assay_groups = len(assay_data)
print('Read in assay pickle; number of assay groups at top level: {}'.format(n_assay_groups))

Read in assay pickle; number of assay groups at top level: 168


Function to perform statistical tests between AE labels and activity data.

When performing statistical associations between AEs and activity results, results with IC50 qualifier (and hence
Cmax and free Cmax margin) are censored as follows

- IC50 results: qualifier '>' or '=' and value >= 10 - use the value 10; qualifier '>' and value < 10: exclude 
- Cmax margin: qualifier '>' or '=' and value >= 2 - use the value 2; qualifier '>' and value < 2: exclude
- free Cmax margin: qualifier '>' or '=' and value >= 10 - use the value 10; qualifier '>' and value < 10: exclude

For Cmax and free Cmax margin - the values of 2 and 10 are approximately the 1st quartile among results with qualifier '>'

However, the cutoffs of 30, 10, and 100 (for IC50, Cmax margin, and free Cmax margin, respectively) were also explored in the paper.

In [7]:
def run_stat_assoc(param, assay, pos, neg, cutoff_dict={"ic50": 30, "cmax_margin": 10, "free_cmax_margin": 100}):
    """Compare the activity values of AE-positive and AE-negative drugs for one assay.

    Activity records are read from the notebook-level ``assay_data`` mapping:
    ``assay_data[assay][drug]`` is a record holding the value under ``param``
    and its qualifier under ``'prefix'``.

    Values are censored before testing:

    - value >= cutoff (any qualifier): replaced by the cutoff
    - qualifier '>' and value < cutoff: excluded (true value is unknown)
    - everything else: used as is and counted as a non-censored value

    Parameters
    ----------
    param : str
        Key of the activity value in each record; also the key into ``cutoff_dict``.
    assay : hashable
        Key into ``assay_data``.
    pos, neg : set
        Drug identifiers labelled positive / negative for the adverse event
        (combined with ``|``).
    cutoff_dict : dict
        Censoring cutoff per parameter. It is only read, never modified.

    Returns
    -------
    dict or None
        ``None`` when there are fewer than 10 positives, fewer than 50 negatives
        or fewer than 10 non-censored values after filtering, or when none of
        median / Q1 / P10 is smaller for the positives than for the negatives.
        Otherwise a dict with keys ``n_pos``, ``n_neg``, ``median_pos``,
        ``median_neg``, ``Q1_pos``, ``Q1_neg``, ``P10_pos``, ``P10_neg``,
        ``kw_hstat``, ``kw_pvalue``, ``roc_auc`` and ``roc_auc_partial``.
    """
    cutoff = cutoff_dict[param]
    records = assay_data[assay]

    result = dict()
    simplified = dict()

    # number of values that are used as measured (neither capped nor excluded)
    nonqual_vals = 0
    for c in pos | neg:
        record = records.get(c, None)
        if record is None:
            # no data
            continue

        value = record[param]
        if value is None or numpy.isnan(value):
            # happens for Cmax or free Cmax where IC50 was avail but not the exposure, hence these are NaN
            continue

        if value >= cutoff:
            # Cap at the cutoff. The comparison is inclusive so that results sitting exactly
            # on the cutoff (e.g. '>10' with a cutoff of 10) are treated as censored and are
            # NOT counted as non-censored values below.
            simplified[c] = cutoff
        elif record['prefix'] == '>':
            # lower bound below the cutoff: the true value cannot be placed, so exclude
            continue
        else:
            simplified[c] = value
            nonqual_vals += 1

    pos_vals = [simplified[x] for x in pos if x in simplified]
    neg_vals = [simplified[x] for x in neg if x in simplified]

    # check constraints on min # of pos, neg, after filtering on qualifier
    if len(pos_vals) < 10 or len(neg_vals) < 50 or nonqual_vals < 10:
        return None

    result['n_pos'] = len(pos_vals)
    result['n_neg'] = len(neg_vals)
    result['median_pos'] = np.percentile(pos_vals, 50)
    result['median_neg'] = np.percentile(neg_vals, 50)
    result['Q1_pos'] = np.percentile(pos_vals, 25)
    result['Q1_neg'] = np.percentile(neg_vals, 25)
    result['P10_pos'] = np.percentile(pos_vals, 10)
    result['P10_neg'] = np.percentile(neg_vals, 10)

    # skip further stat computations if none of the quantiles are smaller for drugs with the adverse event vs. those without
    if result['median_pos'] >= result['median_neg'] and result['Q1_pos'] >= result['Q1_neg'] and result['P10_pos'] >= result['P10_neg']:
        return None

    # perform KW-test
    kw = stats.kruskal(pos_vals, neg_vals)
    result['kw_hstat'] = kw[0]
    result['kw_pvalue'] = kw[1]

    # get ROC curve; negatives are labelled 1 and positives 2
    y_actual = [1] * len(neg_vals) + [2] * len(pos_vals)
    # invert the values to ensure that standard interpretation applies (predicted positive above threshold)
    # i.e small IC50 or margin means predicted positive
    y_pred = [1/x for x in neg_vals + pos_vals]

    result['roc_auc'] = roc_auc_score(y_actual, y_pred)
    result['roc_auc_partial'] = roc_auc_score(y_actual, y_pred, max_fpr=0.2)

    return result

In [8]:
# Run configuration for the association scan
testmode = False
test = [10013457]
sources = ['FAERS', 'SIDER']
types = ['PT', 'HT', 'HG']

# censoring cutoffs applied to the activity values, one per parameter
cutoffs = {"ic50": 10, "cmax_margin": 2, "free_cmax_margin": 10}

# pos/neg drug assignments of the test codes (only filled in testmode)
test_AE_annot = list()

# one record of statistics per retained (AE code, assay, parameter) combination
allresults = list()

# drugs that are 1) positive for the AE, 2) below the median (margin or IC50) of the positive drugs,
# for associations with KW p-value < 1e-05 and ROC AUC > 0.7
posdrugs = list()

for source in sources:
    for tp in types:
        pair = source + ':' + tp
        if AE_data.get(pair, None) is None:
            print('Failed to find AE info for pair {}; skipping'.format(pair))
            continue

        for code in AE_data[pair]:

            if testmode and code not in test:
                continue

            pos = AE_data[pair][code]['pos']
            neg = AE_data[pair][code]['neg']
            if testmode:
                test_AE_annot.extend(
                    [{'type': tp, 'source': source, 'code': code, 'class': 'pos', 'drug_id': x} for x in pos])
                test_AE_annot.extend(
                    [{'type': tp, 'source': source, 'code': code, 'class': 'neg', 'drug_id': x} for x in neg])

            for a in assay_data:

                # cheap pre-check: at least 10 positive drugs must have a record for this assay
                pos_w_data = [x for x in pos if assay_data[a].get(x, None) is not None]
                if len(pos_w_data) < 10:
                    continue

                for param in ['ic50', 'cmax_margin', 'free_cmax_margin']:
                    result = run_stat_assoc(param, a, pos, neg, cutoff_dict=cutoffs)
                    if result is None or result['kw_pvalue'] > 0.1:
                        continue

                    result.update({'source': source, 'type': tp, 'code': code, 'assay': a, 'param': param})
                    allresults.append(result)

                    # only strong associations contribute to the positive-drug list
                    if not (result['kw_pvalue'] < 1e-05 and result['roc_auc'] > 0.7):
                        continue

                    # collect the AE-positive drugs whose exact ('=') value lies below the median of the positives
                    for c in pos:
                        record = assay_data[a].get(c, None)
                        if record is None:
                            # no data
                            continue

                        value = record[param]
                        if value is None or numpy.isnan(value):
                            # happens for Cmax or free Cmax where IC50 was avail but not the exposure, hence these are NaN
                            continue

                        if record['prefix'] == '=' and value < result['median_pos']:
                            posdrugs.append({'drug': c, 'assay': a, 'code': code, 'source': pair, 'param': param, 'value': value})

In [9]:
# Save the association statistics; the file name records the three censoring cutoffs of this run
assoc_csv_name = 'AE_vs_assay_associations_{}_{}_{}.csv'.format(
    cutoffs['ic50'], cutoffs['cmax_margin'], cutoffs['free_cmax_margin'])

df_all = pd.DataFrame(allresults)
df_all.to_csv(PATH_DATA_OUTPUT + assoc_csv_name, index=False)

In [10]:
# Write out the drugs predicted to have the AE via activity.
# The file gets its own name prefix and carries all three cutoffs (ic50, cmax_margin,
# free_cmax_margin). Previously it reused the prefix of the association table and
# repeated the cmax_margin cutoff, so the name was misleading and collided with the
# association table whenever the two margin cutoffs were equal.
df_posdrugs = pd.DataFrame(posdrugs)
df_posdrugs.to_csv(PATH_DATA_OUTPUT + 'AE_vs_assay_vs_pos_drug_triplets_{}_{}_{}.csv'.format(
                    cutoffs['ic50'], cutoffs['cmax_margin'], cutoffs['free_cmax_margin']), index=False)

In [11]:
# writing out content for diagnostic purposes
if testmode:

    # write out the assay dataset for investigation purposes
    assay_out = list()
    for a in assay_data:
        for d in assay_data[a]:
            # Work on a copy: adding the 'assay' / 'drug_id' columns directly to the record
            # would modify assay_data itself and leak into any cell that is run afterwards.
            assay_row = dict(assay_data[a][d])
            assay_row.update({'assay': a, 'drug_id': d})
            assay_out.append(assay_row)

    df_test_1 = pd.DataFrame(assay_out)
    df_test_1.to_csv(PATH_DATA_OUTPUT + 'assay_data_10013457.csv', index=False)

    # write out the pos/neg assignments collected for the test codes
    df_test_2 = pd.DataFrame(test_AE_annot)
    df_test_2.to_csv(PATH_DATA_OUTPUT + 'AE_assignments_10013457.csv', index=False)