# Notebook #3

Jeffrey Sutherland
8/2/2021

In [1]:
import pandas as pd
import pickle
from collections import defaultdict
import os

In [2]:
# Data directories, relative to the notebook's working directory.  Each value ends
# with a slash because file names are appended by plain string concatenation.
PATH_DATA_INPUT = "../data/input/"
PATH_DATA_INTERMEDIATE = "../data/intermediate/"
PATH_DATA_OUTPUT = "../data/output/"
PATH_MODELS = '../data/models/'

Read the adverse event (FAERS, SIDER) positives and negatives from the pickled output of make_AE_training_sets.ipynb

In [3]:
# Load the adverse-event training sets pickled by make_AE_training_sets.ipynb.
# NOTE: unpickling can execute arbitrary code - only load files produced by this project.
ae_pickle_path = PATH_DATA_INTERMEDIATE + 'AE_training_sets.pkl'
with open(ae_pickle_path, 'rb') as ae_file:
    AE_data = pickle.load(ae_file)

top_level_keys = AE_data.keys()
print('Read in AE pickle; top level keys are {}'.format(top_level_keys))

Read in AE pickle; top level keys are dict_keys(['SIDER:PT', 'SIDER:HT', 'SIDER:HG', 'FAERS:PT', 'FAERS:HT', 'FAERS:HG', 'SIDER_DRUGS', 'FAERS_DRUGS'])


Read in the mapping of DrugCentral struct IDs from prescribed drugs (e.g. enalapril) to their active metabolites, when
applicable (e.g. enalaprilat).

The column names follow DrugCentral nomenclature, in which "parent" denotes the active metabolite, although "parent drug"
normally refers to the prodrug.  Here struct_id is the ID of the prescribed prodrug (enalapril) and struct_id_for_parent
is the ID of the active metabolite (enalaprilat)

In [4]:
# Tab-separated lookup from prescribed drug (struct_id) to active metabolite
# (struct_id_for_parent).  Only file columns 0, 1, 3 and 4 are loaded and both
# ID columns are forced to int.
parent_map_path = PATH_DATA_INPUT + 'parent_to_metabolite_map.txt'
drug_map_data = pd.read_csv(
    parent_map_path,
    sep='\t',
    index_col=False,
    usecols=[0, 1, 3, 4],
    dtype={'struct_id': int, 'struct_id_for_parent': int},
)
drug_map_data.head(5)

Unnamed: 0,struct_id,struct_name,parent_name,struct_id_for_parent
0,27,azaribine,6-Azauridine,1000178
1,33,artisone acetate,artisone,1000035
2,47,acemetacin,indomethacin,1440
3,60,racecadotril,thiorphan,1000181
4,64,acetylcarnitine,levocarnitine,513


In [5]:
# convert to dict of lists - there can be multiple active metabolites per drug
# Build {prescribed struct_id: [active-metabolite struct_ids]}.  A list is kept per
# drug because there can be multiple active metabolites per drug.
drug_map = defaultdict(list)
for prescribed_id, metabolite_id in zip(drug_map_data['struct_id'],
                                        drug_map_data['struct_id_for_parent']):
    drug_map[prescribed_id].append(metabolite_id)

We define assay groups, which are assays that are equivalent.  These may be assays that were run in-house vs. CROs
for which compounds tested in both assays show them to be equivalent.  Assay groups may be defined at the level of targets
- merge assays for the same target where assay results are strongly correlated, including different assay types
(binding vs. antagonist)
- or merge only assay results where both target and type are the same (binding vs. binding, antagonist vs. antagonist, etc).
When combining at the target level, antagonist and agonist assays are not correlated so therefore don't get grouped.

In [6]:
# Table describing which assays are merged into a group and which one is preferred.
assay_merge_path = PATH_DATA_INPUT + 'merging_assays_preferred_annotation_pub.txt'
assay_combination_data = pd.read_csv(assay_merge_path, sep='\t')
assay_combination_data.head(5)

Unnamed: 0,cluster_name,cluster_assay_ids,assay_id,preferred_assay,merge_type
0,ACHE,"31589, 41631",31589,no,target
1,ACHE,"31589, 41631",41631,yes,target
2,ADRA1A,"3134, 5427, 5494, 18942, 41701",3134,no,target
3,ADRA1A,"3134, 5427, 5494, 18942, 41701",5427,no,target
4,ADRA1A,"3134, 5427, 5494, 18942, 41701",18942,no,target


In [7]:
# assay_vs_group: assay_id -> group key (the comma-separated cluster_assay_ids string)
# assay_groups:   group key -> {assay_id: 1 for the preferred assay, 0 otherwise}
assay_vs_group = {}
assay_groups = defaultdict(dict)
for merge_row in assay_combination_data.itertuples():
    # Rows merged at the 'target' level are ignored: the analysis focuses on
    # target-type pairs, which don't mix agonist / antagonist assays.
    if merge_row.merge_type != 'target':
        group_key = merge_row.cluster_assay_ids
        is_preferred = merge_row.preferred_assay == 'yes'
        assay_vs_group[merge_row.assay_id] = group_key
        assay_groups[group_key][merge_row.assay_id] = int(is_preferred)

Read in activity data file; at present there are several columns created in Spotfire for analysis there, 
but selection of preferred assay, preferred compound etc is performed here independently

In [8]:
# Summarized activity results (tab-separated).  Only the listed file columns are
# loaded; the two ID columns are forced to int.
activity_path = PATH_DATA_INPUT + 'final_summarized_activity_data_pub.txt'
activity_columns = [0, 1, 2, 3, 5, 6, 7, 25, 26, 27]
activity_data = pd.read_csv(
    activity_path,
    sep='\t',
    index_col=False,
    usecols=activity_columns,
    dtype={'assay_id': int, 'drugcentral_struct_id': int},
)
activity_data.head(5)

Unnamed: 0,drugcentral_struct_id,struct_match_type,inchi_key,assay_id,assay_group_name,summarized prefix,summarized IC50,cmax_margin,free_cmax_margin,RowId
0,2869,exact,ULSDMUVEXKOYBU-ZDUSSCGKSA-N,10942,PPARG antagonist,>,30.0,581.395349,775.193798,14147
1,128,exact,WKEMJKQOLOHJLZ-UHFFFAOYSA-N,10942,PPARG antagonist,>,30.0,177.514793,300.36344,14372
2,408,exact,ZDIGNSYAACHWNL-UHFFFAOYSA-N,10942,PPARG antagonist,>,30.0,833.333333,2976.190476,14547
3,180,exact,KRMDCWKBEZIMAB-UHFFFAOYSA-N,10942,PPARG antagonist,>,30.0,135.135135,3378.378378,15145
4,1675,exact,GWWLWDURRGNSRS-UHFFFAOYSA-N,10942,PPARG antagonist,>,30.0,,,15355


In [9]:
# Show column dtypes and non-null counts of the loaded activity table.
activity_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121097 entries, 0 to 121096
Data columns (total 10 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   drugcentral_struct_id  121097 non-null  int64  
 1   struct_match_type      121097 non-null  object 
 2   inchi_key              121097 non-null  object 
 3   assay_id               121097 non-null  int64  
 4   assay_group_name       121097 non-null  object 
 5   summarized prefix      121097 non-null  object 
 6   summarized IC50        121097 non-null  float64
 7   cmax_margin            77773 non-null   float64
 8   free_cmax_margin       68650 non-null   float64
 9   RowId                  121097 non-null  int64  
dtypes: float64(3), int64(3), object(4)
memory usage: 9.2+ MB


Many assays are either the only assay for a given target, or not sufficiently correlated with any other assay to be merged

These need to be added to the assay_vs_group and assay_groups data objects as single-assay groups


In [10]:
# Every assay that occurs in the activity data must belong to a group.  Assays that
# were not merged with any other assay become a single-assay group whose key is the
# string form of their own ID.
ares_ids = list(activity_data['assay_id'].unique())
for assay_id in ares_ids:  # not named `id`, which would shadow the builtin in later cells
    if assay_vs_group.get(assay_id, None) is None:
        assay_vs_group[assay_id] = str(assay_id)
        # defaulting to the preferred assay, a trivial result for a one assay group
        assay_groups[str(assay_id)][assay_id] = 1

# add the assay group ID to the activity data frame
activity_data['assay_group'] = activity_data['assay_id'].map(assay_vs_group)

# notnull() returns a boolean mask with one entry per row, so len() of it is always
# the row count; summing the mask counts the rows that really have a group.
n_with_group = int(activity_data['assay_group'].notnull().sum())
print('Number of activity data records with assay group defined {}'.format(n_with_group))

Number of activity data records with assay group defined 121097


## Algorithm for matching AEs for a prescribable product to activity data for the active substance

There are several levels of selection criteria

In this analysis we use merging at the level of target-type groups (i.e. not mixing binding / antagonist results)

To simplify selection of the best result for a prescription drug from AE resources (FAERS, SIDER) and a given
target-type group (e.g. HTR2A-antagonist), increment a score as follows:

1) We prefer activity data for parent drugs that have activity, or for the active metabolite in cases of drug/prodrug
pairs.  Assign +1000 to the active metabolite in those cases where both the parent prodrug and active metabolite were
tested (i.e. prefer enalaprilat over enalapril).  Sometimes, only the prodrug has activity data and we use this activity
even though sub-optimal
  
2) For a given compound selected in #1, there are sometimes multiple Novartis samples that were tested.  This might
include cases where a sample was racemic but the active drug is chiral.  There are cases where
chemical structures differ slightly, with the most reasonable explanation being that a drawing error was made in
registering a sample at Novartis.  The following hierarchy is used to select a sample for a given assay group vs. 
DrugCentral structure ID

- select perfect matches: the INCHI key obtained on the DrugCentral smiles matches the key from a Novartis smiles (+100)
- try to match without the protonation part of the key (e.g. graph + chirality - drop the part after second dash) (+90)
- try to match using only the graph part of the key (up to first dash), but require a name or synonym match as well (+80) 
- try to match on name or synonym, and review structures.  This normally gets matches where structure drawing errors
were made, either in DrugCentral or at Novartis registration (more likely) (+70)
- try to match on high structural similarity, and review structures (+60)
- there are a minimal number of remaining matches where there were no registered synonyms at Novartis.  These are
 typically things like natural products that are commonly drawn wrong (+50)

3) When there are multiple results within the assay group, select those for the preferred assay (+1).  Preferred assays were 
selected within each group to ensure that an assay which accounts for most of the data is always selected. i.e. try to
use the same assay as much as possible, and supplement with an equivalent assay within the group when necessary (preferred
assay was not used for this drug)

The arbitrary score assignments above ensure that selections at level 3 don't trump those at levels 1 or 2.

In [11]:
# Score contribution of each structure-match category (values of the
# struct_match_type column); a larger value is a more reliable match.
struct_match_score = dict([
    ('exact', 100),
    ('chiral', 90),
    ('graph and name match', 80),
    ('graph and syn match', 80),
    ('name match and struct review', 70),
    ('high sim and struct review', 60),
    ('graph no syns', 50),
])

In [12]:
def get_activity_records(drug, assay_grp):
    """
    Find the rows of activity_data relevant for a given drug vs. assay_grp pair and rank them.

    The score of a row is the sum of
      * 1000 when the row belongs to an active metabolite of ``drug`` (taken from drug_map),
        0 when it belongs to ``drug`` itself,
      * the structure-match value looked up in struct_match_score,
      * the value stored for the row's assay in assay_groups[assay_grp]
        (1 for the preferred assay, 0 otherwise).

    :param drug: drugcentral ID of the administered drug (prodrug, not active metabolite, when relevant)
    :param assay_grp: a string for the given assays which produce similar results (merged assays)
    :return: copy of the matching rows; when more than one row matches, a float ``score`` column is
             added and the rows are sorted by descending score.  Returns None if no matches
    """
    # candidate structure ID -> score bonus; active metabolites get +1000 so that they are
    # preferred over the administered drug, which gets 0
    query_drugs = {metabolite: 1000 for metabolite in drug_map.get(drug, ())}
    query_drugs[drug] = 0

    is_candidate = activity_data['drugcentral_struct_id'].isin(list(query_drugs))
    in_group = activity_data['assay_group'] == assay_grp
    activity_subset = activity_data[is_candidate & in_group].copy()
    if len(activity_subset) == 0:
        return None

    if len(activity_subset) > 1:
        # Score only the selected rows.  The preferred-assay term used to be mapped over the
        # whole activity_data column; index alignment gave the same values, but every call
        # re-scanned the full table.
        score = (activity_subset['drugcentral_struct_id'].map(query_drugs)
                 + activity_subset['struct_match_type'].map(struct_match_score)
                 + activity_subset['assay_id'].map(assay_groups[assay_grp]))
        # float keeps the dtype the same whether or not a lookup misses (a miss yields NaN)
        activity_subset['score'] = score.astype(float)
        activity_subset = activity_subset.sort_values(by='score', ascending=False)

    return activity_subset

In [13]:
def get_best_activity_record(activity_records):
    """
    Reduce the frame returned by get_activity_records to a plain dictionary describing its first row.

    Values are read by column position (5 to 8), so the frame must keep the column order of the
    activity table: summarized prefix, summarized IC50, cmax_margin, free_cmax_margin.
    :param activity_records: pandas dataframe from get_activity_records
    :return: dictionary with keys prefix, ic50, cmax_margin, free_cmax_margin
    """
    column_position = {'prefix': 5, 'ic50': 6, 'cmax_margin': 7, 'free_cmax_margin': 8}
    return {key: activity_records.iloc[0, position] for key, position in column_position.items()}

Demonstrate the get_activity_records function - camazepam (drugcentral ID 469; not to be confused with carbamazepine) is a prodrug for temazepam (drugcentral ID 2585).  The algorithm should pick the temazepam activity data.
Both drugs were tested vs. human (ARES assay ID 3134) and rat Alpha1A (ARES assay ID 18942) receptors.  The human assay is preferred.
The two assays are grouped into assay group  '3134, 18942' - it's a string containing comma-separated assay IDs, not a python list

The first row in dataframe is drugcentral ID 2585 (temazepam) vs. assay ID 3134 (human version)

In [14]:
# Drug 469 vs. the assay group keyed '3134, 18942'; the key is a single
# comma-separated string of assay IDs, not a python list.
sample1 = get_activity_records(drug=469, assay_grp='3134, 18942')
sample1.head()

Unnamed: 0,drugcentral_struct_id,struct_match_type,inchi_key,assay_id,assay_group_name,summarized prefix,summarized IC50,cmax_margin,free_cmax_margin,RowId,assay_group,score
24229,2585,exact,SEQDDYPDSLOBDC-UHFFFAOYSA-N,3134,ADRA1A Binding (NIBR assay),>,10.0,3.472222,86.805556,894133,"3134, 18942",1101.0
24637,2585,exact,SEQDDYPDSLOBDC-UHFFFAOYSA-N,18942,ADRA1A Binding (NIBR assay),>,10.0,3.472222,86.805556,894208,"3134, 18942",1100.0
23910,469,exact,PXBVEXGRHZFEOF-UHFFFAOYSA-N,3134,ADRA1A Binding (NIBR assay),>,10.0,,,275489,"3134, 18942",101.0
24527,469,exact,PXBVEXGRHZFEOF-UHFFFAOYSA-N,18942,ADRA1A Binding (NIBR assay),>,10.0,,,275572,"3134, 18942",100.0


Another example - beclometasone dipropionate (drugcentral ID 294) vs. beclometasone (id 1000142), also with lower quality structure
matches from the Novartis sample vs. Drugcentral struct for beclometasone (but we prefer that over getting activity data from
the prodrug beclometasone dipropionate)

The first row in dataframe is drugcentral ID 1000142 (beclometasone) vs. assay ID 3088 (lower preference vs. assay 41625,
but 41625 is activity data from beclometasone dipropionate).  Therefore an example where data comes from a lower quality
structural match and lower preference assay, because it's the activity result where the active metabolite was tested

In [15]:
# Drug 294 vs. the assay group keyed '3088, 41625'.
sample2 = get_activity_records(drug=294, assay_grp='3088, 41625')
sample2.head()

Unnamed: 0,drugcentral_struct_id,struct_match_type,inchi_key,assay_id,assay_group_name,summarized prefix,summarized IC50,cmax_margin,free_cmax_margin,RowId,assay_group,score
9897,1000142,graph no syns,NBMKJKDGKREAPL-IJTKQNLUSA-N,3088,SLC6A3 Binding,>,10.0,50000.0,384615.3846,430779,"3088, 41625",1050.0
11357,294,graph and syn match,KUVIULQEHSCUHY-DDRZOEPZSA-N,41625,SLC6A3 Binding,>,30.0,,,835980,"3088, 41625",81.0
10256,294,graph and syn match,KUVIULQEHSCUHY-DDRZOEPZSA-N,3088,SLC6A3 Binding,>,10.0,,,835915,"3088, 41625",80.0


In [16]:
# Collapse the first row of the first example into the plain dictionary form
# returned by get_best_activity_record.
simple_dict = get_best_activity_record(sample1)
print('Best activity record from the first example is {}'.format(simple_dict))

Best activity record from the first example is {'prefix': '>', 'ic50': 10.0, 'cmax_margin': 3.472222222, 'free_cmax_margin': 86.80555556}


In [17]:
# Combine the drugs that have SIDER and/or FAERS adverse-event data.
# NOTE(review): `|` presumably acts as a set union here, i.e. both values are sets - confirm
# against make_AE_training_sets.ipynb.
unique_ae_drugs = AE_data['SIDER_DRUGS'] | AE_data['FAERS_DRUGS']
print('Number of unique drugs with AE info', len(unique_ae_drugs))

Number of unique drugs with AE info 2365


In [18]:
# Result container: assay group key -> {drug ID -> best activity record (plain dict)}
drug_activity_pairs = defaultdict(dict)
# track the RowID() values in the final dataset for comparison back to input data
retained_rowids = set()
count = 0

for assay_group_key in assay_groups:
    for ae_drug in unique_ae_drugs:
        # ranked DataFrame of candidate rows, or None when the pair has no activity data
        ranked_records = get_activity_records(ae_drug, assay_group_key)
        if ranked_records is not None:
            drug_activity_pairs[assay_group_key][ae_drug] = get_best_activity_record(ranked_records)
            # column position 9 of the activity table is RowId
            retained_rowids.add(ranked_records.iloc[0, 9])
            count += 1
            if not count % 1000:
                print('Done {} drug pairs'.format(count))

print('Number of drug vs. assay group pairs in dictionary: {}'.format(count))

Done 1000 drug pairs
Done 2000 drug pairs
Done 3000 drug pairs
Done 4000 drug pairs
Done 5000 drug pairs
Done 6000 drug pairs
Done 7000 drug pairs
Done 8000 drug pairs
Done 9000 drug pairs
Done 10000 drug pairs
Done 11000 drug pairs
Done 12000 drug pairs
Done 13000 drug pairs
Done 14000 drug pairs
Done 15000 drug pairs
Done 16000 drug pairs
Done 17000 drug pairs
Done 18000 drug pairs
Done 19000 drug pairs
Done 20000 drug pairs
Done 21000 drug pairs
Done 22000 drug pairs
Done 23000 drug pairs
Done 24000 drug pairs
Done 25000 drug pairs
Done 26000 drug pairs
Done 27000 drug pairs
Done 28000 drug pairs
Done 29000 drug pairs
Done 30000 drug pairs
Done 31000 drug pairs
Done 32000 drug pairs
Done 33000 drug pairs
Done 34000 drug pairs
Done 35000 drug pairs
Done 36000 drug pairs
Done 37000 drug pairs
Done 38000 drug pairs
Done 39000 drug pairs
Done 40000 drug pairs
Done 41000 drug pairs
Done 42000 drug pairs
Done 43000 drug pairs
Done 44000 drug pairs
Done 45000 drug pairs
Done 46000 drug pai

In [19]:
# save the data structure as pickled object
# save the data structure as pickled object
training_set_file = 'activity_training_sets_pub.pkl'
with open(PATH_DATA_INTERMEDIATE + training_set_file, 'wb') as outf:
    pickle.dump(drug_activity_pairs, outf, pickle.HIGHEST_PROTOCOL)

# Report the name of the file that was actually written; the message previously
# named 'activity_training_sets.pkl', which is not the file created above.
print('Complete: Pickled training sets stored in {}'.format(training_set_file))

Complete: Pickled training sets stored in activity_training_sets.pkl


In [20]:
# save the list of retained RowIDs to a file for cross-reference
# Write one retained RowId per line so the selection can be cross-referenced
# with the input activity table.
row_id_path = PATH_DATA_INTERMEDIATE + 'retained_row_ids_pub.txt'
with open(row_id_path, 'w') as row_id_file:
    for row_id in retained_rowids:
        row_id_file.write(str(row_id) + '\n')

print('Number of row IDs from orignal dataset retained: {}'.format(len(retained_rowids)))

Number of row IDs from orignal dataset retained: 87200
