# Read clinical-trials covid dataset to set "truth"
 match to drugs in DrugBank based on Name <br>
 requires parsing out and cleansing drug-name from the <font color=blue><b>Interventions</b></font> column

### Additional Steps - for future consideration
Consider parsing <font color=blue>Title</font> field to get phase of clinical trial ?<br>
Consider using <font color=blue>URL</font> column to link to FDA web data<br>
<br>
Consider keeping <font color=blue>Status</font> and <font color=blue>Study Results</font>
<br>
See section below for including an external manual re-mapping of drugs

In [None]:
#import other packages
import pandas as pd
import numpy as np
import os
#from itertools import chain
import time
import getpass


In [None]:
# Resolve the data directories relative to the current working directory.
# Every path keeps its trailing slash because later cells append file names
# with plain string concatenation.
# NOTE: the paths are built from os.getcwd(), so re-running this cell after
# the chdir below resolves them relative to CTPath instead.
print(os.getcwd())
FDAPath = f"{os.getcwd()}/../data/source/"
CTPath = f"{FDAPath}ClinicalTrials/"
DBPath = f"{FDAPath}DrugBank/"

# Switch into the clinical-trials folder and confirm the move.
os.chdir(CTPath)
print(os.getcwd())


In [None]:
# create process to explode columns with multiple delimited values
#  takes an array column with ['drug: drug1', 'drug: drug2', 'other: ']
def explode(df, lst_cols, fill_value='', preserve_index=False):
    """Expand list-valued columns so that every list element gets its own row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``lst_cols`` cells hold list-like values.
    lst_cols : str or list-like of str
        Column(s) to expand. A single label is wrapped into a list. Row
        lengths are taken from the first listed column only, so when several
        columns are given their lists must have matching lengths per row.
    fill_value : scalar, default ''
        Value used to fill missing cells in the result. It is applied only
        when at least one row holds an empty list (such rows are kept as a
        single row with the list column filled).
    preserve_index : bool, default False
        If True, each output row keeps the index value of the row it came
        from (so index values repeat); otherwise the index is reset.

    Returns
    -------
    pandas.DataFrame
        The non-list columns (in the order returned by
        ``df.columns.difference``) followed by the expanded ``lst_cols``.
    """
    # make sure `lst_cols` is list-alike
    if (lst_cols is not None
            and len(lst_cols) > 0
            and not isinstance(lst_cols, (list, tuple, np.ndarray, pd.Series))):
        lst_cols = [lst_cols]
    # all columns except `lst_cols`
    idx_cols = df.columns.difference(lst_cols)
    # list length per row, measured on the first list column
    lens = df[lst_cols[0]].str.len()
    non_empty = lens > 0
    is_empty = lens == 0
    # repeat each original index value once per list element
    idx = np.repeat(df.index.values, lens)

    def _flatten(col):
        # np.concatenate raises on an empty sequence, so handle the case
        # where no row has any elements explicitly.
        cells = df.loc[non_empty, col].values
        if len(cells) == 0:
            return np.array([], dtype=object)
        return np.concatenate(cells)

    # create "exploded" DF
    repeated = {col: np.repeat(df[col].values, lens) for col in idx_cols}
    flattened = {col: _flatten(col) for col in lst_cols}
    res = pd.DataFrame(repeated, index=idx).assign(**flattened)
    # re-attach the rows whose lists were empty
    if is_empty.any():
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # equivalent that works across versions.
        res = (pd.concat([res, df.loc[is_empty, idx_cols]], sort=False)
                 .fillna(fill_value))
    # Restore the original row order. A stable sort keeps the elements of one
    # list in their original order even though they share an index value.
    res = res.sort_index(kind='mergesort')
    # reset index if requested
    if not preserve_index:
        res = res.reset_index(drop=True)
    return res

In [None]:
# Load the COVID clinical-trials export and discard trials that list no
# intervention, since later cells parse drug names out of that column.
display(f"reading: {CTPath}clinicaltrial_covid.csv")
ctrials_df = pd.read_csv(f"{CTPath}clinicaltrial_covid.csv")
ctrials_df.dropna(subset=['Interventions'], inplace=True)
display(ctrials_df.head(5))


In [None]:
# would need to parse title for phase


In [None]:
# Index the trials by their Rank, then turn the pipe-delimited Interventions
# text into a list per trial (stored in helper column 's1').
ctrials_df.set_index(keys=['Rank'], inplace=True)

ctrials_df['s1'] = ctrials_df['Interventions'].str.split(pat='|')

# Sanity checks: frame size, object types, and a before/after sample.
display(ctrials_df.shape)
display(type(ctrials_df['Interventions']))
display(type(ctrials_df['s1']))
display(type(ctrials_df))
display(ctrials_df[['Interventions', 's1']].head())


In [None]:
# One row per intervention: expand the 's1' lists, keeping each source row's
# index value, then move that index into a regular column (named 'index').
ctrials_df = explode(ctrials_df, 's1', preserve_index=True).reset_index()
display(ctrials_df.shape)
display(ctrials_df.head())


In [None]:
# Split each intervention at its first colon into a type part (before) and a
# name part (after); both are cast to text and trimmed.
# NOTE(review): an entry without a colon has no second part, so the string
# cast presumably stores a placeholder such as 'None' as its name - confirm.
new = ctrials_df["s1"].str.split(":", n=1, expand=True)

ctrials_df["iType"] = new[0].astype(str).str.strip()
ctrials_df["iName"] = new[1].astype(str).str.strip()


In [None]:
# The split result is no longer needed.
del new

# The raw text and its list form are superseded by iType / iName.
ctrials_df.drop(['Interventions', 's1'], axis=1, inplace=True)


In [None]:
# Row count per intervention type. Only some types should hold small-molecule
# drugs, but the classification is not consistently accurate, so every type
# is used when searching for matches.
display(ctrials_df.groupby(['iType'])['iType'].count().to_frame())


In [None]:
# consider adding index/id of clinical trials input as a column of arrays

# keep only rows with intervention type of "Drug"
ctrials_drugs_df = ctrials_df[ctrials_df['iType'] == 'Drug']

# check results
display('ctrials_drugs_df shape (rows, columns)', ctrials_drugs_df.shape)
print()

# get input row IDs for each iType + iName
# ('index' is the column created by reset_index() after the explode step).
# SeriesGroupBy.unique() gives one array of distinct values per group without
# returning a non-scalar from a custom aggregation lambda, which newer pandas
# versions may reject.
df2 = (ctrials_drugs_df.groupby(['iType', 'iName'])['index']
       .unique()
       .to_frame('inputRowIDs'))

# count the rows for each iType + iName
Drug_iNames_df = (ctrials_drugs_df.groupby(['iType', 'iName'])['iType']
                  .count()
                  .to_frame('inputRowCount'))
# both frames share the (iType, iName) index, so this aligns group by group
Drug_iNames_df['inputRowIDs'] = df2['inputRowIDs']
Drug_iNames_df.reset_index(inplace=True)
display("drug_iNames_df has one row per intervention name plus descriptive columns:", Drug_iNames_df.shape)
display(Drug_iNames_df[15:20])

# cleanup memory
del df2


In [None]:
# repeat processing above for ALL intervention types
# (no filtering on iType here: every exploded row is kept; note this is the
# same object as ctrials_df, not a copy)
allctrials_drugs_df = ctrials_df

# check results
display('ctrials_drugs_df shape (rows, columns)', allctrials_drugs_df.shape)
print()

# get input row IDs for each iName
# ('index' is the column created by reset_index() after the explode step).
# SeriesGroupBy.unique() gives one array of distinct values per group without
# returning a non-scalar from a custom aggregation lambda, which newer pandas
# versions may reject.
df2 = (allctrials_drugs_df.groupby(['iName'])['index']
       .unique()
       .to_frame('inputRowIDs'))

# count the rows for each iName
allDrug_iNames_df = (allctrials_drugs_df.groupby(['iName'])['iName']
                     .count()
                     .to_frame('inputRowCount'))
# both frames share the iName index, so this aligns group by group
allDrug_iNames_df['inputRowIDs'] = df2['inputRowIDs']
allDrug_iNames_df.reset_index(inplace=True)
display("drug_iNames_df has one row per intervention name plus descriptive columns:", allDrug_iNames_df.shape)
display(allDrug_iNames_df[15:20])

# cleanup memory
del df2


# Load synonyms from DrugBank

In [None]:
# Load the DrugBank alias table from JSON and spot-check a couple of entries.

import json

# The context manager closes the file even if parsing raises; JSON text is
# read as UTF-8 explicitly rather than with the platform's default encoding.
with open(DBPath + 'DrugBank_CSVs/aliases.json', encoding='utf-8') as fObj:
    DBAlias = json.load(fObj)

print(DBAlias.__class__)
display(DBAlias['DB12466'])
display(DBAlias["DB15327"])
#display(DBAlias["DB00898"]) # prednisone



# match intervention Names against Drug Bank to identify DB-IDs

In [None]:
# search dict in DBAlias format to return ID if element of an array matches
def searchDBAlias(byVal, aliases=None):
    """Return the keys of the alias table whose value contains ``byVal``.

    Parameters
    ----------
    byVal : object
        Value to look for (the callers in this notebook pass an intervention
        name).
    aliases : mapping, optional
        Table to search. Defaults to the module-level ``DBAlias``.
        Presumably maps a DrugBank ID to a list of alias strings - confirm
        against aliases.json.

    Returns
    -------
    list
        Every key whose value satisfies ``byVal in value``, in the table's
        iteration order; empty when nothing matches.

    Notes
    -----
    The test is Python's ``in`` operator: for list values it is an exact
    element match, for string values it would be a substring match. Each call
    scans the whole table.
    """
    if aliases is None:
        aliases = DBAlias
    return [db_id for db_id, names in aliases.items() if byVal in names]

In [None]:
# Look up every "Drug" intervention name in the DrugBank alias table and time
# the pass (each lookup scans the whole table; optimise only if it matters).
start = time.time()
Drug_iNames_df['iDBID'] = Drug_iNames_df['iName'].map(searchDBAlias)
lapse = time.time() - start
print("lapse time to match: ", lapse)


In [None]:
# Number of DrugBank IDs matched per intervention name. Taking the length of
# each list directly avoids inferring it from quote characters in the list's
# string representation. Kept as float to match the values produced before.
Drug_iNames_df['iDBIDCount'] = Drug_iNames_df['iDBID'].map(len).astype(float)
display("Table of DrugBank matches found by intervention name", Drug_iNames_df['iDBIDCount'].value_counts())

#display(Drug_iNames_df['iDBID'].__class__)
#display(Drug_iNames_df['iDBID'].astype('str').__class__)
#display(Drug_iNames_df['iDBID'].astype('str')[250])
#display(Drug_iNames_df['iDBID'].astype('str')[400:410])


In [None]:
# Names that are ambiguous: they matched exactly two DrugBank entries.
display("intervention names matching 2 Drug Bank entries", Drug_iNames_df.loc[Drug_iNames_df['iDBIDCount'].eq(2)])


In [None]:
# First few names that matched exactly one DrugBank entry.
display("sample of intervention names matching 1 Drug Bank entries", Drug_iNames_df.loc[Drug_iNames_df['iDBIDCount'].eq(1)].head())


In [None]:
# Same lookup as for the "Drug" rows, now over the names of ALL intervention
# types, to see whether drugs known to DrugBank were filed under another type.
start = time.time()
allDrug_iNames_df['iDBID'] = allDrug_iNames_df['iName'].map(searchDBAlias)
lapse = time.time() - start
print("lapse time to match: ", lapse)



In [None]:

# Number of DrugBank IDs matched per intervention name. Taking the length of
# each list directly avoids inferring it from quote characters in the list's
# string representation. Kept as float to match the values produced before.
allDrug_iNames_df['iDBIDCount'] = allDrug_iNames_df['iDBID'].map(len).astype(float)
display(allDrug_iNames_df['iDBIDCount'].value_counts())



Including non-drug intervention types increases the number of matches to DrugBank,
so ignore the intervention type and move forward with ALL records.

# create exception lists for review 
## to inform text cleanup below

In [None]:
# Preview the intervention names that matched at least one DrugBank entry.
allDrug_iNames_df.loc[allDrug_iNames_df['iDBIDCount'].gt(0)].head()

# Write out ClinicalTrialsTruth.tsv file (currently written as ProposedTruth.csv)

DBID, iType, iName (most common? or list ?), inputRows, status, study_results, (parse phase?)

In [None]:
# Start from the intervention names that matched at least one DrugBank ID.
ProposedTruth_df = allDrug_iNames_df.loc[allDrug_iNames_df['iDBIDCount'] > 0].copy()
display(ProposedTruth_df.head())
display(type(ProposedTruth_df['iDBID']))
display(ProposedTruth_df['iDBID'].shape)
display(ProposedTruth_df['iDBID'].iloc[1])
display(type(ProposedTruth_df['iDBID'].iloc[1]))

# Expand to one row per intervention name / DrugBank ID pair: spread each ID
# list across columns, stack those columns into rows, and drop the positional
# level so the result is keyed by the original row index only.
s = (ProposedTruth_df
     .apply(lambda row: pd.Series(row['iDBID']), axis=1)
     .stack()
     .reset_index(level=1, drop=True)
     .rename('iDBID'))

# Swap the list column for the flattened IDs (joined on the row index).
ProposedTruth_df = ProposedTruth_df.drop(columns='iDBID').join(s)
display(ProposedTruth_df.iloc[10:30])

In [None]:
# Persist the proposed truth table. The relative file name is resolved
# against the current working directory (changed to CTPath by os.chdir in the
# path setup cell).
ProposedTruth_df.to_csv(path_or_buf="ProposedTruth.csv")


# Concept for Future Development

## apply manual override file (for when just typing in the change is easier)

## NOTE: any manual overrides should be entered into a .csv file and treated as data


In [None]:
# add an additional file to INSERT complicated records that contain more than one drug
# 541	Drug	Hydroxychloroquine, Clindamycin, Primaquine - ...


In [None]:
# Load the manual name-override file; despite the .csv extension it is read
# as tab-separated.
iName_Overrides_df = pd.read_csv(CTPath + 'ctrials_iname_overrides.csv', sep='\t')
display(iName_Overrides_df.head())


In [None]:
# apply exception override file
# Left-join the overrides onto the Drug intervention names; the '_merge'
# indicator records which names had an override row.
Drug_iNamesNew_df = pd.merge(Drug_iNames_df, iName_Overrides_df, how='left', on='iName', indicator=True)
display(Drug_iNamesNew_df['_merge'].value_counts())

# Keep the original spelling under a new label.
# NOTE(review): the replacement name is presumably supplied by the override
# file as 'iNameNew' (a later cell reads that column) - confirm the schema.
Drug_iNamesNew_df.rename(columns={'iName': 'iNameOld'}, inplace=True)
# Keep only names that have an override. The explicit copy makes this an
# independent frame, so the column assignments in the following cells do not
# operate on a slice of the merged frame (SettingWithCopyWarning).
Drug_iNamesNew_df = Drug_iNamesNew_df[Drug_iNamesNew_df['_merge'] == 'both'].copy()
display(Drug_iNamesNew_df.head())


### now match cleaned text into drug-bank

In [None]:
# Repeat the DrugBank lookup, this time on the manually overridden names
# (column 'iNameNew'), and time the pass.
start = time.time()
Drug_iNamesNew_df['iDBID'] = Drug_iNamesNew_df['iNameNew'].map(searchDBAlias)
lapse = time.time() - start
print("lapse time to match: ", lapse)



In [None]:

# Number of DrugBank IDs matched per overridden name. Taking the length of
# each list directly avoids inferring it from quote characters in the list's
# string representation. Kept as float to match the values produced before.
Drug_iNamesNew_df['iDBIDCount'] = Drug_iNamesNew_df['iDBID'].map(len).astype(float)
display(Drug_iNamesNew_df['iDBIDCount'].value_counts())



In [None]:
# Preview the overridden names together with their DrugBank matches.
display(Drug_iNamesNew_df.head(n=5))