In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

### Process data for clinical trial drugs

## Clinical trial (CT) files

In [8]:
# Read the four clinical-trial (CT) drug annotation tables, one CSV per
# annotation type, in the order: MOA, target, indication, pathway.
(
    df_drug_moa_ct,
    df_drug_target_ct,
    df_drug_indication_ct,
    df_drug_pathway_ct,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        'drug-action-CT-files/Drug-MOA-CT.csv',
        'drug-action-CT-files/Drug-Target-CT.csv',
        'drug-action-CT-files/Drug-Indication-CT.csv',
        'drug-action-CT-files/Drug-Pathway-CT.csv',
    )
)

In [4]:
#Expanding the multi-valued second column of the dataset into one row per value.
#e.g. when the MOA column holds more than one value for a drug, each value gets its own row and the drug name is repeated.

def explode(df, lst_cols, fill_value='', preserve_index=False):
    """Expand list-valued column(s) so that each list element gets its own row.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    lst_cols : str or list-like of str
        Name(s) of the column(s) holding list-like values. When several are
        given, the per-row lengths are taken from the first one, so the other
        columns must hold lists of matching lengths.
    fill_value : scalar, default ''
        Value used to fill the gaps left in rows whose list is empty or
        missing (applied with ``fillna`` to the whole result).
    preserve_index : bool, default False
        If True, every output row keeps the index label of the row it came
        from; otherwise the result gets a fresh 0..n-1 index.

    Returns
    -------
    pandas.DataFrame
        The non-list columns (in the sorted order produced by
        ``Index.difference``) followed by the exploded list column(s).
    """
    # Accept a single column name as well as a collection of names.
    if (lst_cols is not None
            and len(lst_cols) > 0
            and not isinstance(lst_cols, (list, tuple, np.ndarray, pd.Series))):
        lst_cols = [lst_cols]
    idx_cols = df.columns.difference(lst_cols)

    # Elements per row. A missing cell yields NaN here, which would make the
    # counts float and break np.repeat, so it is treated as an empty list.
    lens = df[lst_cols[0]].str.len().fillna(0).astype(int)
    has_items = lens > 0

    # Repeat the index label and the scalar columns once per list element.
    idx = np.repeat(df.index.values, lens)
    repeated = {col: np.repeat(df[col].values, lens) for col in idx_cols}
    # np.concatenate raises on an empty sequence, so fall back to an empty
    # array when no row has any element.
    flattened = {
        col: (np.concatenate(df.loc[has_items, col].values)
              if has_items.any() else np.array([], dtype=object))
        for col in lst_cols
    }
    res = pd.DataFrame(repeated, index=idx).assign(**flattened)

    # Rows with an empty list were dropped by np.repeat; add them back with
    # fill_value in the list column(s). pd.concat replaces DataFrame.append,
    # which no longer exists in pandas 2.x.
    if (~has_items).any():
        res = pd.concat([res, df.loc[~has_items, idx_cols]], sort=False).fillna(fill_value)

    # A stable sort keeps the elements of one source row in their list order.
    res = res.sort_index(kind='mergesort')
    if not preserve_index:
        res = res.reset_index(drop=True)
    return res

In [26]:
# Remove rows that contain any missing value.
df_drug_moa_ct = df_drug_moa_ct.dropna()
cols = df_drug_moa_ct.columns
# Lowercase the text in the first two columns.
df_drug_moa_ct = df_drug_moa_ct.assign(**{
    text_col: df_drug_moa_ct[text_col].map(lambda text: text.lower())
    for text_col in (cols[0], cols[1])
})
# Turn the comma-separated MOA string into a list, then give every MOA its
# own row (the other columns are repeated).
# NOTE(review): split(',') does not strip whitespace around the pieces -
# confirm the source file has no spaces after its commas.
df_drug_moa_mod = explode(
    df_drug_moa_ct.assign(MOA=df_drug_moa_ct['MOA'].str.split(',')),
    'MOA',
)
df_drug_moa_mod.head()

Unnamed: 0,Name,MOA
0,enalaprilat (dihydrate),angiotensin converting enzyme inhibitor
1,sertraline (hydrochloride),selective serotonin reuptake inhibitor (ssri)
2,clindamycin (hydrochloride),protein synthesis inhibitor
3,ketotifen (fumarate),histamine receptor agonist
4,ketotifen (fumarate),leukotriene receptor antagonist


In [27]:
# Remove rows that contain any missing value.
df_drug_indication_ct = df_drug_indication_ct.dropna()
# Drop rows whose indication is the literal string 'NA'. The explicit .copy()
# makes the filtered frame independent of df_drug_indication_ct, so the column
# assignments below no longer trigger SettingWithCopyWarning.
df_drug_indication_mod = df_drug_indication_ct[df_drug_indication_ct.Indication != 'NA'].copy()
cols = df_drug_indication_mod.columns
# Lowercase the text in the first two columns.
df_drug_indication_mod[cols[0]] = df_drug_indication_mod[cols[0]].str.lower()
df_drug_indication_mod[cols[1]] = df_drug_indication_mod[cols[1]].str.lower()
# Split the comma-separated indications into a list and give each one its own
# row (the other columns are repeated).
df_drug_indication_mod = explode(
    df_drug_indication_mod.assign(Indication=df_drug_indication_mod.Indication.str.split(',')),
    'Indication',
)

In [6]:
# Remove rows that contain any missing value.
df_drug_pathway_ct = df_drug_pathway_ct.dropna()
# Drop rows whose pathway is the placeholder string 'NA' or 'No'. Both
# placeholders are tested against the same frame that is being filtered
# (the earlier second filter built its mask from df_drug_pathway_ct while
# indexing the already-filtered frame, relying on implicit reindexing).
# The .copy() makes the result independent of df_drug_pathway_ct, so the
# column assignments below do not trigger SettingWithCopyWarning.
df_drug_pathway_mod = df_drug_pathway_ct[~df_drug_pathway_ct.Pathway.isin(['NA', 'No'])].copy()
cols = df_drug_pathway_mod.columns
# Lowercase the text in the first two columns.
df_drug_pathway_mod[cols[0]] = df_drug_pathway_mod[cols[0]].str.lower()
df_drug_pathway_mod[cols[1]] = df_drug_pathway_mod[cols[1]].str.lower()
# Split the comma-separated pathways into a list and give each one its own
# row (the other columns are repeated).
df_drug_pathway_mod = explode(
    df_drug_pathway_mod.assign(Pathway=df_drug_pathway_mod.Pathway.str.split(',')),
    'Pathway',
)

In [28]:
# Remove rows that contain any missing value.
df_drug_target_ct = df_drug_target_ct.dropna()
# Drop rows whose target is the literal string 'NA'. The explicit .copy()
# makes the filtered frame independent of df_drug_target_ct, so the column
# assignment below no longer triggers SettingWithCopyWarning.
df_drug_target_mod = df_drug_target_ct[df_drug_target_ct.Target != 'NA'].copy()
cols = df_drug_target_mod.columns
# Only the first column is lowercased; the Target values keep their case.
# NOTE(review): presumably deliberate (the other tables lowercase both
# columns) - confirm.
df_drug_target_mod[cols[0]] = df_drug_target_mod[cols[0]].str.lower()
# Split the comma-separated targets into a list and give each one its own row
# (the other columns are repeated).
df_drug_target_mod = explode(
    df_drug_target_mod.assign(Target=df_drug_target_mod.Target.str.split(',')),
    'Target',
)

### Combine with drug downselection data

In [9]:
# Read the four downselected drug annotation tables, one CSV per annotation
# type, in the order: MOA, target, indication, pathway.
(
    df_drug_moa_ds,
    df_drug_target_ds,
    df_drug_indication_ds,
    df_drug_pathway_ds,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        'drug-action-downselected/drug-MOA.csv',
        'drug-action-downselected/drug-target.csv',
        'drug-action-downselected/drug-indication.csv',
        'drug-action-downselected/drug-pathway.csv',
    )
)

In [12]:
# Stack each downselected table on top of the matching exploded
# clinical-trial table (downselected rows first).
# NOTE(review): pd.concat aligns on column names - confirm that the
# downselected files use the same headers as the clinical-trial tables.
df_drug_moa, df_drug_target, df_drug_indication, df_drug_pathway = (
    pd.concat([downselected, clinical_trial])
    for downselected, clinical_trial in (
        (df_drug_moa_ds, df_drug_moa_mod),
        (df_drug_target_ds, df_drug_target_mod),
        (df_drug_indication_ds, df_drug_indication_mod),
        (df_drug_pathway_ds, df_drug_pathway_mod),
    )
)


### Convert to 0-1 relationship matrices 

In [37]:
cols = df_drug_moa.columns
# pd.crosstab counts occurrences, so a pair of first/second column values that
# appears more than once in df_drug_moa would give a value above 1. Clip to 1
# so the result is the 0-1 relationship matrix this section describes.
df_moa = pd.crosstab(df_drug_moa[cols[0]], df_drug_moa[cols[1]]).clip(upper=1)

In [30]:
cols = df_drug_target.columns
# pd.crosstab counts occurrences, so a pair of first/second column values that
# appears more than once in df_drug_target would give a value above 1. Clip to
# 1 so the result is the 0-1 relationship matrix this section describes.
df_target = pd.crosstab(df_drug_target[cols[0]], df_drug_target[cols[1]]).clip(upper=1)

In [31]:
cols = df_drug_indication.columns
# pd.crosstab counts occurrences, so a pair of first/second column values that
# appears more than once in df_drug_indication would give a value above 1.
# Clip to 1 so the result is the 0-1 relationship matrix this section describes.
df_indication = pd.crosstab(df_drug_indication[cols[0]], df_drug_indication[cols[1]]).clip(upper=1)

In [32]:
cols = df_drug_pathway.columns
# pd.crosstab counts occurrences, so a pair of first/second column values that
# appears more than once in df_drug_pathway would give a value above 1. Clip
# to 1 so the result is the 0-1 relationship matrix this section describes.
df_pathway = pd.crosstab(df_drug_pathway[cols[0]], df_drug_pathway[cols[1]]).clip(upper=1)

### Save to csv file

In [33]:
# Write each crosstab to its own CSV file. Every table has one row per value
# of the first column (the drug) and one column per annotation value (e.g. an
# MOA); a cell is nonzero when the drug carries that annotation and 0
# otherwise.

for relationship_table, csv_name in (
    (df_moa, 'df_moa_relationship.csv'),
    (df_target, 'df_target_relationship.csv'),
    (df_indication, 'df_indication_relationship.csv'),
    (df_pathway, 'df_pathway_relationship.csv'),
):
    relationship_table.to_csv(csv_name)

### Preprocess numerical features

In [41]:
# Load the processed per-drug feature table and display its column count
# (64 in the recorded run).
# NOTE(review): the variable name says "textual" while the section header says
# "numerical" - confirm which kind of features this file holds.
textual_features_file = pd.read_csv("all_drug_data_processed.csv")
textual_features_file.shape[1]

64