# **Computational Drug Discovery Pipelines-**
## **Data Collection and Pre-processing**

## **Installing libraries**

Install the ChEMBL web service package so that we can retrieve bioactivity data from the ChEMBL Database.

In [1]:
! pip install chembl_webresource_client








## **Importing libraries**

In [2]:
# Import necessary libraries
import pandas as pd
from chembl_webresource_client.new_client import new_client

## **Search for Target protein**

### **Target search for GABA**

In [3]:
# Target search for coronavirus
target = new_client.target
target_query = target.search('Dopamine active transporter')
targets = pd.DataFrame.from_dict(target_query)
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,"[{'xref_id': 'Q01959', 'xref_name': None, 'xre...",Homo sapiens,Dopamine transporter,19.0,False,CHEMBL238,"[{'accession': 'Q01959', 'component_descriptio...",SINGLE PROTEIN,9606.0
1,"[{'xref_id': 'Q61327', 'xref_name': None, 'xre...",Mus musculus,Dopamine transporter,19.0,False,CHEMBL2799,"[{'accession': 'Q61327', 'component_descriptio...",SINGLE PROTEIN,10090.0
2,"[{'xref_id': 'P23977', 'xref_name': None, 'xre...",Rattus norvegicus,Dopamine transporter,19.0,False,CHEMBL338,"[{'accession': 'P23977', 'component_descriptio...",SINGLE PROTEIN,10116.0
3,"[{'xref_id': 'P27922', 'xref_name': None, 'xre...",Bos taurus,Dopamine transporter,19.0,False,CHEMBL2986,"[{'accession': 'P27922', 'component_descriptio...",SINGLE PROTEIN,9913.0
4,"[{'xref_id': 'Q9GJT6', 'xref_name': None, 'xre...",Macaca fascicularis,Dopamine transporter,19.0,False,CHEMBL5032,"[{'accession': 'Q9GJT6', 'component_descriptio...",SINGLE PROTEIN,9541.0
...,...,...,...,...,...,...,...,...,...
930,[],Rattus norvegicus,Adenylate cyclase,0.0,False,CHEMBL2095179,"[{'accession': 'D4A3N4', 'component_descriptio...",PROTEIN FAMILY,10116.0
931,[],Homo sapiens,Voltage-gated calcium channel,0.0,False,CHEMBL2363032,"[{'accession': 'O95180', 'component_descriptio...",PROTEIN COMPLEX GROUP,9606.0
932,[],Homo sapiens,Retinoid receptor,0.0,False,CHEMBL2363071,"[{'accession': 'P28702', 'component_descriptio...",PROTEIN FAMILY,9606.0
933,[],Homo sapiens,Cyclin-dependent kinase,0.0,False,CHEMBL3559691,"[{'accession': 'P06493', 'component_descriptio...",PROTEIN FAMILY,9606.0


In [4]:
targets.to_csv('DAT_targets_search_results.csv')

### **Select and retrieve bioactivity data for *Dopamine transporter* (zeroth entry)**

We will assign the first entry (which corresponds to the target protein, *Dopamine transporter* to the ***selected_target*** variable 

In [5]:
selected_target = targets.target_chembl_id[0]
selected_target

'CHEMBL238'

Here, we will retrieve only bioactivity data for *Dopamine transporter* (CHEMBL238) that are reported as IC50 values in nM (nanomolar) unit.

In [6]:
activity = new_client.activity
res = activity.filter(target_chembl_id=selected_target).filter(standard_type="IC50")

In [7]:
df = pd.DataFrame.from_dict(res)

In [8]:
df

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,53425,[],CHEMBL671771,Inhibitory activity against [3H]-Dopamine upta...,B,,,BAO_0000190,BAO_0000221,...,Homo sapiens,Dopamine transporter,9606,,,IC50,nM,UO_0000065,,373.0
1,,70053,[],CHEMBL671771,Inhibitory activity against [3H]-Dopamine upta...,B,,,BAO_0000190,BAO_0000221,...,Homo sapiens,Dopamine transporter,9606,,,IC50,nM,UO_0000065,,405.0
2,,71269,[],CHEMBL671771,Inhibitory activity against [3H]-Dopamine upta...,B,,,BAO_0000190,BAO_0000221,...,Homo sapiens,Dopamine transporter,9606,,,IC50,nM,UO_0000065,,3890.0
3,,75844,[],CHEMBL871936,Antagonism of cocaine''s inhibition of [3H]DA ...,F,,,BAO_0000190,BAO_0000019,...,Homo sapiens,Dopamine transporter,9606,,,IC50,nM,UO_0000065,,470.0
4,,75845,[],CHEMBL674122,Antagonism of cocaine''s inhibition of [3H]DA ...,F,,,BAO_0000190,BAO_0000019,...,Homo sapiens,Dopamine transporter,9606,,,IC50,nM,UO_0000065,,717.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3363,Active,20723981,[],CHEMBL4510296,Dopamine-uptake assay,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Dopamine transporter,9606,,,IC50,nM,UO_0000065,,536.0
3364,,20745381,[],CHEMBL4508895,"Transporter, Dopamine (DAT) Eurofins-Panlabs r...",B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Dopamine transporter,9606,,,IC50,µM,,,1.2
3365,,20753778,[],CHEMBL4508895,"Transporter, Dopamine (DAT) Eurofins-Panlabs r...",B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Dopamine transporter,9606,,,IC50,µM,,,2.31
3366,,20760434,[],CHEMBL4508895,"Transporter, Dopamine (DAT) Eurofins-Panlabs r...",B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Dopamine transporter,9606,,,IC50,µM,,,2.17


In [9]:
df.standard_type.unique()

array(['IC50'], dtype=object)

Finally we will save the resulting bioactivity data to a CSV file **bioactivity_data.csv**.

In [10]:
df.to_csv('DAT-bioactivity_data_04082021.csv', index=False)

## **For copying files to Google Drive**

Firstly, we need to mount the Google Drive into Colab so that we can have access to our Google adrive from within Colab.

In [11]:
#from google.colab import drive
#drive.mount('/content/gdrive/', force_remount=True)


Next, we create a **data** folder in our **Colab Notebooks** folder on Google Drive.

In [12]:
#! mkdir "/content/gdrive/My Drive/Colab Notebooks/Bioinformatics Project/Drug Discovery/data"

In [13]:
#! cp bioactivity_data.csv "/content/gdrive/My Drive/Colab Notebooks/Bioinformatics Project/Drug Discovery/data"

In [14]:
#! ls -l "/content/gdrive/My Drive/Colab Notebooks/Bioinformatics Project/Drug Discovery/data"

Let's see the CSV files that we have so far.

In [15]:
#! ls

Taking a glimpse of the **bioactivity_data.csv** file that we've just created.

In [16]:
! DAT-bioactivity_data_04082021.csv

## **Handling missing data**
If any compounds has missing value for the **standard_value** column then drop it

In [17]:
df.shape

(3368, 45)

In [18]:
df2 = df[df.standard_value.notna()]
df2

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,53425,[],CHEMBL671771,Inhibitory activity against [3H]-Dopamine upta...,B,,,BAO_0000190,BAO_0000221,...,Homo sapiens,Dopamine transporter,9606,,,IC50,nM,UO_0000065,,373.0
1,,70053,[],CHEMBL671771,Inhibitory activity against [3H]-Dopamine upta...,B,,,BAO_0000190,BAO_0000221,...,Homo sapiens,Dopamine transporter,9606,,,IC50,nM,UO_0000065,,405.0
2,,71269,[],CHEMBL671771,Inhibitory activity against [3H]-Dopamine upta...,B,,,BAO_0000190,BAO_0000221,...,Homo sapiens,Dopamine transporter,9606,,,IC50,nM,UO_0000065,,3890.0
3,,75844,[],CHEMBL871936,Antagonism of cocaine''s inhibition of [3H]DA ...,F,,,BAO_0000190,BAO_0000019,...,Homo sapiens,Dopamine transporter,9606,,,IC50,nM,UO_0000065,,470.0
4,,75845,[],CHEMBL674122,Antagonism of cocaine''s inhibition of [3H]DA ...,F,,,BAO_0000190,BAO_0000019,...,Homo sapiens,Dopamine transporter,9606,,,IC50,nM,UO_0000065,,717.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3363,Active,20723981,[],CHEMBL4510296,Dopamine-uptake assay,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Dopamine transporter,9606,,,IC50,nM,UO_0000065,,536.0
3364,,20745381,[],CHEMBL4508895,"Transporter, Dopamine (DAT) Eurofins-Panlabs r...",B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Dopamine transporter,9606,,,IC50,µM,,,1.2
3365,,20753778,[],CHEMBL4508895,"Transporter, Dopamine (DAT) Eurofins-Panlabs r...",B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Dopamine transporter,9606,,,IC50,µM,,,2.31
3366,,20760434,[],CHEMBL4508895,"Transporter, Dopamine (DAT) Eurofins-Panlabs r...",B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Dopamine transporter,9606,,,IC50,µM,,,2.17


Apparently, for this dataset there is no missing data. But we can use the above code cell for bioactivity data of other target protein.

## **Data pre-processing of the bioactivity data**

### **Labeling compounds as either being active, inactive or intermediate**
The bioactivity data is in the IC50 unit. Compounds having values of less than 1000 nM will be considered to be **active** while those greater than 10,000 nM will be considered to be **inactive**. As for those values in between 1,000 and 10,000 nM will be referred to as **intermediate**. 

In [19]:
bioactivity_class = []
for i in df2.standard_value:
  if float(i) >= 10000:
    bioactivity_class.append("inactive")
  elif float(i) <= 1000:
    bioactivity_class.append("active")
  else:
    bioactivity_class.append("intermediate")

### **Iterate the *molecule_chembl_id* to a list**

In [20]:
df2.molecule_chembl_id

0       CHEMBL1790051
1        CHEMBL370805
2        CHEMBL611999
3        CHEMBL120633
4        CHEMBL120633
            ...      
3363    CHEMBL3609637
3364    CHEMBL4514203
3365    CHEMBL1800685
3366    CHEMBL4521594
3367     CHEMBL561132
Name: molecule_chembl_id, Length: 2451, dtype: object

In [21]:
mol_cid = []
for i in df2.molecule_chembl_id:
  mol_cid.append(i)

In [22]:
#mol_cid

### **Iterate *canonical_smiles* to a list**

In [23]:
canonical_smiles = []
for i in df2.canonical_smiles:
  canonical_smiles.append(i)

In [24]:
#canonical_smiles

### **Iterate *standard_value* to a list**

In [25]:
standard_value = []
for i in df2.standard_value:
  standard_value.append(i)

In [26]:
#standard_value

### **Combine the 3 columns into a dataframe**

In [27]:
#data_tuples = list(zip(mol_cid, canonical_smiles, bioactivity_class, standard_value))
#df3_alt = pd.DataFrame( data_tuples,  columns=['molecule_chembl_id', 'canonical_smiles', 'bioactivity_class', 'standard_value'])

In [28]:
#df3_alt

### **Alternative Method**

In [29]:
selection = ['molecule_chembl_id', 'canonical_smiles', 'standard_value']
df3 = df2[selection]

In [38]:
df3

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value
0,CHEMBL1790051,COC(=O)[C@@H]1C2CC[C@H](C[C@@H]1c1ccccc1)N2C.Cl,373.0
1,CHEMBL370805,COC(=O)[C@H]1[C@@H](OC(=O)c2ccccc2)C[C@@H]2CC[...,405.0
2,CHEMBL611999,,3890.0
3,CHEMBL120633,COC(=O)[C@@H]1C2CCC(C[C@@H]1C(=O)Oc1ccccc1)N2C...,470.0
4,CHEMBL120633,COC(=O)[C@@H]1C2CCC(C[C@@H]1C(=O)Oc1ccccc1)N2C...,717.0
...,...,...,...
3363,CHEMBL3609637,COc1cc(-c2cn(C3CCc4c(F)cccc4N(CC(F)(F)F)C3=O)n...,536.0
3364,CHEMBL4514203,O[C@]1(C(F)(F)F)CCCC[C@H]1Nc1ccc(F)cc1,1.2
3365,CHEMBL1800685,O=C(O)c1cc(-c2ccc(C3CCNCC3)cc2)c2ccc(-c3ccc(C(...,2.31
3366,CHEMBL4521594,NS(=O)(=O)c1cc(NC(=O)Cc2ccccc2)ccc1Oc1cccc(Cl)c1,2.17


Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value
0,CHEMBL1790051,COC(=O)[C@@H]1C2CC[C@H](C[C@@H]1c1ccccc1)N2C.Cl,373.0
1,CHEMBL370805,COC(=O)[C@H]1[C@@H](OC(=O)c2ccccc2)C[C@@H]2CC[...,405.0
3,CHEMBL120633,COC(=O)[C@@H]1C2CCC(C[C@@H]1C(=O)Oc1ccccc1)N2C...,470.0
4,CHEMBL120633,COC(=O)[C@@H]1C2CCC(C[C@@H]1C(=O)Oc1ccccc1)N2C...,717.0
5,CHEMBL120633,COC(=O)[C@@H]1C2CCC(C[C@@H]1C(=O)Oc1ccccc1)N2C...,1161.0
...,...,...,...
3363,CHEMBL3609637,COc1cc(-c2cn(C3CCc4c(F)cccc4N(CC(F)(F)F)C3=O)n...,536.0
3364,CHEMBL4514203,O[C@]1(C(F)(F)F)CCCC[C@H]1Nc1ccc(F)cc1,1.2
3365,CHEMBL1800685,O=C(O)c1cc(-c2ccc(C3CCNCC3)cc2)c2ccc(-c3ccc(C(...,2.31
3366,CHEMBL4521594,NS(=O)(=O)c1cc(NC(=O)Cc2ccccc2)ccc1Oc1cccc(Cl)c1,2.17


In [41]:
bioactivity_class= pd.Series(bioactivity_class, name = 'bioactivity_class')
df4 = pd.concat([df3, bioactivity_class], axis = 1)
df4

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,bioactivity_class
0,CHEMBL1790051,COC(=O)[C@@H]1C2CC[C@H](C[C@@H]1c1ccccc1)N2C.Cl,373.0,active
1,CHEMBL370805,COC(=O)[C@H]1[C@@H](OC(=O)c2ccccc2)C[C@@H]2CC[...,405.0,active
2,CHEMBL611999,,3890.0,intermediate
3,CHEMBL120633,COC(=O)[C@@H]1C2CCC(C[C@@H]1C(=O)Oc1ccccc1)N2C...,470.0,active
4,CHEMBL120633,COC(=O)[C@@H]1C2CCC(C[C@@H]1C(=O)Oc1ccccc1)N2C...,717.0,active
...,...,...,...,...
3363,CHEMBL3609637,COc1cc(-c2cn(C3CCc4c(F)cccc4N(CC(F)(F)F)C3=O)n...,536.0,
3364,CHEMBL4514203,O[C@]1(C(F)(F)F)CCCC[C@H]1Nc1ccc(F)cc1,1.2,
3365,CHEMBL1800685,O=C(O)c1cc(-c2ccc(C3CCNCC3)cc2)c2ccc(-c3ccc(C(...,2.31,
3366,CHEMBL4521594,NS(=O)(=O)c1cc(NC(=O)Cc2ccccc2)ccc1Oc1cccc(Cl)c1,2.17,


In [45]:
df5=df4.dropna()

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,bioactivity_class
0,CHEMBL1790051,COC(=O)[C@@H]1C2CC[C@H](C[C@@H]1c1ccccc1)N2C.Cl,373.0,active
1,CHEMBL370805,COC(=O)[C@H]1[C@@H](OC(=O)c2ccccc2)C[C@@H]2CC[...,405.0,active
3,CHEMBL120633,COC(=O)[C@@H]1C2CCC(C[C@@H]1C(=O)Oc1ccccc1)N2C...,470.0,active
4,CHEMBL120633,COC(=O)[C@@H]1C2CCC(C[C@@H]1C(=O)Oc1ccccc1)N2C...,717.0,active
5,CHEMBL120633,COC(=O)[C@@H]1C2CCC(C[C@@H]1C(=O)Oc1ccccc1)N2C...,1161.0,intermediate
...,...,...,...,...
2413,CHEMBL91,Clc1ccc(COC(Cn2ccnc2)c2ccc(Cl)cc2Cl)c(Cl)c1,1701.0,inactive
2414,CHEMBL1373,NC(=O)C[S+]([O-])C(c1ccccc1)c1ccccc1,1832.0,intermediate
2419,CHEMBL19215,CN1C[C@H](CNC(=O)OCc2ccccc2)C[C@@H]2c3cccc4c3c...,63.0,inactive
2424,CHEMBL1200633,CC[C@H](C)[C@H]1O[C@]2(CC[C@@H]1C)C[C@@H]1C[C@...,4455.2,intermediate


Saves dataframe to CSV file

In [46]:
df5.to_csv('DAT-bioactivity_preprocessed_data_04082021.csv', index=False)

In [34]:
#! ls -l

For copying to the Google Drive

In [35]:
#! cp bioactivity_preprocessed_data.csv "/content/gdrive/My Drive/Colab Notebooks/Bioinformatics Project/Drug Discovery/data"

In [36]:
#! ls "/content/gdrive/My Drive/Colab Notebooks/Bioinformatics Project/Drug Discovery/data"

---