### **Install ChEMBL client for getting the dataset**

#### **https://www.ebi.ac.uk/chembl/**

In [None]:
!pip install chembl_webresource_client

### **Import Libraries**

In [3]:
import pandas as pd
from chembl_webresource_client.new_client import new_client

### **Find Coronavirus Dataset**

#### **Search Target**

In [6]:
target = new_client.target
target_query = target.search ('coronavirus')
targets = pd.DataFrame.from_dict (target_query)
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,[],Coronavirus,Coronavirus,17.0,False,CHEMBL613732,[],ORGANISM,11119
1,[],SARS coronavirus,SARS coronavirus,15.0,False,CHEMBL612575,[],ORGANISM,227859
2,[],Feline coronavirus,Feline coronavirus,15.0,False,CHEMBL612744,[],ORGANISM,12663
3,[],Human coronavirus 229E,Human coronavirus 229E,13.0,False,CHEMBL613837,[],ORGANISM,11137
4,"[{'xref_id': 'P0C6U8', 'xref_name': None, 'xre...",SARS coronavirus,SARS coronavirus 3C-like proteinase,10.0,False,CHEMBL3927,"[{'accession': 'P0C6U8', 'component_descriptio...",SINGLE PROTEIN,227859
5,[],Middle East respiratory syndrome-related coron...,Middle East respiratory syndrome-related coron...,9.0,False,CHEMBL4296578,[],ORGANISM,1335626
6,"[{'xref_id': 'P0C6X7', 'xref_name': None, 'xre...",SARS coronavirus,Replicase polyprotein 1ab,4.0,False,CHEMBL5118,"[{'accession': 'P0C6X7', 'component_descriptio...",SINGLE PROTEIN,227859
7,[],Severe acute respiratory syndrome coronavirus 2,Replicase polyprotein 1ab,4.0,False,CHEMBL4523582,"[{'accession': 'P0DTD1', 'component_descriptio...",SINGLE PROTEIN,2697049


#### **Fetch Bio-Activity data for SARS Coronavirus 3C-like proteinase**

In [8]:
selected_target = targets.target_chembl_id [4]
selected_target

'CHEMBL3927'

In [16]:
activity = new_client.activity
res = activity.filter (target_chembl_id = selected_target).filter (standard_type = "IC50")

#### **A Higher Standard Value means we'll require more amount of the drug for same inhibition**

In [17]:
df = pd.DataFrame.from_dict (res)
df.head (3)

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,1480935,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,7.2
1,,1480936,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,9.4
2,,1481061,[],CHEMBL830868,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,13.5


In [18]:
df.standard_type.unique ()

array(['IC50'], dtype=object)

##### **Save the resulting Bio-Activity data to a CSV file**

In [19]:
df.to_csv ('bioactivity_data.csv', index = False)

### **Pre-Processing Data**

#### **Missing data**

In [20]:
df2 = df [df.standard_value.notna ()]
df2

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,1480935,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,7.2
1,,1480936,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,9.4
2,,1481061,[],CHEMBL830868,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,13.5
3,,1481065,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,13.11
4,,1481066,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128,,12041507,[],CHEMBL2150313,Inhibition of SARS-CoV PLpro expressed in Esch...,B,,,BAO_0000190,BAO_0000019,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,10.6
129,,12041508,[],CHEMBL2150313,Inhibition of SARS-CoV PLpro expressed in Esch...,B,,,BAO_0000190,BAO_0000019,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,10.1
130,,12041509,[],CHEMBL2150313,Inhibition of SARS-CoV PLpro expressed in Esch...,B,,,BAO_0000190,BAO_0000019,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,11.5
131,,12041510,[],CHEMBL2150313,Inhibition of SARS-CoV PLpro expressed in Esch...,B,,,BAO_0000190,BAO_0000019,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,10.7


#### **Label Compounds as active or inactive**
##### Compounds with IC50 less than 1000nM are considered active, greater than 10000nM are considered to be inactive, in between 1000nM to 10000nM are considered intermediate
##### 1. IC50 value of the drug indicates the toxicity of the drug to other disease causing organisms.
##### 2. IC50 is a quantitative measure that shows how much a particular inhibitory drug/substance/extract/fraction is needed to inhibit a biological component by 50%.
###### Above Definition taken from https://www.researchgate.net/post/What-is-the-significance-of-IC50-value-when-the-drug-is-exogenously-administered-to-an-animal-tissue

In [27]:
bioactivity_class = []
for i in df2.standard_value :
    if float (i) >= 10000 :
        bioactivity_class.append ("inactive")
    elif float (i) <= 1000 :
        bioactivity_class.append ("active")
    else :
        bioactivity_class.append ("intermediate")
        
bioactivity_class [0:5]

['intermediate', 'intermediate', 'inactive', 'inactive', 'intermediate']

#### **Append Chembl ID, Canonical Smiles and Standard Value to a list**
##### Canonical Smiles :-
##### 1. Simplified Molecular Input Line Entry Specification
##### 2. They can represent a Molecular Compound in a Single Line

In [34]:
selection = ['molecule_chembl_id', 'canonical_smiles', 'standard_value']
df3 = df2 [selection]
df3 [0:5]

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value
0,CHEMBL187579,Cc1noc(C)c1CN1C(=O)C(=O)c2cc(C#N)ccc21,7200.0
1,CHEMBL188487,O=C1C(=O)N(Cc2ccc(F)cc2Cl)c2ccc(I)cc21,9400.0
2,CHEMBL185698,O=C1C(=O)N(CC2COc3ccccc3O2)c2ccc(I)cc21,13500.0
3,CHEMBL426082,O=C1C(=O)N(Cc2cc3ccccc3s2)c2ccccc21,13110.0
4,CHEMBL187717,O=C1C(=O)N(Cc2cc3ccccc3s2)c2c1cccc2[N+](=O)[O-],2000.0


In [48]:
df3 = pd.concat ([df3, pd.Series (bioactivity_class)], axis = 1)

In [62]:
df3 = df3.rename (columns = {0 : 'bioactivity_class'})
df3.head (3)

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,bioactivity_class
0,CHEMBL187579,Cc1noc(C)c1CN1C(=O)C(=O)c2cc(C#N)ccc21,7200.0,intermediate
1,CHEMBL188487,O=C1C(=O)N(Cc2ccc(F)cc2Cl)c2ccc(I)cc21,9400.0,intermediate
2,CHEMBL185698,O=C1C(=O)N(CC2COc3ccccc3O2)c2ccc(I)cc21,13500.0,inactive


#### **Save Pre-Processed data to a CSV file**

In [57]:
df3.to_csv ('bioactivity_preprocessed_data.csv', index = False)

In [59]:
!dir

 Volume in drive D is Data
 Volume Serial Number is F61E-2FAB

 Directory of D:\CS Projects\Drug-Discovery

10/20/2021  11:50 AM    <DIR>          .
10/19/2021  10:45 PM    <DIR>          ..
10/20/2021  11:50 AM    <DIR>          .ipynb_checkpoints
10/20/2021  10:07 AM            70,468 bioactivity_data.csv
10/20/2021  11:50 AM            10,712 bioactivity_preprocessed_data.csv
10/20/2021  11:50 AM            45,998 Drug_Discovery.ipynb
               3 File(s)        127,178 bytes
               3 Dir(s)  195,598,090,240 bytes free
