### **Install ChEMBL client for getting the dataset**

#### **https://www.ebi.ac.uk/chembl/**

In [None]:
!pip install chembl_webresource_client

### **Import Libraries**

In [16]:
import pandas as pd
from chembl_webresource_client.new_client import new_client

### **Find Coronavirus Dataset**

#### **Search Target**

In [17]:
target = new_client.target
target_query = target.search ('acetylcholinesterase')
targets = pd.DataFrame.from_dict (target_query)
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,"[{'xref_id': 'P22303', 'xref_name': None, 'xre...",Homo sapiens,Acetylcholinesterase,27.0,False,CHEMBL220,"[{'accession': 'P22303', 'component_descriptio...",SINGLE PROTEIN,9606
1,[],Homo sapiens,Cholinesterases; ACHE & BCHE,27.0,False,CHEMBL2095233,"[{'accession': 'P06276', 'component_descriptio...",SELECTIVITY GROUP,9606
2,[],Drosophila melanogaster,Acetylcholinesterase,17.0,False,CHEMBL2242744,"[{'accession': 'P07140', 'component_descriptio...",SINGLE PROTEIN,7227
3,"[{'xref_id': 'P04058', 'xref_name': None, 'xre...",Torpedo californica,Acetylcholinesterase,15.0,False,CHEMBL4780,"[{'accession': 'P04058', 'component_descriptio...",SINGLE PROTEIN,7787
4,"[{'xref_id': 'P21836', 'xref_name': None, 'xre...",Mus musculus,Acetylcholinesterase,15.0,False,CHEMBL3198,"[{'accession': 'P21836', 'component_descriptio...",SINGLE PROTEIN,10090
5,"[{'xref_id': 'P37136', 'xref_name': None, 'xre...",Rattus norvegicus,Acetylcholinesterase,15.0,False,CHEMBL3199,"[{'accession': 'P37136', 'component_descriptio...",SINGLE PROTEIN,10116
6,"[{'xref_id': 'O42275', 'xref_name': None, 'xre...",Electrophorus electricus,Acetylcholinesterase,15.0,False,CHEMBL4078,"[{'accession': 'O42275', 'component_descriptio...",SINGLE PROTEIN,8005
7,"[{'xref_id': 'P23795', 'xref_name': None, 'xre...",Bos taurus,Acetylcholinesterase,15.0,False,CHEMBL4768,"[{'accession': 'P23795', 'component_descriptio...",SINGLE PROTEIN,9913
8,[],Anopheles gambiae,Acetylcholinesterase,15.0,False,CHEMBL2046266,"[{'accession': 'Q869C3', 'component_descriptio...",SINGLE PROTEIN,7165
9,[],Bemisia tabaci,AChE2,15.0,False,CHEMBL2366409,"[{'accession': 'B3SST5', 'component_descriptio...",SINGLE PROTEIN,7038


#### **Fetch Bio-Activity data for the target**

In [18]:
selected_target = targets.target_chembl_id [0]
selected_target

'CHEMBL220'

In [19]:
activity = new_client.activity
res = activity.filter (target_chembl_id = selected_target).filter (standard_type = "IC50")

#### **A Higher Standard Value means we'll require more amount of the drug for same inhibition**

In [21]:
df = pd.DataFrame.from_dict (res)
df.head (3)

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,33969,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,0.75
1,,37563,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,0.1
2,,37565,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,50.0


In [22]:
df.standard_type.unique ()

array(['IC50'], dtype=object)

##### **Save the resulting Bio-Activity data to a CSV file**

In [23]:
import os
df.to_csv (os.path.join ('Datasets', 'Part-1_Bioactivity_Data.csv'), index = False)

### **Pre-Processing Data**

#### **Ignore values with Missing Standard Value data**

In [33]:
df2 = df [df.standard_value.notna ()]
df2 = df2 [df.canonical_smiles.notna ()]
df2

  


Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,33969,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,0.75
1,,37563,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,0.1
2,,37565,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,50.0
3,,38902,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,0.3
4,,41170,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,0.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7544,,20703835,[],CHEMBL4627889,Inhibition of AChE (unknown origin) using acet...,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,0.74
7545,,20703856,[],CHEMBL4627888,Inhibition of AChE (unknown origin),B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,0.51
7546,,20708928,[],CHEMBL4628756,Inhibition of human AchE,A,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,125.0
7547,,20708929,[],CHEMBL4628756,Inhibition of human AchE,A,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,100.0


#### **Label Compounds as active or inactive**
##### Compounds with IC50 less than 1000nM are considered active, greater than 10000nM are considered to be inactive, in between 1000nM to 10000nM are considered intermediate
##### 1. IC50 value of the drug indicates the toxicity of the drug to other disease causing organisms.
##### 2. IC50 is a quantitative measure that shows how much a particular inhibitory drug/substance/extract/fraction is needed to inhibit a biological component by 50%.
###### Above Definition taken from https://www.researchgate.net/post/What-is-the-significance-of-IC50-value-when-the-drug-is-exogenously-administered-to-an-animal-tissue

In [109]:
bioactivity_class = []
for i in df2.standard_value :
    if float (i) >= 10000 :
        bioactivity_class.append ("inactive")
    elif float (i) <= 1000 :
        bioactivity_class.append ("active")
    else :
        bioactivity_class.append ("intermediate")
        
print (len (bioactivity_class))

6340


#### **Append Chembl ID, Canonical Smiles and Standard Value to a list**
##### Canonical Smiles :-
##### 1. Simplified Molecular Input Line Entry Specification
##### 2. They can represent a Molecular Compound in a Single Line

In [110]:
selection = ['molecule_chembl_id', 'canonical_smiles', 'standard_value']
df3 = df2 [selection]
print (len (df3))
df3

6340


Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value
0,CHEMBL133897,CCOc1nn(-c2cccc(OCc3ccccc3)c2)c(=O)o1,750.0
1,CHEMBL336398,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC1CC1,100.0
2,CHEMBL131588,CN(C(=O)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F)c1ccccc1,50000.0
3,CHEMBL130628,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F,300.0
4,CHEMBL130478,CSc1nc(-c2ccc(OC(F)(F)F)cc2)nn1C(=O)N(C)C,800.0
...,...,...,...
7544,CHEMBL4645659,COc1ccc(CCC(=O)Nc2nc(-c3cc4ccccc4oc3=O)cs2)cc1OC,740.0
7545,CHEMBL513063,COc1ccc(-c2csc(NC(=O)CCN3CCCC3)n2)cc1,510.0
7546,CHEMBL4640608,COc1cc(C2C3=C(CCCC3=O)NC3=C2C(=O)CCC3)ccc1OCc1...,125000.0
7547,CHEMBL4173961,O=C1CCCC2=C1C(c1ccc(OCc3cccc(F)c3)c(Br)c1)C1=C...,100000.0


In [111]:
import numpy as np

#print (df3.values.shape)
#print (np.array (bioactivity_class).shape)
df4 = df3.values
df4
bioactivity_class = np.matrix (bioactivity_class).T
#bioactivity_class
columns = list (df3.columns)
columns.append ('bioactivity_class')
print (columns)
print (bioactivity_class.shape)
print (df4.shape)
#df3 = pd.concat ([df3, pd.Series (np.array (bioactivity_class))], axis = 1)
#print (len (df3))
#df3

['molecule_chembl_id', 'canonical_smiles', 'standard_value', 'bioactivity_class']
(6340, 1)
(6340, 3)


In [112]:
df4

array([['CHEMBL133897', 'CCOc1nn(-c2cccc(OCc3ccccc3)c2)c(=O)o1', '750.0'],
       ['CHEMBL336398', 'O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC1CC1',
        '100.0'],
       ['CHEMBL131588',
        'CN(C(=O)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F)c1ccccc1', '50000.0'],
       ...,
       ['CHEMBL4640608',
        'COc1cc(C2C3=C(CCCC3=O)NC3=C2C(=O)CCC3)ccc1OCc1ccc(F)cc1',
        '125000.0'],
       ['CHEMBL4173961',
        'O=C1CCCC2=C1C(c1ccc(OCc3cccc(F)c3)c(Br)c1)C1=C(CCCC1=O)N2',
        '100000.0'],
       ['CHEMBL95', 'Nc1c2c(nc3ccccc13)CCCC2', '100.0']], dtype=object)

In [113]:
#df3 = df3.rename (columns = {0 : 'bioactivity_class'})

df_final = np.concatenate ((df4, bioactivity_class), axis = 1)
#df_final = pd.DataFrame (df_final, columns)

df_final
#df3.head (3)
#print (len (df3))

matrix([['CHEMBL133897', 'CCOc1nn(-c2cccc(OCc3ccccc3)c2)c(=O)o1',
         '750.0', 'active'],
        ['CHEMBL336398', 'O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC1CC1',
         '100.0', 'active'],
        ['CHEMBL131588',
         'CN(C(=O)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F)c1ccccc1', '50000.0',
         'inactive'],
        ...,
        ['CHEMBL4640608',
         'COc1cc(C2C3=C(CCCC3=O)NC3=C2C(=O)CCC3)ccc1OCc1ccc(F)cc1',
         '125000.0', 'inactive'],
        ['CHEMBL4173961',
         'O=C1CCCC2=C1C(c1ccc(OCc3cccc(F)c3)c(Br)c1)C1=C(CCCC1=O)N2',
         '100000.0', 'inactive'],
        ['CHEMBL95', 'Nc1c2c(nc3ccccc13)CCCC2', '100.0', 'active']],
       dtype=object)

In [115]:
df_final = pd.DataFrame (df_final, columns = columns)

In [116]:
df_final

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,bioactivity_class
0,CHEMBL133897,CCOc1nn(-c2cccc(OCc3ccccc3)c2)c(=O)o1,750.0,active
1,CHEMBL336398,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC1CC1,100.0,active
2,CHEMBL131588,CN(C(=O)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F)c1ccccc1,50000.0,inactive
3,CHEMBL130628,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F,300.0,active
4,CHEMBL130478,CSc1nc(-c2ccc(OC(F)(F)F)cc2)nn1C(=O)N(C)C,800.0,active
...,...,...,...,...
6335,CHEMBL4645659,COc1ccc(CCC(=O)Nc2nc(-c3cc4ccccc4oc3=O)cs2)cc1OC,740.0,active
6336,CHEMBL513063,COc1ccc(-c2csc(NC(=O)CCN3CCCC3)n2)cc1,510.0,active
6337,CHEMBL4640608,COc1cc(C2C3=C(CCCC3=O)NC3=C2C(=O)CCC3)ccc1OCc1...,125000.0,inactive
6338,CHEMBL4173961,O=C1CCCC2=C1C(c1ccc(OCc3cccc(F)c3)c(Br)c1)C1=C...,100000.0,inactive


#### **Save Pre-Processed data to a CSV file**

In [117]:
df_final.to_csv (os.path.join ('Datasets', 'Part-1_Bioactivity_Preprocessed_Data.csv'), index = False)

In [118]:
!dir

Datasets		     Part-2_Exploratory_Drug_Analysis.ipynb  README.md
LICENSE			     Part-3_Descriptor_Calculation.ipynb
Part-1_Drug_Discovery.ipynb  Part-4_Model_Building.ipynb
