In [6]:
! pip install chembl_webresource_client



In [7]:
import pandas as pd
from chembl_webresource_client.new_client import new_client

## **Search for Target protein**

### **Target search for malarial drug repurposing**

In [9]:
# Target search for malarial drug repurposing
target = new_client.target
target_query = target.search('falcipain')
targets = pd.DataFrame.from_dict(target_query)
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,[],Plasmodium falciparum,Falcipain 2,20.0,False,CHEMBL1697672,"[{'accession': 'Q9NAW2', 'component_descriptio...",SINGLE PROTEIN,5833
1,[],Plasmodium falciparum,Cysteine protease falcipain-2,19.0,False,CHEMBL5801,"[{'accession': 'Q9NBD4', 'component_descriptio...",SINGLE PROTEIN,5833
2,[],Plasmodium falciparum 3D7,Falcipain-3,19.0,False,CHEMBL1250373,"[{'accession': 'Q8IIL0', 'component_descriptio...",SINGLE PROTEIN,36329
3,[],Plasmodium falciparum,Cysteine protease falcipain-3,18.0,False,CHEMBL4510,"[{'accession': 'Q9NBA7', 'component_descriptio...",SINGLE PROTEIN,5833
4,[],Plasmodium falciparum,Falcipain 2B,18.0,False,CHEMBL3488,"[{'accession': 'Q3HTL5', 'component_descriptio...",SINGLE PROTEIN,5833
5,[],Plasmodium falciparum,Falcipain 2,18.0,False,CHEMBL5800,"[{'accession': 'Q9N6S8', 'component_descriptio...",SINGLE PROTEIN,5833
6,[],Plasmodium falciparum,Cysteine protease falcipain-3,18.0,False,CHEMBL1697661,"[{'accession': 'Q9NAW4', 'component_descriptio...",SINGLE PROTEIN,5833
7,[],Plasmodium falciparum 3D7,Cysteine proteinase falcipain-1,17.0,False,CHEMBL1250371,"[{'accession': 'Q8I6V0', 'component_descriptio...",SINGLE PROTEIN,36329


### **Select and retrieve bioactivity data for *falcipain 2* (sixth entry)**

We will assign the fifth entry (which corresponds to the target protein, *falcipain 2* ) to the ***selected_target*** variable

In [10]:
selected_target = targets.target_chembl_id[5]
selected_target

'CHEMBL5800'

Here, we will retrieve only bioactivity data for *falcipain 2* (CHEMBL5800) that are reported as IC$_{50}$ values in nM (nanomolar) unit.

In [11]:
activity = new_client.activity
res = activity.filter(target_chembl_id=selected_target).filter(standard_type="IC50")

In [12]:
df = pd.DataFrame.from_dict(res)

In [15]:
df

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,Not Active,553640,[],CHEMBL682030,Inhibitory activity against Falcipain-2; no ef...,B,,,BAO_0000190,...,Plasmodium falciparum,Falcipain 2,5833,,,IC50,uM,UO_0000065,,
1,,Not Active,554837,[],CHEMBL682029,Inhibitory activity against Falcipain-2; No ef...,B,,,BAO_0000190,...,Plasmodium falciparum,Falcipain 2,5833,,,IC50,uM,UO_0000065,,
2,,Not Active,554840,[],CHEMBL682029,Inhibitory activity against Falcipain-2; No ef...,B,,,BAO_0000190,...,Plasmodium falciparum,Falcipain 2,5833,,,IC50,uM,UO_0000065,,
3,,Not Active,555983,[],CHEMBL682030,Inhibitory activity against Falcipain-2; no ef...,B,,,BAO_0000190,...,Plasmodium falciparum,Falcipain 2,5833,,,IC50,uM,UO_0000065,,
4,,Not Active,555986,[],CHEMBL682030,Inhibitory activity against Falcipain-2; no ef...,B,,,BAO_0000190,...,Plasmodium falciparum,Falcipain 2,5833,,,IC50,uM,UO_0000065,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
467,"{'action_type': 'INHIBITOR', 'description': 'N...",,26139210,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5622194,Inhibition of Plasmodium falciparum FP2 using ...,B,,,BAO_0000190,...,Plasmodium falciparum,Falcipain 2,5833,,,IC50,uM,UO_0000065,,7.1
468,"{'action_type': 'INHIBITOR', 'description': 'N...",,26139211,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5622194,Inhibition of Plasmodium falciparum FP2 using ...,B,,,BAO_0000190,...,Plasmodium falciparum,Falcipain 2,5833,,,IC50,uM,UO_0000065,,10.4
469,"{'action_type': 'INHIBITOR', 'description': 'N...",,26139212,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5622194,Inhibition of Plasmodium falciparum FP2 using ...,B,,,BAO_0000190,...,Plasmodium falciparum,Falcipain 2,5833,,,IC50,uM,UO_0000065,,5.4
470,"{'action_type': 'INHIBITOR', 'description': 'N...",,26139213,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5622194,Inhibition of Plasmodium falciparum FP2 using ...,B,,,BAO_0000190,...,Plasmodium falciparum,Falcipain 2,5833,,,IC50,uM,UO_0000065,,4.6


In [16]:
df.standard_type.unique()

array(['IC50'], dtype=object)

Finally we will save the resulting bioactivity data to a CSV file **bioactivity_data.csv**.

In [17]:
df.to_csv('bioactivity_data.csv', index=False)

In [19]:
df[["action_type", "assay_chembl_id", "standard_type", "standard_value"]]

Unnamed: 0,action_type,assay_chembl_id,standard_type,standard_value
0,,CHEMBL682030,IC50,
1,,CHEMBL682029,IC50,
2,,CHEMBL682029,IC50,
3,,CHEMBL682030,IC50,
4,,CHEMBL682030,IC50,
...,...,...,...,...
467,"{'action_type': 'INHIBITOR', 'description': 'N...",CHEMBL5622194,IC50,7100.0
468,"{'action_type': 'INHIBITOR', 'description': 'N...",CHEMBL5622194,IC50,10400.0
469,"{'action_type': 'INHIBITOR', 'description': 'N...",CHEMBL5622194,IC50,5400.0
470,"{'action_type': 'INHIBITOR', 'description': 'N...",CHEMBL5622194,IC50,4600.0


In [28]:
df = df.dropna(subset=["standard_value"])
df["standard_value"] = pd.to_numeric(df["standard_value"])

lowest3 = df.nsmallest(450, "standard_value")[[
    "action_type",
    "assay_chembl_id",
    "standard_type",
    "standard_value"
]]

lowest3


Unnamed: 0,action_type,assay_chembl_id,standard_type,standard_value
446,"{'action_type': 'INHIBITOR', 'description': 'N...",CHEMBL5326422,IC50,0.36
448,"{'action_type': 'INHIBITOR', 'description': 'N...",CHEMBL5326422,IC50,0.42
363,,CHEMBL4223220,IC50,0.60
339,,CHEMBL4223220,IC50,1.20
187,,CHEMBL945401,IC50,2.00
...,...,...,...,...
198,,CHEMBL1074166,IC50,100000.00
197,,CHEMBL1074166,IC50,100000.00
406,,CHEMBL5326394,IC50,100000.00
413,"{'action_type': 'INHIBITOR', 'description': 'N...",CHEMBL5326396,IC50,107320.00


In [29]:
df.to_csv('ascending_standard_values.csv', index=False)

## **Copying files to Google Drive**

Firstly, we need to mount the Google Drive into Colab so that we can have access to our Google adrive from within Colab.

In [30]:
from google.colab import drive
drive.mount('/content/gdrive/', force_remount=True)


Mounted at /content/gdrive/


Next, we create a **data** folder in our **Colab Notebooks** folder on Google Drive.

In [31]:
! mkdir "/content/gdrive/My Drive/Colab Notebooks/data2"

In [34]:
! cp bioactivity_data.csv "/content/gdrive/My Drive/Colab Notebooks/data2"

In [35]:
! cp ascending_standard_values.csv "/content/gdrive/My Drive/Colab Notebooks/data2"

In [36]:
! ls -l "/content/gdrive/My Drive/Colab Notebooks/data2"

total 519
-rw------- 1 root root 257070 Dec  2 08:17 ascending_standard_values.csv
-rw------- 1 root root 273141 Dec  2 08:17 bioactivity_data.csv


Let's see the CSV files that we have so far.

In [40]:
! ls

ascending_standard_values.csv  bioactivity_data.csv  gdrive


Taking a glimpse of the **bioactivity_data.csv** file that we've just created.

In [41]:
! head bioactivity_data.csv

action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,bao_label,canonical_smiles,data_validity_comment,data_validity_description,document_chembl_id,document_journal,document_year,ligand_efficiency,molecule_chembl_id,molecule_pref_name,parent_molecule_chembl_id,pchembl_value,potential_duplicate,qudt_units,record_id,relation,src_id,standard_flag,standard_relation,standard_text_value,standard_type,standard_units,standard_upper_value,standard_value,target_chembl_id,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
,Not Active,553640,[],CHEMBL682030,Inhibitory activity against Falcipain-2; no effect,B,,,BAO_0000190,BAO_0000357,single protein format,O=C1C(=O)N(C(=O)OCc2ccccc2)c2ccccc21,,,CHEMBL1145878,Bioorg Med Chem Lett,2003,,CHEMBL119515,,CHEMBL119515,,0,http://www.openphacts.org/units/Nanomolar,223841,,1,0,,,IC50,nM,

## **Handling missing data**
If any compounds has missing value for the **standard_value** column then drop it

In [42]:
df2 = df[df.standard_value.notna()]
df2

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
5,,,557287,[],CHEMBL682028,Inhibitory activity against Falcipain-2,B,,,BAO_0000190,...,Plasmodium falciparum,Falcipain 2,5833,,,IC50,uM,UO_0000065,,43.9
11,,,559694,[],CHEMBL682028,Inhibitory activity against Falcipain-2,B,,,BAO_0000190,...,Plasmodium falciparum,Falcipain 2,5833,,,IC50,uM,UO_0000065,,4.4
14,,,565365,[],CHEMBL682028,Inhibitory activity against Falcipain-2,B,,,BAO_0000190,...,Plasmodium falciparum,Falcipain 2,5833,,,IC50,uM,UO_0000065,,34.6
16,,,569132,[],CHEMBL682028,Inhibitory activity against Falcipain-2,B,,,BAO_0000190,...,Plasmodium falciparum,Falcipain 2,5833,,,IC50,uM,UO_0000065,,31.2
17,,,570441,[],CHEMBL682028,Inhibitory activity against Falcipain-2,B,,,BAO_0000190,...,Plasmodium falciparum,Falcipain 2,5833,,,IC50,uM,UO_0000065,,13.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
467,"{'action_type': 'INHIBITOR', 'description': 'N...",,26139210,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5622194,Inhibition of Plasmodium falciparum FP2 using ...,B,,,BAO_0000190,...,Plasmodium falciparum,Falcipain 2,5833,,,IC50,uM,UO_0000065,,7.1
468,"{'action_type': 'INHIBITOR', 'description': 'N...",,26139211,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5622194,Inhibition of Plasmodium falciparum FP2 using ...,B,,,BAO_0000190,...,Plasmodium falciparum,Falcipain 2,5833,,,IC50,uM,UO_0000065,,10.4
469,"{'action_type': 'INHIBITOR', 'description': 'N...",,26139212,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5622194,Inhibition of Plasmodium falciparum FP2 using ...,B,,,BAO_0000190,...,Plasmodium falciparum,Falcipain 2,5833,,,IC50,uM,UO_0000065,,5.4
470,"{'action_type': 'INHIBITOR', 'description': 'N...",,26139213,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5622194,Inhibition of Plasmodium falciparum FP2 using ...,B,,,BAO_0000190,...,Plasmodium falciparum,Falcipain 2,5833,,,IC50,uM,UO_0000065,,4.6


Apparently, for this dataset there is no missing data. But we can use the above code cell for bioactivity data of other target protein.

## **Data pre-processing of the bioactivity data**

### **Labeling compounds as either being active, inactive or intermediate**
The bioactivity data is in the IC50 unit. Compounds having values of less than 1000 nM will be considered to be **active** while those greater than 10,000 nM will be considered to be **inactive**. As for those values in between 1,000 and 10,000 nM will be referred to as **intermediate**.

In [43]:
bioactivity_class = []
for i in df2.standard_value:
  if float(i) >= 10000:
    bioactivity_class.append("inactive")
  elif float(i) <= 1000:
    bioactivity_class.append("active")
  else:
    bioactivity_class.append("intermediate")

### **Iterate the *molecule_chembl_id* to a list**

In [44]:
mol_cid = []
for i in df2.molecule_chembl_id:
  mol_cid.append(i)

### **Iterate *canonical_smiles* to a list**

In [45]:
canonical_smiles = []
for i in df2.canonical_smiles:
  canonical_smiles.append(i)

### **Iterate *standard_value* to a list**

In [46]:
standard_value = []
for i in df2.standard_value:
  standard_value.append(i)

### **Combine the 4 lists into a dataframe**

In [47]:
data_tuples = list(zip(mol_cid, canonical_smiles, bioactivity_class, standard_value))
df3 = pd.DataFrame( data_tuples,  columns=['molecule_chembl_id', 'canonical_smiles', 'bioactivity_class', 'standard_value'])

In [48]:
df3

Unnamed: 0,molecule_chembl_id,canonical_smiles,bioactivity_class,standard_value
0,CHEMBL119345,Cc1ccc2[nH]c(O)c(/N=N/C(=N)S)c2c1,inactive,43900.0
1,CHEMBL120254,NC(=S)/N=N/c1c(O)[nH]c2ccc([N+](=O)[O-])cc12,intermediate,4400.0
2,CHEMBL120076,COc1cccc(CN2C(=O)C(=O)c3cc(Cl)ccc32)c1,inactive,34600.0
3,CHEMBL333067,O=C1C(=O)N(Cc2ccc(Cl)cc2)c2ccc(Cl)cc21,inactive,31200.0
4,CHEMBL118210,Cc1cc(C)c2[nH]c(O)c(/N=N/C(=N)S)c2c1,inactive,13200.0
...,...,...,...,...
427,CHEMBL5624786,COc1ccc(C(c2cccc(OC)c2O)N2CCN(c3ccnc4cc(Cl)ccc...,intermediate,7100.0
428,CHEMBL5630854,COc1cccc(C(c2ccc(F)c(F)c2)N2CCN(c3ccnc4cc(Cl)c...,inactive,10400.0
429,CHEMBL5630121,COc1cccc(C(c2ccc(C(F)(F)F)cc2)N2CCN(c3ccnc4cc(...,intermediate,5400.0
430,CHEMBL5624599,COc1cccc(C(c2ccc(Cl)c(Cl)c2)N2CCN(c3ccnc4cc(Cl...,intermediate,4600.0


### **Alternative method**

In [49]:
selection = ['molecule_chembl_id', 'canonical_smiles', 'standard_value']
df3 = df2[selection]
df3

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value
5,CHEMBL119345,Cc1ccc2[nH]c(O)c(/N=N/C(=N)S)c2c1,43900.0
11,CHEMBL120254,NC(=S)/N=N/c1c(O)[nH]c2ccc([N+](=O)[O-])cc12,4400.0
14,CHEMBL120076,COc1cccc(CN2C(=O)C(=O)c3cc(Cl)ccc32)c1,34600.0
16,CHEMBL333067,O=C1C(=O)N(Cc2ccc(Cl)cc2)c2ccc(Cl)cc21,31200.0
17,CHEMBL118210,Cc1cc(C)c2[nH]c(O)c(/N=N/C(=N)S)c2c1,13200.0
...,...,...,...
467,CHEMBL5624786,COc1ccc(C(c2cccc(OC)c2O)N2CCN(c3ccnc4cc(Cl)ccc...,7100.0
468,CHEMBL5630854,COc1cccc(C(c2ccc(F)c(F)c2)N2CCN(c3ccnc4cc(Cl)c...,10400.0
469,CHEMBL5630121,COc1cccc(C(c2ccc(C(F)(F)F)cc2)N2CCN(c3ccnc4cc(...,5400.0
470,CHEMBL5624599,COc1cccc(C(c2ccc(Cl)c(Cl)c2)N2CCN(c3ccnc4cc(Cl...,4600.0


In [50]:
pd.concat([df3,pd.Series(bioactivity_class)], axis=1)

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,0
5,CHEMBL119345,Cc1ccc2[nH]c(O)c(/N=N/C(=N)S)c2c1,43900.0,inactive
11,CHEMBL120254,NC(=S)/N=N/c1c(O)[nH]c2ccc([N+](=O)[O-])cc12,4400.0,intermediate
14,CHEMBL120076,COc1cccc(CN2C(=O)C(=O)c3cc(Cl)ccc32)c1,34600.0,inactive
16,CHEMBL333067,O=C1C(=O)N(Cc2ccc(Cl)cc2)c2ccc(Cl)cc21,31200.0,inactive
17,CHEMBL118210,Cc1cc(C)c2[nH]c(O)c(/N=N/C(=N)S)c2c1,13200.0,inactive
...,...,...,...,...
265,,,,active
269,,,,intermediate
275,,,,intermediate
368,,,,inactive


Saves dataframe to CSV file

In [51]:
df3.to_csv('bioactivity_preprocessed_data.csv', index=False)

In [53]:
! ls -l

total 560
-rw-r--r-- 1 root root 257070 Dec  2 08:11 ascending_standard_values.csv
-rw-r--r-- 1 root root 273141 Dec  2 07:51 bioactivity_data.csv
-rw-r--r-- 1 root root  36325 Dec  2 08:23 bioactivity_preprocessed_data.csv
drwx------ 5 root root   4096 Dec  2 08:12 gdrive


Let's copy to the Google Drive

In [54]:
! cp bioactivity_preprocessed_data.csv "/content/gdrive/My Drive/Colab Notebooks/data2"

cp: cannot create regular file '/content/gdrive/My Drive/Colab Notebooks/data2': No such file or directory


In [55]:
! ls "/content/gdrive/My Drive/Colab Notebooks/data"

ls: cannot access '/content/gdrive/My Drive/Colab Notebooks/data': No such file or directory


---