# 1 - Extraction of Bioactivity Data from the ChEMBL Database and Pre-Processing

In [2]:
# Install chembl_webresource_client to access the ChEMBL database.

#! pip install chembl_webresource_client
#! pip install pandas

Collecting chembl_webresource_client
  Using cached chembl_webresource_client-0.10.8-py3-none-any.whl (55 kB)
Collecting easydict
  Using cached easydict-1.9-py3-none-any.whl
Collecting requests-cache~=0.7.0
  Using cached requests_cache-0.7.5-py3-none-any.whl (39 kB)
Collecting url-normalize<2.0,>=1.4
  Using cached url_normalize-1.4.3-py2.py3-none-any.whl (6.8 kB)
Collecting itsdangerous>=2.0.1
  Using cached itsdangerous-2.1.2-py3-none-any.whl (15 kB)
Collecting pyyaml>=5.4
  Using cached PyYAML-6.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (701 kB)
Installing collected packages: easydict, url-normalize, pyyaml, itsdangerous, requests-cache, chembl_webresource_client
Successfully installed chembl_webresource_client-0.10.8 easydict-1.9 itsdangerous-2.1.2 pyyaml-6.0 requests-cache-0.7.5 url-normalize-1.4.3


In [None]:
# Import the required packages.
import pandas as pd
from chembl_webresource_client.new_client import new_client

In [3]:
# Search the ChEMBL target endpoint for the term below and tabulate the hits
# as a DataFrame so they can be inspected and indexed.

SEARCH_TERM = "coronavirus"

target = new_client.target
target_query = target.search(SEARCH_TERM)
targets = pd.DataFrame.from_dict(data=target_query)
targets

NameError: name 'new_client' is not defined

In [6]:
# Select the SARS coronavirus 3C-like proteinase (single protein) target.
# The entry is looked up by its ChEMBL ID rather than by row position (it was
# row 4, the fifth entry, when this notebook was written): row positions in a
# live search result can shift, which would silently select a different target.

SARS_3CL_PROTEINASE_ID = "CHEMBL3927"

matching_targets = targets.loc[
    targets["target_chembl_id"] == SARS_3CL_PROTEINASE_ID, "target_chembl_id"
]
if matching_targets.empty:
    # Fail loudly instead of continuing with the wrong (or no) target.
    raise LookupError(
        f"{SARS_3CL_PROTEINASE_ID} was not found in the target search results"
    )

selected_target = matching_targets.iloc[0]
selected_target

'CHEMBL3927'

In [37]:
# Retrieve the bioactivity records of the selected target (CHEMBL3927),
# keeping only measurements whose standard type is IC50.
# NOTE(review): no filter on units is applied here; standard_value is
# presumably reported in nM (nanomolar) - confirm against standard_units.

activity = new_client.activity
res = activity.filter(target_chembl_id=selected_target, standard_type="IC50")

In [38]:
# Turn the query result into a DataFrame (one row per activity record,
# one column per field).

df = pd.DataFrame.from_dict(data=res, orient="columns")

In [44]:
# Preview the first three activity records.
df.head(n=3)

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,1480935,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,7.2
1,,1480936,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,9.4
2,,1481061,[],CHEMBL830868,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,13.5


In [45]:
# Save the raw bioactivity table to a .csv file, leaving out the row index.
# to_csv returns None when it is given a file path, so its result is not
# bound to a variable.

df.to_csv("bioactivity_data.csv", index=False)

In [48]:
# Handle missing data: drop the records that have no standardised potency.
# The filter is on "standard_value" because that is the column the later
# cells convert with float() and export; a record with a missing
# standard_value would otherwise fail or be mislabelled when classified.

df2 = df.dropna(subset=["standard_value"])

In [69]:
# Show the compound identifiers that remain after removing missing data.
df2["molecule_chembl_id"]

0       CHEMBL187579
1       CHEMBL188487
2       CHEMBL185698
3       CHEMBL426082
4       CHEMBL187717
           ...      
128    CHEMBL2146517
129     CHEMBL187460
130     CHEMBL363535
131     CHEMBL227075
132      CHEMBL45830
Name: molecule_chembl_id, Length: 133, dtype: object

# Pre-processing of the bioactivity Data

In [54]:
# 1. Label every compound by potency, using its standard_value
#    (apparently in nM: the record shown as 7.2 uM is stored as 7200.0):
#      >= 10000          -> "inactive"
#      <= 1000           -> "active"
#      anything between  -> "intermediate"
#    The labels are kept in the same order as the rows of df2.

bioactivity_class = [
    "inactive" if potency >= 10000
    else "active" if potency <= 1000
    else "intermediate"
    for potency in map(float, df2.standard_value)
]

In [83]:
# Keep only the compound ID, structure (SMILES) and potency columns from df2,
# and attach the activity labels as a new "bioactivity" column.
# DataFrame.assign returns a new frame instead of modifying its input, so the
# result has to be bound to df3; otherwise the labels are displayed once and
# are missing from df3 when it is saved.

selection = ["molecule_chembl_id", "canonical_smiles", "standard_value"]
df3 = df2[selection].assign(bioactivity=bioactivity_class)
df3

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,bioactivity
0,CHEMBL187579,Cc1noc(C)c1CN1C(=O)C(=O)c2cc(C#N)ccc21,7200.0,intermediate
1,CHEMBL188487,O=C1C(=O)N(Cc2ccc(F)cc2Cl)c2ccc(I)cc21,9400.0,intermediate
2,CHEMBL185698,O=C1C(=O)N(CC2COc3ccccc3O2)c2ccc(I)cc21,13500.0,inactive
3,CHEMBL426082,O=C1C(=O)N(Cc2cc3ccccc3s2)c2ccccc21,13110.0,inactive
4,CHEMBL187717,O=C1C(=O)N(Cc2cc3ccccc3s2)c2c1cccc2[N+](=O)[O-],2000.0,intermediate
...,...,...,...,...
128,CHEMBL2146517,COC(=O)[C@@]1(C)CCCc2c1ccc1c2C(=O)C(=O)c2c(C)c...,10600.0,inactive
129,CHEMBL187460,C[C@H]1COC2=C1C(=O)C(=O)c1c2ccc2c1CCCC2(C)C,10100.0,inactive
130,CHEMBL363535,Cc1coc2c1C(=O)C(=O)c1c-2ccc2c(C)cccc12,11500.0,inactive
131,CHEMBL227075,Cc1cccc2c3c(ccc12)C1=C(C(=O)C3=O)[C@@H](C)CO1,10700.0,inactive


In [85]:
# Write the pre-processed table (df3) to disk; the row index is left out.
df3.to_csv(path_or_buf="bioactivity_preprocessed_data.csv", index=False)