In [9]:
# ! pip install chembl_webresource_client

In [194]:
import pandas as pd
import matplotlib.pyplot as plt
from chembl_webresource_client.new_client import new_client

In [None]:
# ChEMBL resource endpoints: targets, molecules (compounds) and activities.
targets_api, compounds_api, bioactivities_api = (
    new_client.target,
    new_client.molecule,
    new_client.activity,
)

In [133]:
###############
# Find Enzyme #
###############

# UniProt accession of mTOR, see https://www.uniprot.org/uniprotkb/P42345/entry
uniprot_id = "P42345"

# Fields to request for every ChEMBL target that contains this accession.
target_fields = ("target_chembl_id", "organism", "pref_name", "target_type")

# Look the protein up in ChEMBL by its UniProt accession.
target_query = targets_api.get(target_components__accession=uniprot_id)
target_query = target_query.only(*target_fields)

# Tabulate the matches; the bare name on the last line renders the table.
targets = pd.DataFrame.from_dict(target_query)
targets

Unnamed: 0,organism,pref_name,target_chembl_id,target_type
0,Homo sapiens,Serine/threonine-protein kinase mTOR,CHEMBL2842,SINGLE PROTEIN
1,Homo sapiens,FKBP12A/mTOR,CHEMBL2221341,PROTEIN COMPLEX
2,Homo sapiens,mTORC1,CHEMBL4296661,PROTEIN COMPLEX
3,Homo sapiens,mTOR/FKBP12A/FKBP12B,CHEMBL4296662,PROTEIN COMPLEX
4,Homo sapiens,DEPTOR/mTOR,CHEMBL4523674,PROTEIN-PROTEIN INTERACTION
5,Homo sapiens,mTORC2,CHEMBL4523999,PROTEIN COMPLEX


In [136]:
# Select the first entry (row label 0) of the target table.

selected_target_id = targets.loc[0, "target_chembl_id"]
selected_target_id

'CHEMBL2842'

In [207]:
##########################
# Search for Bioactivity #
##########################

# Fetch the IC50 measurements of compounds tested against the selected target.
# Filters: exact measurements only (relation "=") and binding assays (assay_type "B").

bioactivity = bioactivities_api.filter(
    target_chembl_id=selected_target_id,
    type="IC50",
    relation="=",
    assay_type="B",
).only(
    "activity_id",
    "assay_chembl_id",
    "assay_description",
    "assay_type",
    "molecule_chembl_id",
    "canonical_smiles",
    "type",
    "relation",
    "standard_value",
    "standard_units",
    "target_chembl_id",
    "target_organism",
)

# One row per reported activity.
bioactivity_df = pd.DataFrame.from_dict(bioactivity)

In [208]:
# Preview the first rows of the raw query result.
bioactivity_df.head()

Unnamed: 0,activity_id,assay_chembl_id,assay_description,assay_type,canonical_smiles,molecule_chembl_id,relation,standard_units,standard_value,target_chembl_id,target_organism,type,units,value
0,1410291,CHEMBL830918,Inhibition of mTOR protein isolated from HeLa ...,B,CC1CN(c2cc(=O)c3ccc4ccccc4c3o2)CCO1,CHEMBL435507,=,nM,4800.0,CHEMBL2842,Homo sapiens,IC50,uM,4.8
1,1412283,CHEMBL830918,Inhibition of mTOR protein isolated from HeLa ...,B,O=c1cc(N2CCOCC2)oc2c(-c3ccccc3)cccc12,CHEMBL98350,=,nM,2500.0,CHEMBL2842,Homo sapiens,IC50,uM,2.5
2,1412288,CHEMBL830918,Inhibition of mTOR protein isolated from HeLa ...,B,O=c1cc(N2CCOCC2)oc2c1ccc1ccccc12,CHEMBL104468,=,nM,6400.0,CHEMBL2842,Homo sapiens,IC50,uM,6.4
3,1412303,CHEMBL830918,Inhibition of mTOR protein isolated from HeLa ...,B,O=c1cc(N2CCOCC2)nc2c3ccccc3ccn12,CHEMBL179242,=,nM,5300.0,CHEMBL2842,Homo sapiens,IC50,uM,5.3
4,1459738,CHEMBL830270,Inhibition of Mammalian target of Rapamycin mTOR,B,O=c1cc(N2CCOCC2)oc2c(-c3cccc4c3sc3ccccc34)cccc12,CHEMBL188678,=,nM,1700.0,CHEMBL2842,Homo sapiens,IC50,uM,1.7


In [209]:
######################
# Data Preprocessing #
######################

# Every step below is applied to `bioactivity_df` itself, in one chain, so the
# cell gives the same result however many times it is run.
#
# 1. Convert "standard_value" from object to float so it can be used in
#    calculations later on.
# 2. Drop rows missing a SMILES string or a standard value; both columns are
#    needed for prediction later.
# 3. Keep only measurements reported in nM so that all values share one unit.
# 4. Keep a single row per compound: first by ChEMBL molecule ID, then by
#    SMILES (the first occurrence wins in both cases).
# 5. Renumber the rows 0..n-1.

bioactivity_df = (
    bioactivity_df
    .astype({"standard_value": "float64"})
    .dropna(subset=["canonical_smiles", "standard_value"])
    .loc[lambda frame: frame["standard_units"] == "nM"]
    .drop_duplicates("molecule_chembl_id", keep="first")
    .drop_duplicates("canonical_smiles", keep="first")
    .reset_index(drop=True)
)

In [210]:
# Preview the first rows of the frame as it stands after the preprocessing cell.
bioactivity_df.head()

Unnamed: 0,activity_id,assay_chembl_id,assay_description,assay_type,canonical_smiles,molecule_chembl_id,relation,standard_units,standard_value,target_chembl_id,target_organism,type,units,value
0,1410291,CHEMBL830918,Inhibition of mTOR protein isolated from HeLa ...,B,CC1CN(c2cc(=O)c3ccc4ccccc4c3o2)CCO1,CHEMBL435507,=,nM,4800.0,CHEMBL2842,Homo sapiens,IC50,uM,4.8
1,1412283,CHEMBL830918,Inhibition of mTOR protein isolated from HeLa ...,B,O=c1cc(N2CCOCC2)oc2c(-c3ccccc3)cccc12,CHEMBL98350,=,nM,2500.0,CHEMBL2842,Homo sapiens,IC50,uM,2.5
2,1412288,CHEMBL830918,Inhibition of mTOR protein isolated from HeLa ...,B,O=c1cc(N2CCOCC2)oc2c1ccc1ccccc12,CHEMBL104468,=,nM,6400.0,CHEMBL2842,Homo sapiens,IC50,uM,6.4
3,1412303,CHEMBL830918,Inhibition of mTOR protein isolated from HeLa ...,B,O=c1cc(N2CCOCC2)nc2c3ccccc3ccn12,CHEMBL179242,=,nM,5300.0,CHEMBL2842,Homo sapiens,IC50,uM,5.3
4,1459738,CHEMBL830270,Inhibition of Mammalian target of Rapamycin mTOR,B,O=c1cc(N2CCOCC2)oc2c(-c3cccc4c3sc3ccccc34)cccc12,CHEMBL188678,=,nM,1700.0,CHEMBL2842,Homo sapiens,IC50,uM,1.7


In [211]:
# Narrow the frame to the columns we need: compound ID, structure and IC50.

selection = ["molecule_chembl_id", "canonical_smiles", "standard_value"]
bioactivity_df = bioactivity_df.loc[:, selection]

In [212]:
################################
# Calculate active or inactive #
################################

# Does the molecule actually inhibit mTOR? Label each compound by its IC50:
#   IC50 <= 1000   -> "active"
#   IC50 >= 10000  -> "inactive" (it doesn't really inhibit)
#   in between     -> "intermediate"

ACTIVE_MAX_IC50 = 1000
INACTIVE_MIN_IC50 = 10000


def classify_ic50(ic50):
    """Return "active", "inactive" or "intermediate" for a single IC50 value.

    The value is compared against ACTIVE_MAX_IC50 and INACTIVE_MIN_IC50 after
    conversion to float; anything that satisfies neither bound is labelled
    "intermediate".
    """
    ic50 = float(ic50)
    if ic50 >= INACTIVE_MIN_IC50:
        return "inactive"
    if ic50 <= ACTIVE_MAX_IC50:
        return "active"
    return "intermediate"


# The labels are computed from the frame's own column, so they carry the
# frame's index and line up row-for-row whatever that index looks like.
bioactivity_class = bioactivity_df["standard_value"].map(classify_ic50).rename("class")

# "class" is a Python keyword, hence the dict unpacking. assign() overwrites an
# existing "class" column, so re-running this cell does not duplicate it.
bioactivity_df = bioactivity_df.assign(**{"class": bioactivity_class})

In [213]:
# Display the labelled table (the rendered output elides the middle rows).
bioactivity_df

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,class
0,CHEMBL435507,CC1CN(c2cc(=O)c3ccc4ccccc4c3o2)CCO1,4800.00,intermediate
1,CHEMBL98350,O=c1cc(N2CCOCC2)oc2c(-c3ccccc3)cccc12,2500.00,intermediate
2,CHEMBL104468,O=c1cc(N2CCOCC2)oc2c1ccc1ccccc12,6400.00,intermediate
3,CHEMBL179242,O=c1cc(N2CCOCC2)nc2c3ccccc3ccn12,5300.00,intermediate
4,CHEMBL188678,O=c1cc(N2CCOCC2)oc2c(-c3cccc4c3sc3ccccc34)cccc12,1700.00,intermediate
...,...,...,...,...
3915,CHEMBL76,CCN(CC)CCCC(C)Nc1ccnc2cc(Cl)ccc12,270.00,active
3916,CHEMBL5219124,O=C(NOCCOCCOCCOCCC(=O)N1CCN(c2cnc3ccc(-c4cnc(C...,53.10,active
3917,CHEMBL5220248,O=S(=O)(Nc1cc(-c2ccc3ncc(N4CCNCC4)nc3c2)cnc1Cl...,5.97,active
3918,CHEMBL5220911,CC(=O)N1CCN(c2cnc3ccc(-c4cnc(Cl)c(NS(=O)(=O)c5...,3.13,active


In [231]:
# Write the labelled compounds to disk; index=False leaves the row index out of the file.
bioactivity_df.to_csv(path_or_buf="mTOR_bioactivity.csv", index=False)