# ChEMBL DATA ACQUISITION

## Import Libraries

In [1]:
# Import necessary libraries
from pathlib import Path

import pandas as pd
from chembl_webresource_client.new_client import new_client

## **Search for Target protein**

### **Target search for P2X4 Protein**

In [2]:
# Target search for P2X4 protein
# Query the ChEMBL target endpoint with the search term 'p2x4' and tabulate the
# matches so that one of them can be picked in the next step.
target = new_client.target
target_query = target.search('p2x4')
targets = pd.DataFrame.from_dict(target_query)
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,[],Mus musculus,P2X purinoceptor 4,20.0,False,CHEMBL2176849,"[{'accession': 'Q9JJX6', 'component_descriptio...",SINGLE PROTEIN,10090
1,"[{'xref_id': 'Q99571', 'xref_name': None, 'xre...",Homo sapiens,P2X purinoceptor 4,16.0,False,CHEMBL2104,"[{'accession': 'Q99571', 'component_descriptio...",SINGLE PROTEIN,9606
2,"[{'xref_id': 'P51577', 'xref_name': None, 'xre...",Rattus norvegicus,P2X purinoceptor 4,16.0,False,CHEMBL2818,"[{'accession': 'P51577', 'component_descriptio...",SINGLE PROTEIN,10116
3,[],Homo sapiens,P2X receptor,7.0,False,CHEMBL4524012,"[{'accession': 'P51575', 'component_descriptio...",PROTEIN FAMILY,9606
4,[],Rattus norvegicus,P2X receptor,7.0,False,CHEMBL4524013,"[{'accession': 'P51579', 'component_descriptio...",PROTEIN FAMILY,10116


### **Select and retrieve bioactivity data for Human *P2X purinoceptor 4 protein* (second entry)**

We will assign the second entry (which corresponds to the target protein, *P2X purinoceptor 4*) to the ***selected_target*** variable 

In [3]:
# Pick the human P2X4 single-protein target by its attributes instead of by row
# position: the row order follows the search score and may change when the
# search results change.
human_p2x4 = targets[
    (targets['organism'] == 'Homo sapiens')
    & (targets['pref_name'] == 'P2X purinoceptor 4')
    & (targets['target_type'] == 'SINGLE PROTEIN')
]
# Fail loudly if the search no longer yields exactly one such target.
assert len(human_p2x4) == 1, 'expected exactly one human P2X4 single-protein target'
selected_target = human_p2x4['target_chembl_id'].iloc[0]
selected_target

'CHEMBL2104'

Here, we will retrieve only bioactivity data for *P2X purinoceptor 4* (CHEMBL2104) that are reported as IC$_{50}$ values in nM (nanomolar) unit.

In [4]:
# Query the ChEMBL activity endpoint for IC50 records of the selected target,
# passing both conditions in a single filter call.
activity = new_client.activity
res = activity.filter(target_chembl_id=selected_target, standard_type="IC50")

In [5]:
# Materialise the query results as a DataFrame.
df = pd.DataFrame(res)

In [6]:
# Display the retrieved bioactivity records.
df

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,39959,[],CHEMBL750984,The compound was evaluated for antagonist acti...,F,,,BAO_0000190,...,Homo sapiens,P2X purinoceptor 4,9606,,,IC50,nM,UO_0000065,,100000.0
1,,,1075404,[],CHEMBL751124,Inhibition of inward ion current elicited by A...,F,,,BAO_0000190,...,Homo sapiens,P2X purinoceptor 4,9606,,,IC50,uM,UO_0000065,,27.5
2,,,1610564,[],CHEMBL872178,Inhibitory concentration against human P2X pur...,F,,,BAO_0000190,...,Homo sapiens,P2X purinoceptor 4,9606,,,Log IC50,,UO_0000065,,-6.0
3,,,2271933,[],CHEMBL1017979,Antagonist activity at P2X4 receptor up to 10 uM,F,,,BAO_0000190,...,Homo sapiens,P2X purinoceptor 4,9606,,,IC50,uM,UO_0000065,,10.0
4,,,2272037,[],CHEMBL960840,Inhibition of P2X4 receptor,B,,,BAO_0000190,...,Homo sapiens,P2X purinoceptor 4,9606,,,IC50,uM,UO_0000065,,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307,"{'action_type': 'ANTAGONIST', 'description': '...",,24821345,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5140232,Antagonist activity against human P2X4R stably...,B,,,BAO_0000190,...,Homo sapiens,P2X purinoceptor 4,9606,,,IC50,uM,UO_0000065,,0.904
308,"{'action_type': 'INHIBITOR', 'description': 'N...",,24953699,"[{'comments': None, 'relation': None, 'result_...",CHEMBL5210088,Affinity Phenotypic Cellular interaction (Ephy...,F,,,BAO_0000190,...,Homo sapiens,P2X purinoceptor 4,9606,,,IC50,nM,UO_0000065,,274.0
309,"{'action_type': 'INHIBITOR', 'description': 'N...",,24953937,"[{'comments': None, 'relation': None, 'result_...",CHEMBL5210283,Affinity On-target Cellular interaction (FLIPR...,B,,,BAO_0000190,...,Homo sapiens,P2X purinoceptor 4,9606,,,IC50,nM,UO_0000065,,211.0
310,,,24953938,"[{'comments': None, 'relation': None, 'result_...",CHEMBL5210284,Affinity On-target Cellular interaction (FLIPR...,B,,,BAO_0000190,...,Homo sapiens,P2X purinoceptor 4,9606,,,IC50,nM,UO_0000065,,25000.0


Before saving, we check which units appear in the **standard_units** column. Finally we will save the resulting bioactivity data to a CSV file **p2x4_chembl_rawdata.csv**.

In [7]:
# List the distinct values found in the standard_units column of the raw data.
df['standard_units'].unique()

array(['nM', None, 'µM'], dtype=object)

### saving files

In [8]:
# Write the unfiltered bioactivity table to disk. The output folder is created
# first so the cell does not fail when 'data/chembl' does not exist yet.
raw_csv_path = Path('data/chembl/p2x4_chembl_rawdata.csv')
raw_csv_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(raw_csv_path, index=False)

## Handling missing data
If any compound has a missing value in the **standard_value** or **canonical_smiles** column, drop it.

In [9]:
# Keep only rows that have both a standard_value and a canonical_smiles.
# Both columns are checked on df in a single step; indexing df2 with a mask
# built from df makes pandas warn that the boolean key has to be reindexed.
df2 = df.dropna(subset=['standard_value', 'canonical_smiles'])
df2

  df2 = df2[df.canonical_smiles.notna()]


Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,39959,[],CHEMBL750984,The compound was evaluated for antagonist acti...,F,,,BAO_0000190,...,Homo sapiens,P2X purinoceptor 4,9606,,,IC50,nM,UO_0000065,,100000.0
1,,,1075404,[],CHEMBL751124,Inhibition of inward ion current elicited by A...,F,,,BAO_0000190,...,Homo sapiens,P2X purinoceptor 4,9606,,,IC50,uM,UO_0000065,,27.5
2,,,1610564,[],CHEMBL872178,Inhibitory concentration against human P2X pur...,F,,,BAO_0000190,...,Homo sapiens,P2X purinoceptor 4,9606,,,Log IC50,,UO_0000065,,-6.0
3,,,2271933,[],CHEMBL1017979,Antagonist activity at P2X4 receptor up to 10 uM,F,,,BAO_0000190,...,Homo sapiens,P2X purinoceptor 4,9606,,,IC50,uM,UO_0000065,,10.0
4,,,2272037,[],CHEMBL960840,Inhibition of P2X4 receptor,B,,,BAO_0000190,...,Homo sapiens,P2X purinoceptor 4,9606,,,IC50,uM,UO_0000065,,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307,"{'action_type': 'ANTAGONIST', 'description': '...",,24821345,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5140232,Antagonist activity against human P2X4R stably...,B,,,BAO_0000190,...,Homo sapiens,P2X purinoceptor 4,9606,,,IC50,uM,UO_0000065,,0.904
308,"{'action_type': 'INHIBITOR', 'description': 'N...",,24953699,"[{'comments': None, 'relation': None, 'result_...",CHEMBL5210088,Affinity Phenotypic Cellular interaction (Ephy...,F,,,BAO_0000190,...,Homo sapiens,P2X purinoceptor 4,9606,,,IC50,nM,UO_0000065,,274.0
309,"{'action_type': 'INHIBITOR', 'description': 'N...",,24953937,"[{'comments': None, 'relation': None, 'result_...",CHEMBL5210283,Affinity On-target Cellular interaction (FLIPR...,B,,,BAO_0000190,...,Homo sapiens,P2X purinoceptor 4,9606,,,IC50,nM,UO_0000065,,211.0
310,,,24953938,"[{'comments': None, 'relation': None, 'result_...",CHEMBL5210284,Affinity On-target Cellular interaction (FLIPR...,B,,,BAO_0000190,...,Homo sapiens,P2X purinoceptor 4,9606,,,IC50,nM,UO_0000065,,25000.0


## Handling duplicates

In [10]:
# Number of distinct canonical_smiles values in df2 (missing values, if any,
# are counted as a value, as they are by unique()).
df2['canonical_smiles'].nunique(dropna=False)

258

In [11]:
# List the distinct standard_units values that remain after dropping rows with
# missing data.
df2['standard_units'].unique()

array(['nM', 'µM'], dtype=object)

If any compound appears more than once in the **canonical_smiles** column, keep only its first occurrence and drop the duplicates.

In [12]:
# Restrict to rows whose standard_units is nM before de-duplicating, so that the
# standard_value column kept downstream is on a single scale and does not depend
# on which duplicate happens to come first. Then keep the first row for each
# canonical_smiles.
df2_nr = df2[df2['standard_units'] == 'nM'].drop_duplicates(['canonical_smiles'])
df2_nr

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,39959,[],CHEMBL750984,The compound was evaluated for antagonist acti...,F,,,BAO_0000190,...,Homo sapiens,P2X purinoceptor 4,9606,,,IC50,nM,UO_0000065,,100000.0
1,,,1075404,[],CHEMBL751124,Inhibition of inward ion current elicited by A...,F,,,BAO_0000190,...,Homo sapiens,P2X purinoceptor 4,9606,,,IC50,uM,UO_0000065,,27.5
2,,,1610564,[],CHEMBL872178,Inhibitory concentration against human P2X pur...,F,,,BAO_0000190,...,Homo sapiens,P2X purinoceptor 4,9606,,,Log IC50,,UO_0000065,,-6.0
3,,,2271933,[],CHEMBL1017979,Antagonist activity at P2X4 receptor up to 10 uM,F,,,BAO_0000190,...,Homo sapiens,P2X purinoceptor 4,9606,,,IC50,uM,UO_0000065,,10.0
4,,,2272037,[],CHEMBL960840,Inhibition of P2X4 receptor,B,,,BAO_0000190,...,Homo sapiens,P2X purinoceptor 4,9606,,,IC50,uM,UO_0000065,,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300,"{'action_type': 'ANTAGONIST', 'description': '...",,24821338,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5140232,Antagonist activity against human P2X4R stably...,B,,,BAO_0000190,...,Homo sapiens,P2X purinoceptor 4,9606,,,IC50,uM,UO_0000065,,14.26
301,"{'action_type': 'ANTAGONIST', 'description': '...",,24821339,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5140232,Antagonist activity against human P2X4R stably...,B,,,BAO_0000190,...,Homo sapiens,P2X purinoceptor 4,9606,,,IC50,uM,UO_0000065,,14.26
302,"{'action_type': 'ANTAGONIST', 'description': '...",,24821340,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5140232,Antagonist activity against human P2X4R stably...,B,,,BAO_0000190,...,Homo sapiens,P2X purinoceptor 4,9606,,,IC50,uM,UO_0000065,,0.039
303,"{'action_type': 'ANTAGONIST', 'description': '...",,24821341,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5140232,Antagonist activity against human P2X4R stably...,B,,,BAO_0000190,...,Homo sapiens,P2X purinoceptor 4,9606,,,IC50,uM,UO_0000065,,0.935


In [13]:
# List the distinct standard_units values left after removing duplicates.
df2_nr['standard_units'].unique()

array(['nM'], dtype=object)

## Data pre-processing of the bioactivity data

### Combine the 3 columns (molecule_chembl_id, canonical_smiles, standard_value) into a DataFrame (bioactivity_class is not added in this step)

In [14]:
# Columns carried forward: compound identifier, structure and measured value.
selection = [
    'molecule_chembl_id',
    'canonical_smiles',
    'standard_value',
]
# Take all rows and only the selected columns.
df3 = df2_nr.loc[:, selection]
df3

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value
0,CHEMBL216504,O=C(Nc1cccc(C(=O)Nc2ccc(S(=O)(=O)[O-])c3cc(S(=...,100000.0
1,CHEMBL69234,Cc1nc(/N=N/c2ccc(S(=O)(=O)O)cc2S(=O)(=O)O)c(CO...,27500.0
2,CHEMBL413145,O=C(Nc1cccc(C(=O)Nc2cc(C(=O)Nc3ccc(S(=O)(=O)[O...,1000.0
3,CHEMBL494161,COc1cc(C(C)C)c(Oc2cnc(NC(CO)CO)nc2N)cc1I,10000.0
4,CHEMBL526307,COc1cc(C(C)C)c(Oc2cnc(N)nc2N)cc1I,10000.0
...,...,...,...
300,CHEMBL5177180,O=C(NC(=S)Nc1ccc(Br)cc1)c1ccc2c(c1)OCO2,14260.0
301,CHEMBL5191639,Cc1cc(Br)ccc1NC(=S)NC(=O)c1ccc2c(c1)OCO2,14260.0
302,CHEMBL5206892,CC(C)c1ccc(NC(=S)NC(=O)c2ccc3c(c2)OCO3)c(Br)c1,39.0
303,CHEMBL5205239,O=C(NC(=S)Nc1cccnc1Cl)c1ccc2c(c1)OCO2,935.0


Save dataframe to CSV file

### saving files

In [15]:
# Write the selected columns to disk. The output folder is created first so the
# cell does not fail when 'data/chembl' does not exist yet.
data_csv_path = Path('data/chembl/p2x4_chembl_data.csv')
data_csv_path.parent.mkdir(parents=True, exist_ok=True)
df3.to_csv(data_csv_path, index=False)