# Data Collection and Exploratory Data Analysis

[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Stef0916/chemoinformatics-bioinformatics/blob/main/acetylcholinesterase-QSAR/notebooks/1-Getting-bioactivity-data.ipynb)

## 1. Load Libraries

In [None]:
!pip install chembl_webresource_client

Collecting chembl_webresource_client
  Downloading chembl_webresource_client-0.10.8-py3-none-any.whl (55 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/55.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.2/55.2 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting requests-cache~=0.7.0 (from chembl_webresource_client)
  Downloading requests_cache-0.7.5-py3-none-any.whl (39 kB)
Collecting attrs<22.0,>=21.2 (from requests-cache~=0.7.0->chembl_webresource_client)
  Downloading attrs-21.4.0-py2.py3-none-any.whl (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Collecting url-normalize<2.0,>=1.4 (from requests-cache~=0.7.0->chembl_webresource_client)
  Downloading url_normalize-1.4.3-py2.py3-none-any.whl (6.8 kB)
Installing collected packages: url-normalize, attrs, requests-cache, chembl_webresource_client
  Attempting uninsta

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from chembl_webresource_client.new_client import new_client

## 2. Search for Target Protein

In [None]:
target = new_client.target
target_query = target.search('acetylcholinesterase')
targets = pd.DataFrame.from_dict(target_query)
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,"[{'xref_id': 'P22303', 'xref_name': None, 'xre...",Homo sapiens,Acetylcholinesterase,27.0,False,CHEMBL220,"[{'accession': 'P22303', 'component_descriptio...",SINGLE PROTEIN,9606
1,[],Homo sapiens,Cholinesterases; ACHE & BCHE,27.0,False,CHEMBL2095233,"[{'accession': 'P06276', 'component_descriptio...",SELECTIVITY GROUP,9606
2,[],Drosophila melanogaster,Acetylcholinesterase,18.0,False,CHEMBL2242744,"[{'accession': 'P07140', 'component_descriptio...",SINGLE PROTEIN,7227
3,[],Bemisia tabaci,AChE2,16.0,False,CHEMBL2366409,"[{'accession': 'B3SST5', 'component_descriptio...",SINGLE PROTEIN,7038
4,[],Leptinotarsa decemlineata,Acetylcholinesterase,16.0,False,CHEMBL2366490,"[{'accession': 'Q27677', 'component_descriptio...",SINGLE PROTEIN,7539
5,"[{'xref_id': 'P04058', 'xref_name': None, 'xre...",Torpedo californica,Acetylcholinesterase,15.0,False,CHEMBL4780,"[{'accession': 'P04058', 'component_descriptio...",SINGLE PROTEIN,7787
6,"[{'xref_id': 'P21836', 'xref_name': None, 'xre...",Mus musculus,Acetylcholinesterase,15.0,False,CHEMBL3198,"[{'accession': 'P21836', 'component_descriptio...",SINGLE PROTEIN,10090
7,"[{'xref_id': 'P37136', 'xref_name': None, 'xre...",Rattus norvegicus,Acetylcholinesterase,15.0,False,CHEMBL3199,"[{'accession': 'P37136', 'component_descriptio...",SINGLE PROTEIN,10116
8,"[{'xref_id': 'O42275', 'xref_name': None, 'xre...",Electrophorus electricus,Acetylcholinesterase,15.0,False,CHEMBL4078,"[{'accession': 'O42275', 'component_descriptio...",SINGLE PROTEIN,8005
9,"[{'xref_id': 'P23795', 'xref_name': None, 'xre...",Bos taurus,Acetylcholinesterase,15.0,False,CHEMBL4768,"[{'accession': 'P23795', 'component_descriptio...",SINGLE PROTEIN,9913


## 3. Filter “Homo Sapiens” targets

In [None]:
target_hs = targets.loc[targets['organism'] == 'Homo sapiens']
target_hs.sort_values(by='score', ascending=False)

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,"[{'xref_id': 'P22303', 'xref_name': None, 'xre...",Homo sapiens,Acetylcholinesterase,27.0,False,CHEMBL220,"[{'accession': 'P22303', 'component_descriptio...",SINGLE PROTEIN,9606
1,[],Homo sapiens,Cholinesterases; ACHE & BCHE,27.0,False,CHEMBL2095233,"[{'accession': 'P06276', 'component_descriptio...",SELECTIVITY GROUP,9606


## 4. Select target

In [None]:
selected_target = target_hs.target_chembl_id.iloc[0]
selected_target

'CHEMBL220'

## 5. Retrieve Bioactivity Data with IC50 as a filter

In [None]:
activity = new_client.activity
bioactivities = activity.filter(target_chembl_id=selected_target).filter(standard_type="IC50")
print(bioactivities)

[{'action_type': None, 'activity_comment': None, 'activity_id': 33969, 'activity_properties': [], 'assay_chembl_id': 'CHEMBL643384', 'assay_description': 'Inhibitory concentration against acetylcholinesterase', 'assay_type': 'B', 'assay_variant_accession': None, 'assay_variant_mutation': None, 'bao_endpoint': 'BAO_0000190', 'bao_format': 'BAO_0000357', 'bao_label': 'single protein format', 'canonical_smiles': 'CCOc1nn(-c2cccc(OCc3ccccc3)c2)c(=O)o1', 'data_validity_comment': None, 'data_validity_description': None, 'document_chembl_id': 'CHEMBL1148382', 'document_journal': 'J Med Chem', 'document_year': 2004, 'ligand_efficiency': {'bei': '19.61', 'le': '0.36', 'lle': '3.32', 'sei': '9.21'}, 'molecule_chembl_id': 'CHEMBL133897', 'molecule_pref_name': None, 'parent_molecule_chembl_id': 'CHEMBL133897', 'pchembl_value': '6.12', 'potential_duplicate': 0, 'qudt_units': 'http://www.openphacts.org/units/Nanomolar', 'record_id': 252547, 'relation': '=', 'src_id': 1, 'standard_flag': 1, 'standard

In [None]:
bioactivity_df = pd.DataFrame.from_dict(bioactivities)

In [None]:
bioactivity_df.shape

(8832, 46)

In [None]:
bioactivity_df.head()

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,33969,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,,,BAO_0000190,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,0.75
1,,,37563,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,,,BAO_0000190,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,0.1
2,,,37565,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,,,BAO_0000190,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,50.0
3,,,38902,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,,,BAO_0000190,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,0.3
4,,,41170,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,,,BAO_0000190,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,0.8


In [None]:
bioactivity_df.columns

Index(['action_type', 'activity_comment', 'activity_id', 'activity_properties',
       'assay_chembl_id', 'assay_description', 'assay_type',
       'assay_variant_accession', 'assay_variant_mutation', 'bao_endpoint',
       'bao_format', 'bao_label', 'canonical_smiles', 'data_validity_comment',
       'data_validity_description', 'document_chembl_id', 'document_journal',
       'document_year', 'ligand_efficiency', 'molecule_chembl_id',
       'molecule_pref_name', 'parent_molecule_chembl_id', 'pchembl_value',
       'potential_duplicate', 'qudt_units', 'record_id', 'relation', 'src_id',
       'standard_flag', 'standard_relation', 'standard_text_value',
       'standard_type', 'standard_units', 'standard_upper_value',
       'standard_value', 'target_chembl_id', 'target_organism',
       'target_pref_name', 'target_tax_id', 'text_value', 'toid', 'type',
       'units', 'uo_units', 'upper_value', 'value'],
      dtype='object')

In [None]:
data = bioactivity_df[['canonical_smiles', 'molecule_chembl_id',
                       'standard_type', 'standard_units', 'standard_value']]

In [None]:
data

Unnamed: 0,canonical_smiles,molecule_chembl_id,standard_type,standard_units,standard_value
0,CCOc1nn(-c2cccc(OCc3ccccc3)c2)c(=O)o1,CHEMBL133897,IC50,nM,750.0
1,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC1CC1,CHEMBL336398,IC50,nM,100.0
2,CN(C(=O)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F)c1ccccc1,CHEMBL131588,IC50,nM,50000.0
3,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F,CHEMBL130628,IC50,nM,300.0
4,CSc1nc(-c2ccc(OC(F)(F)F)cc2)nn1C(=O)N(C)C,CHEMBL130478,IC50,nM,800.0
...,...,...,...,...,...
8827,CC[C@@]1(c2cccc(OC(=O)Nc3ccccc3)c2)CCCCN(C)C1,CHEMBL5219046,IC50,nM,274.0
8828,O=c1[nH]c2ccc(OCc3ccc(F)cc3)cc2c(=O)o1,CHEMBL5219594,IC50,nM,76200.0
8829,CC(C)c1ccc(COc2ccc3[nH]c(=O)oc(=O)c3c2)cc1,CHEMBL5219958,IC50,nM,55000.0
8830,Nc1c2c(nc3ccccc13)CCCC2,CHEMBL95,IC50,nM,1180.0


## 6. Exploratory Data Analysis

### 6.1 Drop Nan SMILES

In [None]:
len(data.loc[data['canonical_smiles'].isnull()])

35

In [None]:
data.loc[data['canonical_smiles'].isnull()]

Unnamed: 0,canonical_smiles,molecule_chembl_id,standard_type,standard_units,standard_value
1911,,CHEMBL2448138,IC50,nM,4.04
2028,,CHEMBL2021458,IC50,,
2032,,CHEMBL2021459,IC50,nM,200000.0
3133,,CHEMBL1366,IC50,,
3152,,CHEMBL1476898,IC50,,
3164,,CHEMBL1458880,IC50,,
3320,,CHEMBL306043,IC50,,
3336,,CHEMBL1201469,IC50,,
3351,,CHEMBL1200431,IC50,,
3381,,CHEMBL1909056,IC50,,


In [None]:
data = data.loc[~data['canonical_smiles'].isnull()]
data.shape

(8797, 5)

In [None]:
data.loc[data['canonical_smiles'].isnull()]

Unnamed: 0,canonical_smiles,molecule_chembl_id,standard_type,standard_units,standard_value


### 6.2 Drop Nan standard_value

In [None]:
len(data.loc[data['standard_value'].isnull()])

1250

In [None]:
data = data.loc[~data['standard_value'].isnull()]
data.shape

(7547, 5)

## 6.3 Manage Standard Values

In [None]:
data['standard_units'].unique()

array(['nM', 'ug.mL-1', "10'3pM", "10'6pM", "10'5pM", '10^-4microM', 'µM'],
      dtype=object)

In [None]:
non_nM = data.loc[data['standard_units'] != 'nM']

In [None]:
len(non_nM)

48

In [None]:
non_nM['standard_units'].value_counts()

ug.mL-1        38
10'5pM          3
10'6pM          2
10^-4microM     2
µM              2
10'3pM          1
Name: standard_units, dtype: int64

In [None]:
data = data.loc[data['standard_units'] == 'nM']
data.shape

(7499, 5)

## 6.4 Check for duplicates

In [None]:
duplicates = data['molecule_chembl_id'].value_counts()
duplicates

CHEMBL95         171
CHEMBL502        135
CHEMBL659         73
CHEMBL94          51
CHEMBL636         47
                ... 
CHEMBL1773490      1
CHEMBL1773489      1
CHEMBL1773488      1
CHEMBL1773487      1
CHEMBL5220884      1
Name: molecule_chembl_id, Length: 6125, dtype: int64

Now, I'll sort the data by standard value. The lowest the value, the strongest the activity. This will allow me to use the duplicated() function to drop the less active compounds. The duplicated function only shows from the second occurency of the duplicated values, and leaves outside the first occurrence. With the sorted data, I ensure that only the most potent compound will stay in the dataframe.

In [None]:
data_sorted = data.sort_values(by='standard_value', ascending=True)
data_sorted.head()

Unnamed: 0,canonical_smiles,molecule_chembl_id,standard_type,standard_units,standard_value
7994,COc1cc2c(cc1OC)C(=O)/C(=C/c1ccc(OCCCCN[N+]3(C)...,CHEMBL4780352,IC50,nM,0.0
7166,O=C(CCCCCCNc1c2c(nc3ccccc13)CCCC2)NCCc1c[nH]c2...,CHEMBL199670,IC50,nM,5e-06
7183,O=C(CCCCCNc1c2c(nc3ccccc13)CCCC2)NCCc1c[nH]c2c...,CHEMBL370807,IC50,nM,5e-06
7182,COc1ccc2[nH]cc(CCNC(=O)CCCCCCNc3c4c(nc5cc(Cl)c...,CHEMBL372202,IC50,nM,5e-06
7181,COc1ccc2[nH]cc(CCNC(=O)CCCCCNc3c4c(nc5cc(Cl)cc...,CHEMBL4468781,IC50,nM,5e-06


In [None]:
data_sorted.loc[data_sorted['molecule_chembl_id'].duplicated()]

Unnamed: 0,canonical_smiles,molecule_chembl_id,standard_type,standard_units,standard_value
1818,O=C(CCCCCCNc1c2c(nc3cc(Cl)cc(Cl)c13)CCCC2)NCCc...,CHEMBL199585,IC50,nM,0.008
1436,O=C(CCCCCCNc1c2c(nc3cc(Cl)cc(Cl)c13)CCCC2)NCCc...,CHEMBL199585,IC50,nM,0.008
1434,COc1ccc2[nH]cc(CCNC(=O)CCCCCCNc3c4c(nc5cc(Cl)c...,CHEMBL372202,IC50,nM,0.04
1830,O=C(CCc1c[nH]c2ccccc12)NCCCCCCCNc1c2c(nc3cc(Cl...,CHEMBL225198,IC50,nM,0.06
1622,O=C(CCc1c[nH]c2ccccc12)NCCCCCCNc1c2c(nc3cc(Cl)...,CHEMBL225567,IC50,nM,0.07
...,...,...,...,...,...
8748,Nc1c2c(nc3ccccc13)CCCC2,CHEMBL95,IC50,nM,99.0
648,c1ccc(CN2CCC(CCc3nsc4ccccc34)CC2)cc1,CHEMBL328102,IC50,nM,99.0
8218,COc1cccc(N2CCN(C(=O)/C=C/c3ccc(O)c(OC)c3)CC2)c1,CHEMBL4785400,IC50,nM,9910.0
8199,CCN(C)C(=O)Oc1cccc([C@H](C)N(C)C)c1,CHEMBL636,IC50,nM,9940.0


In [None]:
data.loc[data['molecule_chembl_id'] == 'CHEMBL199585']

Unnamed: 0,canonical_smiles,molecule_chembl_id,standard_type,standard_units,standard_value
1436,O=C(CCCCCCNc1c2c(nc3cc(Cl)cc(Cl)c13)CCCC2)NCCc...,CHEMBL199585,IC50,nM,0.008
1818,O=C(CCCCCCNc1c2c(nc3cc(Cl)cc(Cl)c13)CCCC2)NCCc...,CHEMBL199585,IC50,nM,0.008
7172,O=C(CCCCCCNc1c2c(nc3cc(Cl)cc(Cl)c13)CCCC2)NCCc...,CHEMBL199585,IC50,nM,5e-06


In [None]:
data_sorted = data_sorted.loc[~data_sorted['molecule_chembl_id'].duplicated()]

In [None]:
data_sorted

Unnamed: 0,canonical_smiles,molecule_chembl_id,standard_type,standard_units,standard_value
7994,COc1cc2c(cc1OC)C(=O)/C(=C/c1ccc(OCCCCN[N+]3(C)...,CHEMBL4780352,IC50,nM,0.0
7166,O=C(CCCCCCNc1c2c(nc3ccccc13)CCCC2)NCCc1c[nH]c2...,CHEMBL199670,IC50,nM,0.000005
7183,O=C(CCCCCNc1c2c(nc3ccccc13)CCCC2)NCCc1c[nH]c2c...,CHEMBL370807,IC50,nM,0.000005
7182,COc1ccc2[nH]cc(CCNC(=O)CCCCCCNc3c4c(nc5cc(Cl)c...,CHEMBL372202,IC50,nM,0.000005
7181,COc1ccc2[nH]cc(CCNC(=O)CCCCCNc3c4c(nc5cc(Cl)cc...,CHEMBL4468781,IC50,nM,0.000005
...,...,...,...,...,...
6318,C[n+]1ccccc1C=NO.[I-],CHEMBL14577,IC50,nM,995700.0
5151,CN(CCCCCCCN1C(=O)c2ccccc2C1=O)Cc1ccccc1.Cl,CHEMBL3402709,IC50,nM,997.0
6565,O=C(NCCCCCCNCc1ccc2ccccc2c1)c1cc(Cl)nc(Cl)c1,CHEMBL4163781,IC50,nM,99700.0
5315,COc1ccc(C(=O)O[C@H]2CC[C@@]3(C)[C@@H](CC[C@@]4...,CHEMBL3586198,IC50,nM,9980.0


In [None]:
data_sorted.reset_index()

Unnamed: 0,index,canonical_smiles,molecule_chembl_id,standard_type,standard_units,standard_value
0,7994,COc1cc2c(cc1OC)C(=O)/C(=C/c1ccc(OCCCCN[N+]3(C)...,CHEMBL4780352,IC50,nM,0.0
1,7166,O=C(CCCCCCNc1c2c(nc3ccccc13)CCCC2)NCCc1c[nH]c2...,CHEMBL199670,IC50,nM,0.000005
2,7183,O=C(CCCCCNc1c2c(nc3ccccc13)CCCC2)NCCc1c[nH]c2c...,CHEMBL370807,IC50,nM,0.000005
3,7182,COc1ccc2[nH]cc(CCNC(=O)CCCCCCNc3c4c(nc5cc(Cl)c...,CHEMBL372202,IC50,nM,0.000005
4,7181,COc1ccc2[nH]cc(CCNC(=O)CCCCCNc3c4c(nc5cc(Cl)cc...,CHEMBL4468781,IC50,nM,0.000005
...,...,...,...,...,...,...
6120,6318,C[n+]1ccccc1C=NO.[I-],CHEMBL14577,IC50,nM,995700.0
6121,5151,CN(CCCCCCCN1C(=O)c2ccccc2C1=O)Cc1ccccc1.Cl,CHEMBL3402709,IC50,nM,997.0
6122,6565,O=C(NCCCCCCNCc1ccc2ccccc2c1)c1cc(Cl)nc(Cl)c1,CHEMBL4163781,IC50,nM,99700.0
6123,5315,COc1ccc(C(=O)O[C@H]2CC[C@@]3(C)[C@@H](CC[C@@]4...,CHEMBL3586198,IC50,nM,9980.0


In [None]:
data_sorted

Unnamed: 0,canonical_smiles,molecule_chembl_id,standard_type,standard_units,standard_value
7994,COc1cc2c(cc1OC)C(=O)/C(=C/c1ccc(OCCCCN[N+]3(C)...,CHEMBL4780352,IC50,nM,0.0
7166,O=C(CCCCCCNc1c2c(nc3ccccc13)CCCC2)NCCc1c[nH]c2...,CHEMBL199670,IC50,nM,0.000005
7183,O=C(CCCCCNc1c2c(nc3ccccc13)CCCC2)NCCc1c[nH]c2c...,CHEMBL370807,IC50,nM,0.000005
7182,COc1ccc2[nH]cc(CCNC(=O)CCCCCCNc3c4c(nc5cc(Cl)c...,CHEMBL372202,IC50,nM,0.000005
7181,COc1ccc2[nH]cc(CCNC(=O)CCCCCNc3c4c(nc5cc(Cl)cc...,CHEMBL4468781,IC50,nM,0.000005
...,...,...,...,...,...
6318,C[n+]1ccccc1C=NO.[I-],CHEMBL14577,IC50,nM,995700.0
5151,CN(CCCCCCCN1C(=O)c2ccccc2C1=O)Cc1ccccc1.Cl,CHEMBL3402709,IC50,nM,997.0
6565,O=C(NCCCCCCNCc1ccc2ccccc2c1)c1cc(Cl)nc(Cl)c1,CHEMBL4163781,IC50,nM,99700.0
5315,COc1ccc(C(=O)O[C@H]2CC[C@@]3(C)[C@@H](CC[C@@]4...,CHEMBL3586198,IC50,nM,9980.0


In [None]:
 # Randomize the sorted DataFrame
data_clean = data_sorted.sample(frac=1).reset_index(drop=True)

In [None]:
data_clean

Unnamed: 0,canonical_smiles,molecule_chembl_id,standard_type,standard_units,standard_value
0,CC1=CC2Cc3nc4cc(Cl)ccc4c(NCc4ccc(CNC(=O)c5cc(O...,CHEMBL3234040,IC50,nM,18.2
1,CC1=CC2Cc3nc4cc(Cl)ccc4c(NCCCCCCCCC(=O)N4CCNC(...,CHEMBL4854913,IC50,nM,0.41
2,[Br-].[Br-].c1ccc2c(c1)ccc[n+]2CCCCCCCC[n+]1cc...,CHEMBL1761995,IC50,nM,80.0
3,CCN(CC)C(C)CN1c2ccccc2Sc2ccccc21.Cl,CHEMBL1200970,IC50,nM,1020000.0
4,Clc1ccc(CNC2CCN(Cc3ccccc3)C2)cc1,CHEMBL4760651,IC50,nM,178.0
...,...,...,...,...,...
6120,Clc1cccc2c1CN1CCCCCC1=N2,CHEMBL161880,IC50,nM,1700.0
6121,CCN(C)C(=O)Oc1ccc(-c2cc(=O)c3c(OC(=O)N(C)CC)cc...,CHEMBL4787451,IC50,nM,12300.0
6122,O=C(CC1CCN(Cc2ccc(F)cc2)CC1)Nc1n[nH]c2cc(-c3cc...,CHEMBL4859041,IC50,nM,518.0
6123,CO[C@H]1C=C2[C@H]3CN(Cc4cc5c(cc43)OCO5)[C@H]2C...,CHEMBL5090352,IC50,nM,100000.0


## 7. Label: Active, Inactive, Intermediate

In [None]:
data_clean['standard_value'].dtypes

dtype('O')

In [None]:
data_clean['standard_value'] = data_clean['standard_value'].astype('float64')

In [None]:
data_clean['standard_value'].dtypes

dtype('float64')

In [None]:
bio_class = []

for i in data_clean['standard_value']:
    if i <= 10000:
        bio_class.append('active')
    elif 10000 < i < 20000:
        bio_class.append('intermediate')
    else:
        bio_class.append('inactive')

In [None]:
data_clean['class'] = bio_class

In [None]:
data_clean

Unnamed: 0,canonical_smiles,molecule_chembl_id,standard_type,standard_units,standard_value,class
0,CC1=CC2Cc3nc4cc(Cl)ccc4c(NCc4ccc(CNC(=O)c5cc(O...,CHEMBL3234040,IC50,nM,18.20,active
1,CC1=CC2Cc3nc4cc(Cl)ccc4c(NCCCCCCCCC(=O)N4CCNC(...,CHEMBL4854913,IC50,nM,0.41,active
2,[Br-].[Br-].c1ccc2c(c1)ccc[n+]2CCCCCCCC[n+]1cc...,CHEMBL1761995,IC50,nM,80.00,active
3,CCN(CC)C(C)CN1c2ccccc2Sc2ccccc21.Cl,CHEMBL1200970,IC50,nM,1020000.00,inactive
4,Clc1ccc(CNC2CCN(Cc3ccccc3)C2)cc1,CHEMBL4760651,IC50,nM,178.00,active
...,...,...,...,...,...,...
6120,Clc1cccc2c1CN1CCCCCC1=N2,CHEMBL161880,IC50,nM,1700.00,active
6121,CCN(C)C(=O)Oc1ccc(-c2cc(=O)c3c(OC(=O)N(C)CC)cc...,CHEMBL4787451,IC50,nM,12300.00,intermediate
6122,O=C(CC1CCN(Cc2ccc(F)cc2)CC1)Nc1n[nH]c2cc(-c3cc...,CHEMBL4859041,IC50,nM,518.00,active
6123,CO[C@H]1C=C2[C@H]3CN(Cc4cc5c(cc43)OCO5)[C@H]2C...,CHEMBL5090352,IC50,nM,100000.00,inactive


In [None]:
data_clean.loc[data_clean['class'] == 'active']

Unnamed: 0,canonical_smiles,molecule_chembl_id,standard_type,standard_units,standard_value,class
0,CC1=CC2Cc3nc4cc(Cl)ccc4c(NCc4ccc(CNC(=O)c5cc(O...,CHEMBL3234040,IC50,nM,18.20,active
1,CC1=CC2Cc3nc4cc(Cl)ccc4c(NCCCCCCCCC(=O)N4CCNC(...,CHEMBL4854913,IC50,nM,0.41,active
2,[Br-].[Br-].c1ccc2c(c1)ccc[n+]2CCCCCCCC[n+]1cc...,CHEMBL1761995,IC50,nM,80.00,active
4,Clc1ccc(CNC2CCN(Cc3ccccc3)C2)cc1,CHEMBL4760651,IC50,nM,178.00,active
5,COc1cc2c(cc1OC)C(=O)C(Cc1cc[n+](Cc3cccc(Cl)c3)...,CHEMBL4102333,IC50,nM,3.27,active
...,...,...,...,...,...,...
6118,CCN(CC)CCCCCCCOc1ccc2c(c1)C(=O)/C(=C/c1ccc(CN(...,CHEMBL598824,IC50,nM,510.00,active
6119,O=C(CCCNc1c2c(nc3ccccc13)CCCC2)CCCNc1c2c(nc3cc...,CHEMBL238230,IC50,nM,1.83,active
6120,Clc1cccc2c1CN1CCCCCC1=N2,CHEMBL161880,IC50,nM,1700.00,active
6122,O=C(CC1CCN(Cc2ccc(F)cc2)CC1)Nc1n[nH]c2cc(-c3cc...,CHEMBL4859041,IC50,nM,518.00,active


In [None]:
data_clean.loc[data_clean['class'] == 'inactive']

Unnamed: 0,canonical_smiles,molecule_chembl_id,standard_type,standard_units,standard_value,class
3,CCN(CC)C(C)CN1c2ccccc2Sc2ccccc21.Cl,CHEMBL1200970,IC50,nM,1020000.0,inactive
11,CCN(CC)CCS/C(=N\O)C(=O)c1ccc(Cl)cc1.Cl,CHEMBL544022,IC50,nM,520000.0,inactive
12,OCCN1CCN(c2nccc(NC(c3ccccc3)c3ccccc3)n2)CC1,CHEMBL1834078,IC50,nM,21600.0,inactive
14,CO[C@@H]1C=C[C@@]23c4cc5c(cc4CN(CCc4ccccc4)[C@...,CHEMBL4159837,IC50,nM,200000.0,inactive
26,CC1(C)Oc2cc3oc(=O)ccc3cc2C[C@@H]1O,CHEMBL481657,IC50,nM,28000.0,inactive
...,...,...,...,...,...,...
6107,COc1ccc2c(c1O)CN1CCc3cc4c(cc3C1C2)OCO4,CHEMBL2314746,IC50,nM,100000.0,inactive
6108,O=C(CN1CCN(C(=O)c2ccc(Cl)cc2)CC1)N1CCN(CCCc2c[...,CHEMBL4796328,IC50,nM,22300.0,inactive
6111,CCN(CC)CCS/C(=N\O)c1nc(C(C)(C)C)no1,CHEMBL65891,IC50,nM,390000.0,inactive
6112,COc1ccc2c3c1O[C@H]1C[C@H](O)CC[C@@]31CCN(C)C2,CHEMBL4167958,IC50,nM,57900.0,inactive


In [None]:
data_clean.loc[data_clean['class'] == 'intermediate']

Unnamed: 0,canonical_smiles,molecule_chembl_id,standard_type,standard_units,standard_value,class
6,O=C(Nc1ccccc1)OC1CCN(C2Cc3ccccc3C2)CC1,CHEMBL4799709,IC50,nM,19200.0,intermediate
13,c1ccc(-c2ccc(NCN3CCCCC3)nc2)cc1,CHEMBL1084542,IC50,nM,13000.0,intermediate
43,COc1cccc(CN(C)CCOc2ccc3c(c2)oc(=O)c2ccccc23)c1,CHEMBL3335057,IC50,nM,18400.0,intermediate
107,COc1cccc(CN2CCN(CCOc3ccc4c(c3)oc(=O)c3ccccc34)...,CHEMBL3335061,IC50,nM,11400.0,intermediate
132,CN(CCCCNc1ccc(-c2ccccc2)nn1)Cc1ccccc1,CHEMBL1084257,IC50,nM,11000.0,intermediate
...,...,...,...,...,...,...
6065,COc1ccc2[nH]cc(CCNC(=O)C3CCN(Cc4ccccc4)CC3)c2c1,CHEMBL3824362,IC50,nM,12220.0,intermediate
6073,CCN(C)C(=O)Oc1ccc(/C=C/C(=O)N2CCN(Cc3ccccc3)CC...,CHEMBL4750360,IC50,nM,14800.0,intermediate
6084,CC(C)=CCC[C@]1(C)[C@@H](CC=C(C)C)C[C@@]2(CC=C(...,CHEMBL4126674,IC50,nM,18550.0,intermediate
6088,COc1cc(/C=C/C(=O)Nc2ccccc2)ccc1OCCCCCCCCCCN1CC...,CHEMBL4074258,IC50,nM,11110.0,intermediate


In [None]:
data_class_active_inactive = data_clean.loc[(data_clean['class'] == 'active') | (data_clean['class'] == 'inactive')]

In [None]:
data_class_active_inactive

Unnamed: 0,canonical_smiles,molecule_chembl_id,standard_type,standard_units,standard_value,class
0,CC1=CC2Cc3nc4cc(Cl)ccc4c(NCc4ccc(CNC(=O)c5cc(O...,CHEMBL3234040,IC50,nM,18.20,active
1,CC1=CC2Cc3nc4cc(Cl)ccc4c(NCCCCCCCCC(=O)N4CCNC(...,CHEMBL4854913,IC50,nM,0.41,active
2,[Br-].[Br-].c1ccc2c(c1)ccc[n+]2CCCCCCCC[n+]1cc...,CHEMBL1761995,IC50,nM,80.00,active
3,CCN(CC)C(C)CN1c2ccccc2Sc2ccccc21.Cl,CHEMBL1200970,IC50,nM,1020000.00,inactive
4,Clc1ccc(CNC2CCN(Cc3ccccc3)C2)cc1,CHEMBL4760651,IC50,nM,178.00,active
...,...,...,...,...,...,...
6119,O=C(CCCNc1c2c(nc3ccccc13)CCCC2)CCCNc1c2c(nc3cc...,CHEMBL238230,IC50,nM,1.83,active
6120,Clc1cccc2c1CN1CCCCCC1=N2,CHEMBL161880,IC50,nM,1700.00,active
6122,O=C(CC1CCN(Cc2ccc(F)cc2)CC1)Nc1n[nH]c2cc(-c3cc...,CHEMBL4859041,IC50,nM,518.00,active
6123,CO[C@H]1C=C2[C@H]3CN(Cc4cc5c(cc43)OCO5)[C@H]2C...,CHEMBL5090352,IC50,nM,100000.00,inactive


In [None]:
data_class_active_inactive = data_class_active_inactive.copy()
data_class_active_inactive['Label'] = data_class_active_inactive['class'].apply(lambda x: 1 if x == 'active' else 0)

In [None]:
data_class_active_inactive

Unnamed: 0,canonical_smiles,molecule_chembl_id,standard_type,standard_units,standard_value,class,Label
0,CC1=CC2Cc3nc4cc(Cl)ccc4c(NCc4ccc(CNC(=O)c5cc(O...,CHEMBL3234040,IC50,nM,18.20,active,1
1,CC1=CC2Cc3nc4cc(Cl)ccc4c(NCCCCCCCCC(=O)N4CCNC(...,CHEMBL4854913,IC50,nM,0.41,active,1
2,[Br-].[Br-].c1ccc2c(c1)ccc[n+]2CCCCCCCC[n+]1cc...,CHEMBL1761995,IC50,nM,80.00,active,1
3,CCN(CC)C(C)CN1c2ccccc2Sc2ccccc21.Cl,CHEMBL1200970,IC50,nM,1020000.00,inactive,0
4,Clc1ccc(CNC2CCN(Cc3ccccc3)C2)cc1,CHEMBL4760651,IC50,nM,178.00,active,1
...,...,...,...,...,...,...,...
6119,O=C(CCCNc1c2c(nc3ccccc13)CCCC2)CCCNc1c2c(nc3cc...,CHEMBL238230,IC50,nM,1.83,active,1
6120,Clc1cccc2c1CN1CCCCCC1=N2,CHEMBL161880,IC50,nM,1700.00,active,1
6122,O=C(CC1CCN(Cc2ccc(F)cc2)CC1)Nc1n[nH]c2cc(-c3cc...,CHEMBL4859041,IC50,nM,518.00,active,1
6123,CO[C@H]1C=C2[C@H]3CN(Cc4cc5c(cc43)OCO5)[C@H]2C...,CHEMBL5090352,IC50,nM,100000.00,inactive,0


## 8. IC50 to pIC50

In [None]:
data_class_active_inactive.sort_values(by='standard_value', ascending=True)

Unnamed: 0,canonical_smiles,molecule_chembl_id,standard_type,standard_units,standard_value,class,Label
4151,COc1cc2c(cc1OC)C(=O)/C(=C/c1ccc(OCCCCN[N+]3(C)...,CHEMBL4780352,IC50,nM,0.000000e+00,active,1
5650,O=C(CCCCCCNc1c2c(nc3cc(Cl)ccc13)CCCC2)NCCc1c[n...,CHEMBL381499,IC50,nM,5.000000e-06,active,1
3250,O=C(CCCCCCCNc1c2c(nc3ccccc13)CCCC2)NCCc1c[nH]c...,CHEMBL4556734,IC50,nM,5.000000e-06,active,1
3253,O=C(CCCCCCNc1c2c(nc3cc(Cl)cc(Cl)c13)CCCC2)NCCc...,CHEMBL199585,IC50,nM,5.000000e-06,active,1
5686,COc1ccc2[nH]cc(CCNC(=O)CCCCCNc3c4c(nc5ccccc35)...,CHEMBL199454,IC50,nM,5.000000e-06,active,1
...,...,...,...,...,...,...,...
699,CCSC(=O)OCC[N+](C)(C)C.[Cl-],CHEMBL102637,IC50,nM,1.000000e+07,inactive,0
3024,C[N+]1(C)CCO[C@@](O)(c2ccc(C#N)cc2)C1,CHEMBL334938,IC50,nM,1.158777e+07,inactive,0
420,COC(=O)C1=C(Nc2ccc(F)cc2)C[C@@H](c2ccccc2)N(c2...,CHEMBL3597055,IC50,nM,1.512000e+07,inactive,0
4468,O=C(NP(=O)(NN1CCOCC1)NN1CCOCC1)c1ccccc1,CHEMBL1276534,IC50,nM,1.862000e+07,inactive,0


In [None]:
data_class_active_inactive.loc[data_class_active_inactive['standard_value'] == 0]

Unnamed: 0,canonical_smiles,molecule_chembl_id,standard_type,standard_units,standard_value,class,Label
4151,COc1cc2c(cc1OC)C(=O)/C(=C/c1ccc(OCCCCN[N+]3(C)...,CHEMBL4780352,IC50,nM,0.0,active,1


In [None]:
# Replace 0 with a small value to avoid log(0)
data_class_active_inactive['standard_value'] = data_class_active_inactive['standard_value'].replace(0,5.000000e-06)

In [None]:
pIC50 = []

for value in data_class_active_inactive['standard_value']:
    molar = value*(1**-9)
    pIC50.append(-np.log10(molar))

In [None]:
data_class_active_inactive['pIC50'] = pIC50

In [None]:
data_class_active_inactive.sort_values(by='pIC50', ascending=True)

Unnamed: 0,canonical_smiles,molecule_chembl_id,standard_type,standard_units,standard_value,class,Label,pIC50
5037,CN(CCCCCCCOc1ccc2c(c1)O/C(=C\c1cccc3ccccc13)C2...,CHEMBL371798,IC50,nM,4.960000e+07,inactive,0,-7.695482
4468,O=C(NP(=O)(NN1CCOCC1)NN1CCOCC1)c1ccccc1,CHEMBL1276534,IC50,nM,1.862000e+07,inactive,0,-7.269980
420,COC(=O)C1=C(Nc2ccc(F)cc2)C[C@@H](c2ccccc2)N(c2...,CHEMBL3597055,IC50,nM,1.512000e+07,inactive,0,-7.179552
3024,C[N+]1(C)CCO[C@@](O)(c2ccc(C#N)cc2)C1,CHEMBL334938,IC50,nM,1.158777e+07,inactive,0,-7.064000
699,CCSC(=O)OCC[N+](C)(C)C.[Cl-],CHEMBL102637,IC50,nM,1.000000e+07,inactive,0,-7.000000
...,...,...,...,...,...,...,...,...
2400,O=C(CCCCCCCNc1c2c(nc3cc(Cl)cc(Cl)c13)CCCC2)NCC...,CHEMBL4590945,IC50,nM,5.000000e-06,active,1,5.301030
271,S=C(CCCCCNc1c2c(nc3ccccc13)CCCC2)NCCc1c[nH]c2c...,CHEMBL4469239,IC50,nM,5.000000e-06,active,1,5.301030
1883,O=C(CCCCCCCNc1c2c(nc3ccccc13)CCCC2)NCCc1c[nH]c...,CHEMBL4579667,IC50,nM,5.000000e-06,active,1,5.301030
5478,COc1ccc2[nH]cc(CCNC(=O)CCCCNc3c4c(nc5ccccc35)C...,CHEMBL4455677,IC50,nM,5.000000e-06,active,1,5.301030


In [None]:
data_class_active_inactive.drop(['standard_type', 'standard_units', 'standard_value'], axis=1, inplace=True)

In [None]:
data_class_active_inactive

Unnamed: 0,canonical_smiles,molecule_chembl_id,class,Label,pIC50
0,CC1=CC2Cc3nc4cc(Cl)ccc4c(NCc4ccc(CNC(=O)c5cc(O...,CHEMBL3234040,active,1,-1.260071
1,CC1=CC2Cc3nc4cc(Cl)ccc4c(NCCCCCCCCC(=O)N4CCNC(...,CHEMBL4854913,active,1,0.387216
2,[Br-].[Br-].c1ccc2c(c1)ccc[n+]2CCCCCCCC[n+]1cc...,CHEMBL1761995,active,1,-1.903090
3,CCN(CC)C(C)CN1c2ccccc2Sc2ccccc21.Cl,CHEMBL1200970,inactive,0,-6.008600
4,Clc1ccc(CNC2CCN(Cc3ccccc3)C2)cc1,CHEMBL4760651,active,1,-2.250420
...,...,...,...,...,...
6119,O=C(CCCNc1c2c(nc3ccccc13)CCCC2)CCCNc1c2c(nc3cc...,CHEMBL238230,active,1,-0.262451
6120,Clc1cccc2c1CN1CCCCCC1=N2,CHEMBL161880,active,1,-3.230449
6122,O=C(CC1CCN(Cc2ccc(F)cc2)CC1)Nc1n[nH]c2cc(-c3cc...,CHEMBL4859041,active,1,-2.714330
6123,CO[C@H]1C=C2[C@H]3CN(Cc4cc5c(cc43)OCO5)[C@H]2C...,CHEMBL5090352,inactive,0,-5.000000


In [None]:
data_class_active_inactive.set_index('molecule_chembl_id', inplace=True)

In [None]:
data_class_active_inactive

Unnamed: 0_level_0,canonical_smiles,class,Label,pIC50
molecule_chembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CHEMBL3234040,CC1=CC2Cc3nc4cc(Cl)ccc4c(NCc4ccc(CNC(=O)c5cc(O...,active,1,-1.260071
CHEMBL4854913,CC1=CC2Cc3nc4cc(Cl)ccc4c(NCCCCCCCCC(=O)N4CCNC(...,active,1,0.387216
CHEMBL1761995,[Br-].[Br-].c1ccc2c(c1)ccc[n+]2CCCCCCCC[n+]1cc...,active,1,-1.903090
CHEMBL1200970,CCN(CC)C(C)CN1c2ccccc2Sc2ccccc21.Cl,inactive,0,-6.008600
CHEMBL4760651,Clc1ccc(CNC2CCN(Cc3ccccc3)C2)cc1,active,1,-2.250420
...,...,...,...,...
CHEMBL238230,O=C(CCCNc1c2c(nc3ccccc13)CCCC2)CCCNc1c2c(nc3cc...,active,1,-0.262451
CHEMBL161880,Clc1cccc2c1CN1CCCCCC1=N2,active,1,-3.230449
CHEMBL4859041,O=C(CC1CCN(Cc2ccc(F)cc2)CC1)Nc1n[nH]c2cc(-c3cc...,active,1,-2.714330
CHEMBL5090352,CO[C@H]1C=C2[C@H]3CN(Cc4cc5c(cc43)OCO5)[C@H]2C...,inactive,0,-5.000000


## 9. Save Data

In [None]:
data_class_active_inactive.to_csv('data_class_active_inactive.csv', index=False)

In [None]:
data_class_active_inactive['canonical_smiles'].to_csv('data_labelled.smi', header=False, sep='\t')