<a href="https://colab.research.google.com/github/Pavalya-Periyasamy05/Machine-Learning-and-AI/blob/main/CDD_Download_Bioactivity_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Computational Drug Discovery: Download Bioactivity Data **

# **ChEMBL Database**

## Installing libraries

In [70]:
! pip install chembl_webresource_client # This library will allow to download the biological activity data directly from the chembl database



## Importing necessary libraries


In [71]:
import pandas as pd
from chembl_webresource_client.new_client import new_client

## **Search for Target protein**

## **Target search for cancer**

### KRAS (Kirsten Rat Sarcoma Viral Oncogene Homolog) is a small GTPase protein that acts as a molecular switch in cell signaling, controlling cell growth, differentiation, and survival. Mutations in KRAS, such as G12C, are linked to cancers, making it a key target for drug bioactivity studies.

In [72]:
#target = new_client.target
target_query = new_client.target.search("KRAS")
targets = pd.DataFrame.from_dict(target_query) # from_dic tells the  pandas that data isin dictionary format
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,[],Homo sapiens,GTPase KRas,19.0,False,CHEMBL2189121,"[{'accession': 'P01116', 'component_descriptio...",SINGLE PROTEIN,9606
1,[],Homo sapiens,Protein cereblon-KRAS,19.0,False,CHEMBL5483196,"[{'accession': 'P01116', 'component_descriptio...",PROTEIN-PROTEIN INTERACTION,9606
2,[],Homo sapiens,von Hippel-Lindau disease tumor suppressor/KRAS,18.0,False,CHEMBL5169273,"[{'accession': 'P01116', 'component_descriptio...",PROTEIN-PROTEIN INTERACTION,9606
3,[],Homo sapiens,SOS1-KRAS,18.0,False,CHEMBL5465393,"[{'accession': 'Q07889', 'component_descriptio...",PROTEIN COMPLEX,9606
4,[],Homo sapiens,PDE6D/KRAS,17.0,False,CHEMBL4523623,"[{'accession': 'O43924', 'component_descriptio...",PROTEIN-PROTEIN INTERACTION,9606
5,[],Homo sapiens,GTPase KRas/RAF1,17.0,False,CHEMBL5291977,"[{'accession': 'P04049', 'component_descriptio...",PROTEIN-PROTEIN INTERACTION,9606
6,[],Homo sapiens,RAS,15.0,False,CHEMBL4524006,"[{'accession': 'P01112', 'component_descriptio...",PROTEIN FAMILY,9606


### Single protein target for further investigation

## Retrieve bioactivity data for the cancer-associated GTPase KRAS (first entry)

In [73]:
selected_target = targets.target_chembl_id[0] # assign first entry to the selected_target variable
selected_target

'CHEMBL2189121'

## Filter compounds reported as IC50 values in nM

In [74]:
res = new_client.activity.filter(target_chembl_id = selected_target).filter(standard_type ="IC50") # IC50 standard type filter is to make the data uniform by excluding EC50 and percent activity std types.

In [75]:
df = pd.DataFrame.from_dict(res)

In [76]:
df.head(3)

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,13352855,[],CHEMBL2399318,Inhibition of full-length human KRas4B (amino ...,B,,,BAO_0000190,...,Homo sapiens,GTPase KRas,9606,,,IC50,uM,UO_0000065,,155.0
1,,,13352856,[],CHEMBL2399319,Inhibition of full-length human KRas4B (amino ...,B,,,BAO_0000190,...,Homo sapiens,GTPase KRas,9606,,,IC50,uM,UO_0000065,,342.0
2,,,14548911,[],CHEMBL3223233,Inhibition of recombinant HA-tagged K-Ras G12V...,B,P01116,G12V,BAO_0000190,...,Homo sapiens,GTPase KRas,9606,,,IC50,uM,UO_0000065,,1.2


In [77]:
# Sanity check to see whether the index needs to be reset.
print(len(df))
df.index.min(), df.index.max()

5760


(0, 5759)

In [78]:
# Sanity check if the data has not onlt IC50 std type
df.standard_type.unique()


array(['IC50'], dtype=object)

### Saving the bioactivity data to a  csv format named **bioactivity_data.csv**

In [79]:
df.to_csv("bioactivity_data.csv", index=False) # False prevents the default Python (pandas) index from being added to the data.

## Copying file to Google Drive

In [80]:
# Mount Drive
from google.colab import drive
drive.mount("/content/drive", force_remount=True) #Here, force_remount unmounts the Drive if it’s already connected and then remounts it, ensuring any recent changes in Drive are refreshed and accessible.

Mounted at /content/drive


### Shell-based commands

In [81]:
# Creating Data folder in Colab Notebooks on google drive
! mkdir "/content/drive/My Drive/Colab Notebooks/Data" #Exclamation mark is required for running shell-based commands

mkdir: cannot create directory ‘/content/drive/My Drive/Colab Notebooks/Data’: File exists


In [82]:
# Sanity check
! ls  # list the contents of the current opened folder.
! ls "/content/drive/My Drive/Colab Notebooks/"
! ls -l "/content/drive/My Drive/Colab Notebooks/" # dash l shows the time and data at which it is created

bioactivity_data.csv  drive  sample_data
 CDD_Download_Bioactivity_Data.ipynb	       'ML_Classification (1).ipynb'
'Copy of CDD_Download_Bioactivity_Data.ipynb'   ML_Classification.ipynb
 Data					        Untitled0.ipynb
 Exploratory_Data_Analysis.ipynb	        Untitled1.ipynb
 First_Project.ipynb			        Untitled2.ipynb
total 510
-rw------- 1 root root  25977 Dec 27 07:27  CDD_Download_Bioactivity_Data.ipynb
-rw------- 1 root root 110478 Dec 29 13:21 'Copy of CDD_Download_Bioactivity_Data.ipynb'
drwx------ 2 root root   4096 Dec 30 17:50  Data
-rw------- 1 root root   9938 Dec 30 06:27  Exploratory_Data_Analysis.ipynb
-rw------- 1 root root 268271 Dec 25 12:41  First_Project.ipynb
-rw------- 1 root root   1005 Dec 25 18:05 'ML_Classification (1).ipynb'
-rw------- 1 root root  85254 Dec 26 03:47  ML_Classification.ipynb
-rw------- 1 root root    773 May 26  2021  Untitled0.ipynb
-rw------- 1 root root   3231 May 26  2021  Untitled1.ipynb
-rw------- 1 root root  10881 Nov 11  2024  Un

In [83]:
# Copying Data to Drive
! cp bioactivity_data.csv "/content/drive/My Drive/Colab Notebooks/Data"

In [84]:
! head bioactivity_data.csv

action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,bao_label,canonical_smiles,data_validity_comment,data_validity_description,document_chembl_id,document_journal,document_year,ligand_efficiency,molecule_chembl_id,molecule_pref_name,parent_molecule_chembl_id,pchembl_value,potential_duplicate,qudt_units,record_id,relation,src_id,standard_flag,standard_relation,standard_text_value,standard_type,standard_units,standard_upper_value,standard_value,target_chembl_id,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
,,13352855,[],CHEMBL2399318,Inhibition of full-length human KRas4B (amino acids 1 to 188)-SOS interaction assessed as inhibition of SOS-mediated nucleotide release activity,B,,,BAO_0000190,BAO_0000357,single protein format,Cc1[nH]c2cc(Cl)cc(Cl)c2c1CCN,Outside typical range,"Values for this activity type are u

## **Handling missing data**

### Drop any compounds that has missing value for the standard value column

In [85]:
print(df["standard_value"].isna().sum()) # gives the count for column "standard_type"
print(df["standard_value"].isna().any()) # Boolean: True if column "standard_value" has NaN

22
True


In [86]:
df2 = df[df.standard_value.notna()] # gives filtered dataframe
df2
print(len(df2))
df2.index.min(), df2.index.max()

5738


(np.int64(0), np.int64(5759))

In [87]:
# sanity check fot a specific column
print(df2["standard_value"].isna().sum()) # gives the count for column "standard_type"
print(df2["standard_value"].isna().any()) # Boolean: True if column "standard_value" has NaN

# sanity check across all columns
#print(df2.isna().any().any()) # Returns single true.isna() → marks NaN as True. First .any() → checks column-wise.Second .any() → checks across all columns.
#print(df2.isna().sum()) # gives count for all columns individually.

#Filter columns (prints columns which have NAN)
# print(df2.loc[:,df2.isna().any()]) # When filtering columns, .loc with : is required because df2[..] selects rows by default.

#Filter Rows (prints rows which have NAN)
#print(df2[df2.isna().any(axis=1)]) # rows that contain at least one NaN.

0
False


In [88]:
# Reset the index
df2 = df2.reset_index(drop=True)
# sanity check
print(len(df2))
df2.index.min(), df2.index.max()

5738


(0, 5737)

## Drop any compounds that has missing values for canonical notation

In [89]:
# Sanity check
print(df2["canonical_smiles"].isna().sum()) # gives the count for column "canonical_smiles"
print(df2["canonical_smiles"].isna().any()) # Boolean: True if column "canonical_smiles"  has NaN

8
True


In [90]:
# Filter
df2 = df2[df2.canonical_smiles.notna()] # gives filtered dataframe
df2
print(len(df2))
df2.index.min(), df2.index.max()

5730


(np.int64(0), np.int64(5737))

In [91]:
# Reset the index
df2 = df2.reset_index(drop=True)
# sanity check
print(len(df2))
df2.index.min(), df2.index.max()

5730


(0, 5729)

In [92]:
# Sanity check
print(df2["canonical_smiles"].isna().sum()) # gives the count for column "canonical_smiles"
print(df2["canonical_smiles"].isna().any()) # Boolean: True if column "canonical_smiles" has NaN

0
False


## **Drop any compounds that has missing values for chembl id **

In [94]:
# sanity check
print(df2["molecule_chembl_id"].isna().any())
print(df2["molecule_chembl_id"].isna().sum())

False
0


In [96]:
# Filter
# notna(): isna() can miss missing values when they appear in string/object columns, as missing entries may not always be stored as numeric NaNs. notna() reliably filters out all missing values regardless of data type.
df2 = df2[df2.molecule_chembl_id.notna()] # gives filtered dataframe
df2
print(len(df2))
df2.index.min(), df2.index.max()

5730


(0, 5729)

## Data pre-processing of the bioactivity data

### Labeling compounds as either being active, inactive or intermediate

### The bioactivity data is in IC50 unit. Compounds having values of less than 1000nM will be considered to be **active** while those greater than 10,000 will be considered to be **inactive**. As for those values in between 1000 and 10,000 nM will be referred to be as **intermediate**. (1 micromolar = 1000 nanomolar (nM) )

In [97]:
bioactivity_class = []
for i in df2.standard_value:
  if float(i) >= 10000:
    bioactivity_class.append("inactive")
  elif float(i) <= 1000:
    bioactivity_class.append("active")
  else:
    bioactivity_class.append("intermediate")

In [98]:
len(bioactivity_class) #total items in a list

5730

## DataFrame curation using key X features (molecule_chembl_id, canonical_smiles, and standard_value) and the Y feature (bioactivity_class).

## Iterate the molecule_chembl_id to a list

In [99]:
mol_cid =[]
for i in df2.molecule_chembl_id:
  mol_cid.append(i)

## Iterate canonical_smiles to a list

In [100]:
canonical_smiles = []
for i in df2.canonical_smiles:
  canonical_smiles.append(i)

## Iterate standrad_value to a list

In [101]:
standard_value = []
for i in df.standard_value:
  standard_value.append(i)

## Tuples were used to efficiently collect and align elements from all the generated lists.

In [102]:
data_tuples = list(zip(mol_cid, canonical_smiles, bioactivity_class, standard_value)) # Zip, creates tuples row-wise. list, collects all tuples into a list.
df3 = pd.DataFrame(data_tuples, columns =["molecule_chembl_id", "canonical_smiles", "bioactivity_data", "standard_value"])

In [103]:
df3

Unnamed: 0,molecule_chembl_id,canonical_smiles,bioactivity_data,standard_value
0,CHEMBL2396992,Cc1[nH]c2cc(Cl)cc(Cl)c2c1CCN,inactive,155000.0
1,CHEMBL2396992,Cc1[nH]c2cc(Cl)cc(Cl)c2c1CCN,inactive,342000.0
2,CHEMBL3218635,CC(C)[C@@H]1NC(=O)[C@@H](CC(N)=O)NC(=O)[C@H](C...,intermediate,1200.0
3,CHEMBL3218636,CCCC[C@H]1NC(=O)[C@@H](Cc2ccc3ccccc3c2)NC(=O)[...,inactive,10000.0
4,CHEMBL3218637,CC1(C)COC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H](CCCNC...,intermediate,1800.0
...,...,...,...,...
5725,CHEMBL5768373,C=CC(=O)N1CC2(CC(n3nc(-c4cccc(=O)n4C)c(-c4c(Cl...,intermediate,536.0
5726,CHEMBL5768373,C=CC(=O)N1CC2(CC(n3nc(-c4cccc(=O)n4C)c(-c4c(Cl...,inactive,50000.0
5727,CHEMBL6000934,C=CC(=O)N1CC2(CC(n3nc(-c4cc(C)nn4CCN(C)C)c(-c4...,intermediate,335.0
5728,CHEMBL6000934,C=CC(=O)N1CC2(CC(n3nc(-c4cc(C)nn4CCN(C)C)c(-c4...,inactive,21600.0


## Alternative method

In [106]:
selection = ["molecule_chembl_id", "canonical_smiles", "standard_value"]
df3 = df2[selection]
df3
print(df3.index[:10])
print(pd.Series(bioactivity_class).index[:10])
print(df3.index.is_unique)
print(df3.index.min(), df3.index.max())


RangeIndex(start=0, stop=10, step=1)
RangeIndex(start=0, stop=10, step=1)
True
0 5729


In [107]:
# Rename the class column from 0 to bioactivity_class

df3 = pd.concat([df3, pd.Series(bioactivity_class, name ="bioactivity_class")], axis=1)
df3

# or

#df3 = pd.concat([df3, pd.Series(bioactivity_class)], axis=1) # pd.Series: concat expects in series or DataFrame format
#df3.rename(columns ={0:"bioactivity_class"})


Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,bioactivity_class
0,CHEMBL2396992,Cc1[nH]c2cc(Cl)cc(Cl)c2c1CCN,155000.0,inactive
1,CHEMBL2396992,Cc1[nH]c2cc(Cl)cc(Cl)c2c1CCN,342000.0,inactive
2,CHEMBL3218635,CC(C)[C@@H]1NC(=O)[C@@H](CC(N)=O)NC(=O)[C@H](C...,1200.0,intermediate
3,CHEMBL3218636,CCCC[C@H]1NC(=O)[C@@H](Cc2ccc3ccccc3c2)NC(=O)[...,10000.0,inactive
4,CHEMBL3218637,CC1(C)COC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H](CCCNC...,1800.0,intermediate
...,...,...,...,...
5725,CHEMBL5768373,C=CC(=O)N1CC2(CC(n3nc(-c4cccc(=O)n4C)c(-c4c(Cl...,5300.0,intermediate
5726,CHEMBL5768373,C=CC(=O)N1CC2(CC(n3nc(-c4cccc(=O)n4C)c(-c4c(Cl...,25300.0,inactive
5727,CHEMBL6000934,C=CC(=O)N1CC2(CC(n3nc(-c4cc(C)nn4CCN(C)C)c(-c4...,1400.0,intermediate
5728,CHEMBL6000934,C=CC(=O)N1CC2(CC(n3nc(-c4cc(C)nn4CCN(C)C)c(-c4...,43000.0,inactive


In [108]:
df3

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,bioactivity_class
0,CHEMBL2396992,Cc1[nH]c2cc(Cl)cc(Cl)c2c1CCN,155000.0,inactive
1,CHEMBL2396992,Cc1[nH]c2cc(Cl)cc(Cl)c2c1CCN,342000.0,inactive
2,CHEMBL3218635,CC(C)[C@@H]1NC(=O)[C@@H](CC(N)=O)NC(=O)[C@H](C...,1200.0,intermediate
3,CHEMBL3218636,CCCC[C@H]1NC(=O)[C@@H](Cc2ccc3ccccc3c2)NC(=O)[...,10000.0,inactive
4,CHEMBL3218637,CC1(C)COC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H](CCCNC...,1800.0,intermediate
...,...,...,...,...
5725,CHEMBL5768373,C=CC(=O)N1CC2(CC(n3nc(-c4cccc(=O)n4C)c(-c4c(Cl...,5300.0,intermediate
5726,CHEMBL5768373,C=CC(=O)N1CC2(CC(n3nc(-c4cccc(=O)n4C)c(-c4c(Cl...,25300.0,inactive
5727,CHEMBL6000934,C=CC(=O)N1CC2(CC(n3nc(-c4cc(C)nn4CCN(C)C)c(-c4...,1400.0,intermediate
5728,CHEMBL6000934,C=CC(=O)N1CC2(CC(n3nc(-c4cc(C)nn4CCN(C)C)c(-c4...,43000.0,inactive


## Sanity check: Duplicated chembl ID


In [109]:
df3['molecule_chembl_id'].duplicated().any() # Check if there are any duplicate ChEMBL IDs at all

np.True_

In [110]:
df3['molecule_chembl_id'].duplicated().sum() # See how many duplicate entries (extra occurrences) exist, excluding the first occurrence.

np.int64(2979)

In [111]:
df3['molecule_chembl_id'][df3['molecule_chembl_id'].duplicated()].nunique() #How many unique (first-occurrence) ChEMBL IDs appear more than once in the dataset

1708

In [112]:
df3_unique = df3.drop_duplicates(subset=['molecule_chembl_id'], keep='first') # remove all extra occurrences but keep the first occurrence of each ChEMBL ID.


In [113]:
# sanity check
df3_unique['molecule_chembl_id'].duplicated().any()

np.False_

In [114]:
# sanity check
print(df3_unique.index.min(), df3_unique.index.max())
print(len(df3_unique))

# Rest index


0 5716
2751


In [116]:
df4 = df3_unique.reset_index(drop=True)
# or

#df3_unique.reset_index(drop=True, inplace= True) # No need to create a new variable df4
#sanity check
print(df4.index.min(), df4.index.max())
print(len(df4))

0 2750
2751


## Save dataframe to CSV file

In [117]:
df4.to_csv("bioactivity_preprocessed_data.csv", index=False)

In [118]:
! ls -l

total 8796
-rw-r--r-- 1 root root 8618041 Dec 30 18:14 bioactivity_data.csv
-rw-r--r-- 1 root root  374626 Dec 30 18:23 bioactivity_preprocessed_data.csv
drwx------ 6 root root    4096 Dec 30 18:14 drive
drwxr-xr-x 1 root root    4096 Dec 11 14:34 sample_data


## Copy the bioactivity preprocessed data dataframe to Google Drive

In [119]:
! cp bioactivity_preprocessed_data.csv "/content/drive/My Drive/Colab Notebooks/Data"

In [120]:
! ls -l "/content/drive/My Drive/Colab Notebooks/Data"

total 8783
-rw------- 1 root root 8618041 Dec 30 18:15 bioactivity_data.csv
-rw------- 1 root root  374626 Dec 30 18:23 bioactivity_preprocessed_data.csv


###