<a href="https://colab.research.google.com/github/Pavalya-Periyasamy05/Machine-Learning-and-AI/blob/main/CDD_Download_Bioactivity_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Computational Drug Discovery: Download Bioactivity Data **

# **ChEMBL Database**

## Installing libraries

In [72]:
pip install chembl_webresource_client # This library will allow to download the biological activity data directly from the chembl database



## Importing necessary libraries


In [73]:
import pandas as pd
from chembl_webresource_client.new_client import new_client

## **Search for Target protein**

## **Target search for cancer**

### KRAS (Kirsten Rat Sarcoma Viral Oncogene Homolog) is a small GTPase protein that acts as a molecular switch in cell signaling, controlling cell growth, differentiation, and survival. Mutations in KRAS, such as G12C, are linked to cancers, making it a key target for drug bioactivity studies.

In [74]:
#target = new_client.target
target_query = new_client.target.search("KRAS")
targets = pd.DataFrame.from_dict(target_query) # from_dic tells the  pandas that data isin dictionary format
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,[],Homo sapiens,GTPase KRas,19.0,False,CHEMBL2189121,"[{'accession': 'P01116', 'component_descriptio...",SINGLE PROTEIN,9606
1,[],Homo sapiens,Protein cereblon-KRAS,19.0,False,CHEMBL5483196,"[{'accession': 'P01116', 'component_descriptio...",PROTEIN-PROTEIN INTERACTION,9606
2,[],Homo sapiens,von Hippel-Lindau disease tumor suppressor/KRAS,18.0,False,CHEMBL5169273,"[{'accession': 'P01116', 'component_descriptio...",PROTEIN-PROTEIN INTERACTION,9606
3,[],Homo sapiens,SOS1-KRAS,18.0,False,CHEMBL5465393,"[{'accession': 'Q07889', 'component_descriptio...",PROTEIN COMPLEX,9606
4,[],Homo sapiens,PDE6D/KRAS,17.0,False,CHEMBL4523623,"[{'accession': 'O43924', 'component_descriptio...",PROTEIN-PROTEIN INTERACTION,9606
5,[],Homo sapiens,GTPase KRas/RAF1,17.0,False,CHEMBL5291977,"[{'accession': 'P04049', 'component_descriptio...",PROTEIN-PROTEIN INTERACTION,9606
6,[],Homo sapiens,RAS,15.0,False,CHEMBL4524006,"[{'accession': 'P01112', 'component_descriptio...",PROTEIN FAMILY,9606


### Single protein target for further investigation

## Retrieve bioactivity data for the cancer-associated GTPase KRAS (first entry)

In [75]:
selected_target = targets.target_chembl_id[0] # assign first entry to the selected_target variable
selected_target

'CHEMBL2189121'

In [76]:
res = new_client.activity.filter(target_chembl_id = selected_target).filter(standard_type ="IC50") # IC50 standard type filter is to make the data uniform by excluding EC50 and percent activity std types.

In [77]:
df = pd.DataFrame.from_dict(res)

In [78]:
df.head(3)

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,13352855,[],CHEMBL2399318,Inhibition of full-length human KRas4B (amino ...,B,,,BAO_0000190,...,Homo sapiens,GTPase KRas,9606,,,IC50,uM,UO_0000065,,155.0
1,,,13352856,[],CHEMBL2399319,Inhibition of full-length human KRas4B (amino ...,B,,,BAO_0000190,...,Homo sapiens,GTPase KRas,9606,,,IC50,uM,UO_0000065,,342.0
2,,,14548911,[],CHEMBL3223233,Inhibition of recombinant HA-tagged K-Ras G12V...,B,P01116,G12V,BAO_0000190,...,Homo sapiens,GTPase KRas,9606,,,IC50,uM,UO_0000065,,1.2


In [79]:
# Sanity check if the data has not onlt IC50 std type
df.standard_type.unique()

array(['IC50'], dtype=object)

### Saving the bioactivity data to a  csv format named **bioactivity_data.csv**

In [80]:
df.to_csv("bioactivity_data.csv", index=False) # False prevents the default Python (pandas) index from being added to the data.

### Copying file to Google Drive

In [81]:
# Mount Drive
from google.colab import drive
drive.mount("/content/drive", force_remount=True) #Here, force_remount unmounts the Drive if it’s already connected and then remounts it, ensuring any recent changes in Drive are refreshed and accessible.

Mounted at /content/drive


### Shell-based commands

In [82]:
# Creating Data folder in Colab Notebooks on google drive
! mkdir "/content/drive/My Drive/Colab Notebooks/Data" #Exclamation mark is required for running shell-based commands

mkdir: cannot create directory ‘/content/drive/My Drive/Colab Notebooks/Data’: File exists


In [83]:
# Sanity check
! ls  # list the contents of the current opened folder.
! ls "/content/drive/My Drive/Colab Notebooks/"
! ls -l "/content/drive/My Drive/Colab Notebooks/" # dash l shows the time and data at which it is created

bioactivity_data.csv  drive  sample_data
 CDD_Download_Bioactivity_Data.ipynb   ML_Classification.ipynb
 Data				       Untitled0.ipynb
 First_Project.ipynb		       Untitled1.ipynb
'ML_Classification (1).ipynb'	       Untitled2.ipynb
total 392
-rw------- 1 root root  25977 Dec 27 07:27  CDD_Download_Bioactivity_Data.ipynb
drwx------ 2 root root   4096 Dec 28 08:55  Data
-rw------- 1 root root 268271 Dec 25 12:41  First_Project.ipynb
-rw------- 1 root root   1005 Dec 25 18:05 'ML_Classification (1).ipynb'
-rw------- 1 root root  85254 Dec 26 03:47  ML_Classification.ipynb
-rw------- 1 root root    773 May 26  2021  Untitled0.ipynb
-rw------- 1 root root   3231 May 26  2021  Untitled1.ipynb
-rw------- 1 root root  10881 Nov 11  2024  Untitled2.ipynb


In [84]:
# Copying Data to Drive
! cp bioactivity_data.csv "/content/drive/My Drive/Colab Notebooks/Data"

In [85]:
! head bioactivity_data.csv

action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,bao_label,canonical_smiles,data_validity_comment,data_validity_description,document_chembl_id,document_journal,document_year,ligand_efficiency,molecule_chembl_id,molecule_pref_name,parent_molecule_chembl_id,pchembl_value,potential_duplicate,qudt_units,record_id,relation,src_id,standard_flag,standard_relation,standard_text_value,standard_type,standard_units,standard_upper_value,standard_value,target_chembl_id,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
,,13352855,[],CHEMBL2399318,Inhibition of full-length human KRas4B (amino acids 1 to 188)-SOS interaction assessed as inhibition of SOS-mediated nucleotide release activity,B,,,BAO_0000190,BAO_0000357,single protein format,Cc1[nH]c2cc(Cl)cc(Cl)c2c1CCN,Outside typical range,"Values for this activity type are u

## **Handling missing data**

### Drop any compounds that has missing value for the standard value column

In [86]:
df2 = df[df.standard_value.notna()] # gives filtered dataframe
df2

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,13352855,[],CHEMBL2399318,Inhibition of full-length human KRas4B (amino ...,B,,,BAO_0000190,...,Homo sapiens,GTPase KRas,9606,,,IC50,uM,UO_0000065,,155.0
1,,,13352856,[],CHEMBL2399319,Inhibition of full-length human KRas4B (amino ...,B,,,BAO_0000190,...,Homo sapiens,GTPase KRas,9606,,,IC50,uM,UO_0000065,,342.0
2,,,14548911,[],CHEMBL3223233,Inhibition of recombinant HA-tagged K-Ras G12V...,B,P01116,G12V,BAO_0000190,...,Homo sapiens,GTPase KRas,9606,,,IC50,uM,UO_0000065,,1.2
3,,,14548912,[],CHEMBL3223233,Inhibition of recombinant HA-tagged K-Ras G12V...,B,P01116,G12V,BAO_0000190,...,Homo sapiens,GTPase KRas,9606,,,IC50,uM,UO_0000065,,10.0
4,,,14548913,[],CHEMBL3223233,Inhibition of recombinant HA-tagged K-Ras G12V...,B,P01116,G12V,BAO_0000190,...,Homo sapiens,GTPase KRas,9606,,,IC50,uM,UO_0000065,,1.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5755,,1295053,29050673,[],CHEMBL5739527,Scintillation Proximity Assay (SPA): Assays we...,B,,,BAO_0000190,...,Homo sapiens,GTPase KRas,9606,,,IC50,nM,UO_0000065,,1400.0
5756,,1295054,29050676,[],CHEMBL5739527,Scintillation Proximity Assay (SPA): Assays we...,B,,,BAO_0000190,...,Homo sapiens,GTPase KRas,9606,,,IC50,nM,UO_0000065,,43000.0
5757,,1295055,29050679,[],CHEMBL5739527,Scintillation Proximity Assay (SPA): Assays we...,B,,,BAO_0000190,...,Homo sapiens,GTPase KRas,9606,,,IC50,nM,UO_0000065,,2100.0
5758,,1295056,29050682,[],CHEMBL5739527,Scintillation Proximity Assay (SPA): Assays we...,B,,,BAO_0000190,...,Homo sapiens,GTPase KRas,9606,,,IC50,nM,UO_0000065,,9500.0


In [87]:
# sanity check fot a specific column
print(df2["standard_type"].isna().sum()) # gives the count for column "standard_type"
print(df2["standard_type"].isna().any()) # Boolean: True if column A has NaN

# sanity check across all columns
#print(df2.isna().any().any()) # Returns single true.isna() → marks NaN as True. First .any() → checks column-wise.Second .any() → checks across all columns.
#print(df2.isna().sum()) # gives count for all columns individually.

#Filter columns (prints columns which have NAN)
# print(df2.loc[:,df2.isna().any()]) # When filtering columns, .loc with : is required because df2[..] selects rows by default.

#Filter Rows (prints rows which have NAN)
#print(df2[df2.isna().any(axis=1)]) # rows that contain at least one NaN.

0
False


## Data pre-processing of the bioactivity data

### Labeling compounds as either being active, inactive or intermediate

### The bioactivity data is in IC50 unit. Compounds having values of less than 1000nM will be considered to be **active** while those greater than 10,000 will be considered to be **inactive**. As for those values in between 1000 and 10,000 nM will be referred to be as **intermediate**. (1 micromolar = 1000 nanomolar (nM) )

In [89]:
bioactivity_class = []
for i in df2.standard_value:
  if float(i) >= 10000:
    bioactivity_class.append("inactive")
  elif float(i) <= 1000:
    bioactivity_class.append("active")
  else:
    bioactivity_class.append("intermediate")

In [91]:
#bioactivity_class

## Iterate the molecule_chembl_id to a list

In [97]:
mol_cid =[]
for i in df2.molecule_chembl_id:
  mol_cid.append(i)



## Iterate canonical_smiles to a list

In [92]:
canonical_smiles = []
for i in df2.canonical_smiles:
  canonical_smiles.append(i)

## Iterate standrad_value to a list

In [93]:
standard_value = []
for i in df.standard_value:
  standard_value.append(i)

In [98]:
data_tuples = list(zip(mol_cid, canonical_smiles, bioactivity_class, standard_value)) # Zip, creates tuples row-wise. list, collects all tuples into a list.
df3 = pd.DataFrame(data_tuples, columns =["molecule_chembl_id", "canonical_smiles", "bioactivity_data", "standard_value"])

In [99]:
df3

Unnamed: 0,molecule_chembl_id,canonical_smiles,bioactivity_data,standard_value
0,CHEMBL2396992,Cc1[nH]c2cc(Cl)cc(Cl)c2c1CCN,inactive,155000.0
1,CHEMBL2396992,Cc1[nH]c2cc(Cl)cc(Cl)c2c1CCN,inactive,342000.0
2,CHEMBL3218635,CC(C)[C@@H]1NC(=O)[C@@H](CC(N)=O)NC(=O)[C@H](C...,intermediate,1200.0
3,CHEMBL3218636,CCCC[C@H]1NC(=O)[C@@H](Cc2ccc3ccccc3c2)NC(=O)[...,inactive,10000.0
4,CHEMBL3218637,CC1(C)COC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H](CCCNC...,intermediate,1800.0
...,...,...,...,...
5733,CHEMBL6000934,C=CC(=O)N1CC2(CC(n3nc(-c4cc(C)nn4CCN(C)C)c(-c4...,intermediate,13500.0
5734,CHEMBL6000934,C=CC(=O)N1CC2(CC(n3nc(-c4cc(C)nn4CCN(C)C)c(-c4...,inactive,2300.0
5735,CHEMBL5826085,,intermediate,25500.0
5736,CHEMBL5826085,,intermediate,3500.0


## Alternative method

In [103]:
selection = ["molecule_chembl_id", "canonical_smiles", "standard_value"]
df3 = df2[selection]
# or
pd.concat([df3, pd.Series(bioactivity_class)], axis=1) # pd.Series: concat expects in series or DataFrame format
df3.columns["0"] =["bioactivity_class"]
df3

TypeError: Index does not support mutable operations

In [95]:
df3

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value
0,CHEMBL2396992,Cc1[nH]c2cc(Cl)cc(Cl)c2c1CCN,155000.0
1,CHEMBL2396992,Cc1[nH]c2cc(Cl)cc(Cl)c2c1CCN,342000.0
2,CHEMBL3218635,CC(C)[C@@H]1NC(=O)[C@@H](CC(N)=O)NC(=O)[C@H](C...,1200.0
3,CHEMBL3218636,CCCC[C@H]1NC(=O)[C@@H](Cc2ccc3ccccc3c2)NC(=O)[...,10000.0
4,CHEMBL3218637,CC1(C)COC(=O)[C@H](Cc2ccccc2)NC(=O)[C@H](CCCNC...,1800.0
...,...,...,...
5755,CHEMBL6000934,C=CC(=O)N1CC2(CC(n3nc(-c4cc(C)nn4CCN(C)C)c(-c4...,1400.0
5756,CHEMBL6000934,C=CC(=O)N1CC2(CC(n3nc(-c4cc(C)nn4CCN(C)C)c(-c4...,43000.0
5757,CHEMBL5826085,,2100.0
5758,CHEMBL5826085,,9500.0


###