In [1]:
pip install biopython

Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [2]:
from Bio.PDB import PDBList

# Download entry 1TUP (p53 protein example) in legacy PDB format into the
# current working directory; the cell's value is the path of the saved file.
pdbl = PDBList()
pdbl.retrieve_pdb_file(pdb_code="1TUP", pdir=".", file_format="pdb")


Downloading PDB structure '1tup'...


'./pdb1tup.ent'

In [3]:
from Bio.PDB import PDBParser
import pandas as pd

# Initialize parser (QUIET=True silences parser warnings)
parser = PDBParser(QUIET=True)

# Parse the structure from the downloaded PDB file.
# The download cell saves with pdir="." and reports './pdb1tup.ent', so the
# file is addressed relative to the working directory instead of through the
# Colab-only absolute path "/content/...".
structure = parser.get_structure("p53_protein", "./pdb1tup.ent")

# Collect atom data: one record per atom, walking the
# model -> chain -> residue -> atom hierarchy.
data = []
for model in structure:
    for chain in model:
        for residue in chain:
            for atom in residue:
                data.append({
                    "model_id": model.id,
                    "chain_id": chain.id,
                    "residue_name": residue.resname,
                    # second field of the residue id tuple (the residue number)
                    "residue_id": residue.id[1],
                    "atom_name": atom.name,
                    "element": atom.element,
                    "x": atom.coord[0],
                    "y": atom.coord[1],
                    "z": atom.coord[2],
                    "occupancy": atom.occupancy,
                    "b_factor": atom.bfactor
                })

# Convert to DataFrame (built once from the list of records)
df = pd.DataFrame(data)

# Display first few rows of the DataFrame
print(df.head())


   model_id chain_id residue_name  residue_id atom_name element          x  \
0         0        E           DT        1001       O5'       O  25.930000   
1         0        E           DT        1001       C5'       C  25.549000   
2         0        E           DT        1001       C4'       C  25.636999   
3         0        E           DT        1001       O4'       O  24.711000   
4         0        E           DT        1001       C3'       C  26.966999   

       y          z  occupancy  b_factor  
0  6.224  81.661003        1.0     67.33  
1  7.533  82.107002        1.0     69.16  
2  8.573  80.994003        1.0     65.63  
3  9.640  81.304001        1.0     69.89  
4  9.318  80.880997        1.0     69.57  


25/05/25


In [6]:
! pip install chembl_webresource_client



In [7]:
import pandas as pd
from chembl_webresource_client.new_client import new_client

In [8]:
# Query the ChEMBL target endpoint for "coronavirus" and tabulate the hits.
target = new_client.target
target_query = target.search("coronavirus")
targets = pd.DataFrame.from_dict(data=target_query)
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,[],Coronavirus,Coronavirus,17.0,False,CHEMBL613732,[],ORGANISM,11119
1,[],Feline coronavirus,Feline coronavirus,14.0,False,CHEMBL612744,[],ORGANISM,12663
2,[],Murine coronavirus,Murine coronavirus,14.0,False,CHEMBL5209664,[],ORGANISM,694005
3,[],Canine coronavirus,Canine coronavirus,14.0,False,CHEMBL5291668,[],ORGANISM,11153
4,[],Human coronavirus 229E,Human coronavirus 229E,13.0,False,CHEMBL613837,[],ORGANISM,11137
5,[],Human coronavirus OC43,Human coronavirus OC43,13.0,False,CHEMBL5209665,[],ORGANISM,31631
6,[],Severe acute respiratory syndrome-related coro...,SARS coronavirus 3C-like proteinase,10.0,False,CHEMBL3927,"[{'accession': 'P0C6U8', 'component_descriptio...",SINGLE PROTEIN,694009
7,[],Middle East respiratory syndrome-related coron...,Middle East respiratory syndrome-related coron...,9.0,False,CHEMBL4296578,[],ORGANISM,1335626
8,[],Severe acute respiratory syndrome-related coro...,Replicase polyprotein 1ab,4.0,False,CHEMBL5118,"[{'accession': 'P0C6X7', 'component_descriptio...",SINGLE PROTEIN,694009
9,[],Severe acute respiratory syndrome coronavirus 2,Replicase polyprotein 1ab,4.0,False,CHEMBL4523582,"[{'accession': 'P0DTD1', 'component_descriptio...",SINGLE PROTEIN,2697049


In [11]:
# Pick the target by its preferred name instead of by row position: the order
# of search results is not guaranteed, so a fixed index could silently select
# a different target. Raises IndexError if no row matches.
selected_target = targets.loc[
    targets.pref_name == "Human coronavirus 229E", "target_chembl_id"
].iloc[0]
selected_target

'CHEMBL613837'

In [13]:
# Build the activity query: records for the selected target, restricted to
# IC50 measurements.
activity = new_client.activity
res = (
    activity
    .filter(target_chembl_id=selected_target)
    .filter(standard_type="IC50")
)

In [14]:
# Materialise the query results as a DataFrame and preview the first rows.
df = pd.DataFrame.from_dict(data=res)
df.head(n=5)

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,22804102,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL4712028,Antiviral activity against HCoV-229E infected ...,F,,,BAO_0000190,...,Human coronavirus 229E,Human coronavirus 229E,11137,,,IC50,uM,UO_0000065,,1.7
1,,,24984054,"[{'comments': None, 'relation': '<=', 'result_...",CHEMBL5227023,Antiviral activity against recombinant HCoV-22...,F,,,BAO_0000190,...,Human coronavirus 229E,Human coronavirus 229E,11137,,,IC50,uM,UO_0000065,,0.7
2,,,24992649,[],CHEMBL5230448,Antiviral activity against HCoV-229E,F,,,BAO_0000190,...,Human coronavirus 229E,Human coronavirus 229E,11137,,,IC50,uM,UO_0000065,,3.35
3,,,24992650,[],CHEMBL5230448,Antiviral activity against HCoV-229E,F,,,BAO_0000190,...,Human coronavirus 229E,Human coronavirus 229E,11137,,,IC50,nM,UO_0000065,,1.48
4,,,24992680,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5230463,Antiviral activity against HCoV-229E assessed ...,F,,,BAO_0000190,...,Human coronavirus 229E,Human coronavirus 229E,11137,,,IC50,uM,UO_0000065,,5.1


In [15]:
# Sanity check: list the distinct standard_type values present in the frame.
df["standard_type"].unique()

array(['IC50'], dtype=object)

In [16]:
# Persist the unfiltered bioactivity table to the working directory (no index column).
df.to_csv(path_or_buf='bioactivity_data.csv', index=False)

## Missing data

Drop records that have no `standard_value` or no `canonical_smiles`.

In [33]:
# Keep only rows that have both a standard_value and a canonical_smiles.
# A single dropna replaces the two-step filter, whose second step indexed df2
# with a boolean mask computed from df (a misaligned mask that pandas has to
# reindex, emitting a warning).
df2 = df.dropna(subset=["standard_value", "canonical_smiles"])
df2

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,22804102,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL4712028,Antiviral activity against HCoV-229E infected ...,F,,,BAO_0000190,...,Human coronavirus 229E,Human coronavirus 229E,11137,,,IC50,uM,UO_0000065,,1.7
1,,,24984054,"[{'comments': None, 'relation': '<=', 'result_...",CHEMBL5227023,Antiviral activity against recombinant HCoV-22...,F,,,BAO_0000190,...,Human coronavirus 229E,Human coronavirus 229E,11137,,,IC50,uM,UO_0000065,,0.7
2,,,24992649,[],CHEMBL5230448,Antiviral activity against HCoV-229E,F,,,BAO_0000190,...,Human coronavirus 229E,Human coronavirus 229E,11137,,,IC50,uM,UO_0000065,,3.35
3,,,24992650,[],CHEMBL5230448,Antiviral activity against HCoV-229E,F,,,BAO_0000190,...,Human coronavirus 229E,Human coronavirus 229E,11137,,,IC50,nM,UO_0000065,,1.48
4,,,24992680,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5230463,Antiviral activity against HCoV-229E assessed ...,F,,,BAO_0000190,...,Human coronavirus 229E,Human coronavirus 229E,11137,,,IC50,uM,UO_0000065,,5.1
5,,,25080001,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5255429,Antiviral activity against HCoV-229E harboring...,F,,,BAO_0000190,...,Human coronavirus 229E,Human coronavirus 229E,11137,,,IC50,uM,UO_0000065,,18.1
6,,,25457414,[],CHEMBL5320542,Antiviral activity against HCoV-229E infected ...,F,,,BAO_0000190,...,Human coronavirus 229E,Human coronavirus 229E,11137,,,IC50,uM,UO_0000065,,0.14
7,,,25457415,[],CHEMBL5320542,Antiviral activity against HCoV-229E infected ...,F,,,BAO_0000190,...,Human coronavirus 229E,Human coronavirus 229E,11137,,,IC50,uM,UO_0000065,,0.14
8,,,25516075,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5336144,Antiviral activity against hCoV-229E infected ...,F,,,BAO_0000190,...,Human coronavirus 229E,Human coronavirus 229E,11137,,,IC50,uM,UO_0000065,,1.34
9,,,25516076,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5336144,Antiviral activity against hCoV-229E infected ...,F,,,BAO_0000190,...,Human coronavirus 229E,Human coronavirus 229E,11137,,,IC50,uM,UO_0000065,,0.04


In [17]:
from google.colab import drive

# Mount Google Drive under /content/gdrive/, forcing a remount if it is
# already mounted.
drive.mount("/content/gdrive/", force_remount=True)

Mounted at /content/gdrive/


In [21]:
! ls "/content/gdrive/My Drive/Colab Notebooks/data"

In [20]:
mkdir "/content/gdrive/My Drive/Colab Notebooks/data"

In [23]:
from google.colab import drive

# Second mount call; Drive is already mounted by an earlier cell, so this
# only reports that the mount point is in use.
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [25]:
! cp bioactivity_data.csv "/content/gdrive/My Drive/Colab Notebooks/data"

In [26]:
! ls "/content/gdrive/My Drive/Colab Notebooks/data"

bioactivity_data.csv


In [27]:
! head bioactivity_data.csv

action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,bao_label,canonical_smiles,data_validity_comment,data_validity_description,document_chembl_id,document_journal,document_year,ligand_efficiency,molecule_chembl_id,molecule_pref_name,parent_molecule_chembl_id,pchembl_value,potential_duplicate,qudt_units,record_id,relation,src_id,standard_flag,standard_relation,standard_text_value,standard_type,standard_units,standard_upper_value,standard_value,target_chembl_id,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
,,22804102,"[{'comments': None, 'relation': '=', 'result_flag': 0, 'standard_relation': '=', 'standard_text_value': None, 'standard_type': 'TIME', 'standard_units': 'hr', 'standard_value': '2.0', 'text_value': None, 'type': 'Time', 'units': 'hrs', 'value': '2.0'}]",CHEMBL4712028,Antiviral activity against HCo

# Data preprocessing

In [34]:
# Collect the molecule ChEMBL IDs of the filtered records into a plain list.
mol_cid = [chembl_id for chembl_id in df2.molecule_chembl_id]

In [35]:
# Display the collected molecule ChEMBL IDs.
mol_cid

['CHEMBL4763715',
 'CHEMBL5270175',
 'CHEMBL5283406',
 'CHEMBL91704',
 'CHEMBL217092',
 'CHEMBL5278230',
 'CHEMBL1230135',
 'CHEMBL213054',
 'CHEMBL4590273',
 'CHEMBL2365410',
 'CHEMBL5182995',
 'CHEMBL5185801']

In [37]:
# Collect the canonical SMILES strings of the filtered records into a list.
canonical_smiles = list(df2.canonical_smiles)

# Backward-compatible alias: this cell originally bound the list to the
# misspelt name `canomical_smiles`, so keep that name pointing at the same list.
canomical_smiles = canonical_smiles

canonical_smiles

['C[C@H]1O[C@@](O)(O[C@H]2CC[C@@]3(C)[C@@H](CC[C@]4(C)[C@@H]3C=CC3=C5CC(C)(C)CC[C@]5(CO)[C@H](O)C[C@]34C)[C@]2(C)CO)[C@H](O)[C@@H](O[C@@H]2O[C@H](CO)[C@@H](O)[C@H](O)[C@H]2O)[C@H]1O',
 'O=C(NC(Cc1ccccc1)C(=O)N1CCC(O)(c2ccc(Br)cc2)CC1)c1ccccc1',
 'CC[C@H](C)[C@H](NC(=O)[C@H](C)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCSC)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(=O)O)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@@H](NC(=O)[C@@H](NC(=O)[C@H](CC(N)=O)NC(=O)[C@@H](NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](CC(=O)O)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](N)CO)[C@@H](C)CC)C(C)C)[C@@H](C)O)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CO)C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@H](C(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CC(C)C)C(=O)O)[C@@H](C)CC',
 'CN1CCN(C(=O)N[C

In [40]:

# Collect the standard_value entries of the filtered records into a list,
# unchanged (the recorded output shows them as strings, not numbers).
standard_value = [value for value in df2.standard_value]
standard_value

['1700.0',
 '700.0',
 '3350.0',
 '1.48',
 '5100.0',
 '18100.0',
 '140.0',
 '140.0',
 '1340.0',
 '40.0',
 '13.0',
 '24.0']

Alternative method: select the required columns directly from the DataFrame instead of building separate lists

In [41]:
# Reduce the filtered table to the three columns needed downstream.
selection = ["molecule_chembl_id", "canonical_smiles", "standard_value"]
df3 = df2.loc[:, selection]
df3

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value
0,CHEMBL4763715,C[C@H]1O[C@@](O)(O[C@H]2CC[C@@]3(C)[C@@H](CC[C...,1700.0
1,CHEMBL5270175,O=C(NC(Cc1ccccc1)C(=O)N1CCC(O)(c2ccc(Br)cc2)CC...,700.0
2,CHEMBL5283406,CC[C@H](C)[C@H](NC(=O)[C@H](C)NC(=O)[C@H](CCC(...,3350.0
3,CHEMBL91704,CN1CCN(C(=O)N[C@@H](Cc2ccccc2)C(=O)N[C@H](/C=C...,1.48
4,CHEMBL217092,CN1CCN(CCOc2cc(OC3CCOCC3)c3c(Nc4c(Cl)ccc5c4OCO...,5100.0
5,CHEMBL5278230,C=C(C=O)[C@H]1CC=C2[C@H](OC(=O)/C(C)=C/CCCCCCC...,18100.0
6,CHEMBL1230135,Cc1cc(C(=O)N[C@@H](C)C(=O)N[C@H](C(=O)N[C@@H](...,140.0
7,CHEMBL213054,CC(OC(C)(C)C)[C@H](NC(=O)OCc1ccccc1)C(=O)N[C@@...,140.0
8,CHEMBL4590273,Cc1cccc2nc(CSC(=S)NCc3cccnc3)cn12,1340.0
9,CHEMBL2365410,CC(C)C[C@H](NC(=O)OCc1ccccc1)C(=O)N[C@@H](CC1C...,40.0


In [None]:
# Append `bio` to df3 as an extra column.
# pd.concat expects a list of objects as its first argument (the second
# positional argument is `axis`), and axis=1 is needed to add a column rather
# than rows. The Series is given df3's index so rows line up even though df3
# keeps the original row labels after filtering.
# NOTE(review): `bio` is not defined in any cell visible here. This assumes it
# is a list-like with one entry per row of df3; define it before running.
pd.concat([df3, pd.Series(bio, index=df3.index)], axis=1)