<a href="https://colab.research.google.com/github/Ratul2200/Drug_repurposing-for-SARS_CoV2/blob/main/Vina_score_predictor_chembl_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Basic Libraries
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
%matplotlib inline

# For ML
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,RandomizedSearchCV,cross_val_score
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR

In [None]:
!pip install rdkit-pypi

Collecting rdkit-pypi
  Downloading rdkit_pypi-2021.9.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (20.6 MB)
[K     |████████████████████████████████| 20.6 MB 1.2 MB/s 
Installing collected packages: rdkit-pypi
Successfully installed rdkit-pypi-2021.9.4


In [None]:
from rdkit import Chem, rdBase

# Report the RDKit build that the kernel actually imported.
print(rdBase.rdkitVersion)

2021.09.4


*Install the ChEMBL web service package so that we can retrieve bioactivity data from the ChEMBL Database.*

In [None]:
! pip install chembl_webresource_client

Collecting chembl_webresource_client
  Downloading chembl_webresource_client-0.10.7-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 3.4 MB/s 
[?25hCollecting requests-cache~=0.7.0
  Downloading requests_cache-0.7.5-py3-none-any.whl (39 kB)
Collecting pyyaml>=5.4
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 36.0 MB/s 
[?25hCollecting itsdangerous>=2.0.1
  Downloading itsdangerous-2.0.1-py3-none-any.whl (18 kB)
Collecting url-normalize<2.0,>=1.4
  Downloading url_normalize-1.4.3-py2.py3-none-any.whl (6.8 kB)
Installing collected packages: url-normalize, pyyaml, itsdangerous, requests-cache, chembl-webresource-client
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninstalled PyYAML-3.13
  Attempting uninstall: itsdangerous
    Found existing installat

In [None]:
from chembl_webresource_client.new_client import new_client

# **Searching for target protein**

### Target search for SARS-CoV-2 main protease

In [None]:
# Search the ChEMBL target endpoint for the keyword 'sars' and tabulate every
# hit, so that the target of interest can be chosen from the table below.
target = new_client.target
target_query = target.search('sars')
targets = pd.DataFrame.from_dict(target_query)
# Display the full hit table (last expression of the cell).
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,[],SARS coronavirus,SARS coronavirus,15.0,False,CHEMBL612575,[],ORGANISM,227859
1,[],Severe acute respiratory syndrome-related coro...,SARS-CoV,15.0,False,CHEMBL4303836,[],ORGANISM,694009
2,[],Homo sapiens,"Serine--tRNA ligase, cytoplasmic",14.0,False,CHEMBL4523232,"[{'accession': 'P49591', 'component_descriptio...",SINGLE PROTEIN,9606
3,[],Severe acute respiratory syndrome coronavirus 2,SARS-CoV-2,13.0,False,CHEMBL4303835,[],ORGANISM,2697049
4,"[{'xref_id': 'P0C6U8', 'xref_name': None, 'xre...",SARS coronavirus,SARS coronavirus 3C-like proteinase,11.0,False,CHEMBL3927,"[{'accession': 'P0C6U8', 'component_descriptio...",SINGLE PROTEIN,227859
5,"[{'xref_id': 'P0C6X7', 'xref_name': None, 'xre...",SARS coronavirus,Replicase polyprotein 1ab,4.0,False,CHEMBL5118,"[{'accession': 'P0C6X7', 'component_descriptio...",SINGLE PROTEIN,227859
6,[],Severe acute respiratory syndrome coronavirus 2,Replicase polyprotein 1ab,4.0,False,CHEMBL4523582,"[{'accession': 'P0DTD1', 'component_descriptio...",SINGLE PROTEIN,2697049


In [None]:
# Select the target whose bioactivity data will be retrieved.
# The row is chosen by its attributes (SARS-CoV-2 organism, single-protein
# target) rather than by a hard-coded row position: the order and number of
# search hits are decided by the remote service, so a fixed index could
# silently pick a different target on a later run.
is_sars_cov2_protein = (
    (targets['organism'] == 'Severe acute respiratory syndrome coronavirus 2')
    & (targets['target_type'] == 'SINGLE PROTEIN')
)
matching_ids = targets.loc[is_sars_cov2_protein, 'target_chembl_id']
if matching_ids.empty:
    # Fail loudly instead of continuing with an unrelated target.
    raise LookupError(
        "No SARS-CoV-2 SINGLE PROTEIN target found in the ChEMBL search results"
    )
# First matching hit; in the recorded search table this is CHEMBL4523582.
selected_target = matching_ids.iloc[0]
selected_target

'CHEMBL4523582'

In [None]:
# Fetch the activity records for the selected target, restricted to entries
# whose standard type is "Inhibition" (reported as % inhibition), and load
# them into a DataFrame.
activity = new_client.activity
res = (
    activity
    .filter(target_chembl_id=selected_target)
    .filter(standard_type="Inhibition")
)
df = pd.DataFrame.from_dict(res)

# Quick size check, then a preview of the first rows.
print('Shape of the data:', df.shape)
df.head()

Shape of the data: (8702, 45)


Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,bao_label,canonical_smiles,data_validity_comment,data_validity_description,document_chembl_id,document_journal,document_year,ligand_efficiency,molecule_chembl_id,molecule_pref_name,parent_molecule_chembl_id,pchembl_value,potential_duplicate,qudt_units,record_id,relation,src_id,standard_flag,standard_relation,standard_text_value,standard_type,standard_units,standard_upper_value,standard_value,target_chembl_id,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,19955497,[],CHEMBL4495582,SARS-CoV-2 3CL-Pro protease inhibition percent...,F,,,BAO_0000201,BAO_0000019,assay format,Cc1cc(C)cc(OCC2CNC(=O)O2)c1,,,CHEMBL4495564,,2020,,CHEMBL1079604,METAXALONE,CHEMBL1079604,,False,http://qudt.org/vocab/unit#Percent,3341915,=,52,True,=,,Inhibition,%,,29.96,CHEMBL4523582,Severe acute respiratory syndrome coronavirus 2,Replicase polyprotein 1ab,2697049,,,Inhibition,%,UO_0000187,,29.96
1,,19955498,[],CHEMBL4495582,SARS-CoV-2 3CL-Pro protease inhibition percent...,F,,,BAO_0000201,BAO_0000019,assay format,CN1[C@H]2C[C@H](OC(=O)[C@H](CO)c3ccccc3)C[C@@H...,,,CHEMBL4495564,,2020,,CHEMBL2165224,ANISODAMINE,CHEMBL2165224,,False,http://qudt.org/vocab/unit#Percent,3341916,=,52,True,=,,Inhibition,%,,21.72,CHEMBL4523582,Severe acute respiratory syndrome coronavirus 2,Replicase polyprotein 1ab,2697049,,,Inhibition,%,UO_0000187,,21.72
2,,19955499,[],CHEMBL4495582,SARS-CoV-2 3CL-Pro protease inhibition percent...,F,,,BAO_0000201,BAO_0000019,assay format,CC1=N[C@H](C(=O)O)[C@@H](O)CN1,,,CHEMBL4495564,,2020,,CHEMBL1230488,,CHEMBL1230488,,False,http://qudt.org/vocab/unit#Percent,3341917,=,52,True,=,,Inhibition,%,,24.9,CHEMBL4523582,Severe acute respiratory syndrome coronavirus 2,Replicase polyprotein 1ab,2697049,,,Inhibition,%,UO_0000187,,24.9
3,,19955500,[],CHEMBL4495582,SARS-CoV-2 3CL-Pro protease inhibition percent...,F,,,BAO_0000201,BAO_0000019,assay format,CCCC(=O)O[C@]1(C(=O)CO)CC[C@H]2[C@@H]3CCC4=CC(...,,,CHEMBL4495564,,2020,,CHEMBL1683,HYDROCORTISONE BUTYRATE,CHEMBL1683,,False,http://qudt.org/vocab/unit#Percent,3341918,=,52,True,=,,Inhibition,%,,38.63,CHEMBL4523582,Severe acute respiratory syndrome coronavirus 2,Replicase polyprotein 1ab,2697049,,,Inhibition,%,UO_0000187,,38.63
4,,19955501,[],CHEMBL4495582,SARS-CoV-2 3CL-Pro protease inhibition percent...,F,,,BAO_0000201,BAO_0000019,assay format,CC(=O)Nc1ccc(CC(=O)O)cc1,,,CHEMBL4495564,,2020,,CHEMBL1885632,ACTARIT,CHEMBL1885632,,False,http://qudt.org/vocab/unit#Percent,3341919,=,52,True,=,,Inhibition,%,,22.96,CHEMBL4523582,Severe acute respiratory syndrome coronavirus 2,Replicase polyprotein 1ab,2697049,,,Inhibition,%,UO_0000187,,22.96


### **Mount Drive**

In [None]:
# Mount Google Drive at /content/gdrive so the notebook can read and write
# files stored there. Only works inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Show the current working directory before switching to the Drive folder.
print(os.getcwd())

/content


In [None]:
% cd /content/gdrive/MyDrive/Datasets/Drug Repurposing

/content/gdrive/MyDrive/Datasets/Drug Repurposing


In [None]:
# List the files currently in the working directory.
! ls

supercompf.csv	vina_scores_2.csv  vina_scores.csv


In [None]:
# Save the full, unfiltered activity table as CSV in the current working
# directory (the Drive project folder); index=False leaves out the row index.
df.to_csv('chembl_dataset.csv', index=False)

In [None]:
# Long listing to confirm that chembl_dataset.csv was written.
! ls -l

total 7126
-rw------- 1 root root 4392320 Feb  4 18:36 chembl_dataset.csv
-rw------- 1 root root 1569503 Jan 24 16:08 supercompf.csv
-rw------- 1 root root  667051 Feb  2 14:39 vina_scores_2.csv
-rw------- 1 root root  667051 Feb  1 09:50 vina_scores.csv


## Preprocessing Data

In [None]:
# Discard every record that has no standard_value; all other columns are
# left untouched and the original row labels are kept.
df2 = df.dropna(subset=['standard_value'])

# Report how many rows remain, then preview the result.
print(df2.shape)
df2.head()

(8702, 45)


Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,bao_label,canonical_smiles,data_validity_comment,data_validity_description,document_chembl_id,document_journal,document_year,ligand_efficiency,molecule_chembl_id,molecule_pref_name,parent_molecule_chembl_id,pchembl_value,potential_duplicate,qudt_units,record_id,relation,src_id,standard_flag,standard_relation,standard_text_value,standard_type,standard_units,standard_upper_value,standard_value,target_chembl_id,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,19955497,[],CHEMBL4495582,SARS-CoV-2 3CL-Pro protease inhibition percent...,F,,,BAO_0000201,BAO_0000019,assay format,Cc1cc(C)cc(OCC2CNC(=O)O2)c1,,,CHEMBL4495564,,2020,,CHEMBL1079604,METAXALONE,CHEMBL1079604,,False,http://qudt.org/vocab/unit#Percent,3341915,=,52,True,=,,Inhibition,%,,29.96,CHEMBL4523582,Severe acute respiratory syndrome coronavirus 2,Replicase polyprotein 1ab,2697049,,,Inhibition,%,UO_0000187,,29.96
1,,19955498,[],CHEMBL4495582,SARS-CoV-2 3CL-Pro protease inhibition percent...,F,,,BAO_0000201,BAO_0000019,assay format,CN1[C@H]2C[C@H](OC(=O)[C@H](CO)c3ccccc3)C[C@@H...,,,CHEMBL4495564,,2020,,CHEMBL2165224,ANISODAMINE,CHEMBL2165224,,False,http://qudt.org/vocab/unit#Percent,3341916,=,52,True,=,,Inhibition,%,,21.72,CHEMBL4523582,Severe acute respiratory syndrome coronavirus 2,Replicase polyprotein 1ab,2697049,,,Inhibition,%,UO_0000187,,21.72
2,,19955499,[],CHEMBL4495582,SARS-CoV-2 3CL-Pro protease inhibition percent...,F,,,BAO_0000201,BAO_0000019,assay format,CC1=N[C@H](C(=O)O)[C@@H](O)CN1,,,CHEMBL4495564,,2020,,CHEMBL1230488,,CHEMBL1230488,,False,http://qudt.org/vocab/unit#Percent,3341917,=,52,True,=,,Inhibition,%,,24.9,CHEMBL4523582,Severe acute respiratory syndrome coronavirus 2,Replicase polyprotein 1ab,2697049,,,Inhibition,%,UO_0000187,,24.9
3,,19955500,[],CHEMBL4495582,SARS-CoV-2 3CL-Pro protease inhibition percent...,F,,,BAO_0000201,BAO_0000019,assay format,CCCC(=O)O[C@]1(C(=O)CO)CC[C@H]2[C@@H]3CCC4=CC(...,,,CHEMBL4495564,,2020,,CHEMBL1683,HYDROCORTISONE BUTYRATE,CHEMBL1683,,False,http://qudt.org/vocab/unit#Percent,3341918,=,52,True,=,,Inhibition,%,,38.63,CHEMBL4523582,Severe acute respiratory syndrome coronavirus 2,Replicase polyprotein 1ab,2697049,,,Inhibition,%,UO_0000187,,38.63
4,,19955501,[],CHEMBL4495582,SARS-CoV-2 3CL-Pro protease inhibition percent...,F,,,BAO_0000201,BAO_0000019,assay format,CC(=O)Nc1ccc(CC(=O)O)cc1,,,CHEMBL4495564,,2020,,CHEMBL1885632,ACTARIT,CHEMBL1885632,,False,http://qudt.org/vocab/unit#Percent,3341919,=,52,True,=,,Inhibition,%,,22.96,CHEMBL4523582,Severe acute respiratory syndrome coronavirus 2,Replicase polyprotein 1ab,2697049,,,Inhibition,%,UO_0000187,,22.96


In [None]:
# Keep only the molecule identifier, its SMILES string and the measured value.
selection = [
    'molecule_chembl_id',
    'canonical_smiles',
    'standard_value',
]
drug_dataset = df2.loc[:, selection]

# Size check followed by a ten-row preview.
print(drug_dataset.shape)
drug_dataset.head(10)

(8702, 3)


Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value
0,CHEMBL1079604,Cc1cc(C)cc(OCC2CNC(=O)O2)c1,29.96
1,CHEMBL2165224,CN1[C@H]2C[C@H](OC(=O)[C@H](CO)c3ccccc3)C[C@@H...,21.72
2,CHEMBL1230488,CC1=N[C@H](C(=O)O)[C@@H](O)CN1,24.9
3,CHEMBL1683,CCCC(=O)O[C@]1(C(=O)CO)CC[C@H]2[C@@H]3CCC4=CC(...,38.63
4,CHEMBL1885632,CC(=O)Nc1ccc(CC(=O)O)cc1,22.96
5,CHEMBL227744,N=C(NCCC[C@H](N)C(=O)O)N[N+](=O)[O-],23.98
6,CHEMBL1021,NC(=O)Cc1cccc(C(=O)c2ccccc2)c1N,8.62
7,CHEMBL243712,CCN1CCCC1CNC(=O)c1cc(S(=O)(=O)CC)c(N)cc1OC,26.31
8,CHEMBL422648,Cc1oc(=O)oc1CN1CCN(c2cc3c(cc2F)c(=O)c(C(=O)O)c...,14.67
9,CHEMBL398435,CCCSc1nc(N[C@@H]2C[C@H]2c2ccc(F)c(F)c2)c2nnn([...,34.26


In [None]:
# Write the three-column table (molecule id, SMILES, standard value) to CSV in
# the current working directory, without the row index.
drug_dataset.to_csv('chembl_dataset_preprocessed.csv', index=False)