# Affinity & Solubility prediction modeling using the ChEMBL database: A machine-learning based approach (Data Pre-processing)

The data downloaded from the ChEMBL database is not directly amenable to modelling. Hence, a sequence of pre-processing steps is presented below to prepare the dataset for affinity prediction modeling.

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
from rdkit import Chem, DataStructs

1. Import the downloaded data from the ChEMBL database

In [3]:
# Load every ChEMBL table export used by this notebook. All paths are
# relative, so the CSV files must sit in the current working directory.

# Compound-level tables.
Compd_struct_df = pd.read_csv("Compound_Structure.csv")
Compd_prop_df = pd.read_csv("Compound_Properties.csv")

# Activity and assay tables.
Activities_df = pd.read_csv("Activities.csv")
Assay_df = pd.read_csv("Assay.csv")

# Target component tables.
Target_comp_df = pd.read_csv("Target_Component.csv")
Target_comp_seq_df = pd.read_csv("Target_Component_Sequences.csv")

# Biotherapeutic tables.
Bio_comp_seq_df = pd.read_csv("Bio_Component_Sequences.csv")
Bio_comp_df = pd.read_csv("Bio_Component.csv")
Biotherapeutic_df = pd.read_csv("Biotherapeutic.csv")

In [4]:
# Remove the 'Unnamed: 0' column from every table.
# NOTE(review): presumably this is the row index that was written out when the
# CSV files were exported -- confirm against the export script.
Compd_struct_df = Compd_struct_df.drop(columns="Unnamed: 0")
Compd_prop_df = Compd_prop_df.drop(columns="Unnamed: 0")
Target_comp_df = Target_comp_df.drop(columns="Unnamed: 0")
Target_comp_seq_df = Target_comp_seq_df.drop(columns="Unnamed: 0")
Bio_comp_seq_df = Bio_comp_seq_df.drop(columns="Unnamed: 0")
Bio_comp_df = Bio_comp_df.drop(columns="Unnamed: 0")
Biotherapeutic_df = Biotherapeutic_df.drop(columns="Unnamed: 0")

# Activities: keep one row per assay_id (removes duplicate assay IDs).
# groupby(...).first() moves 'assay_id' into the index and keeps, per column,
# the first non-null value found within each group.
Activities_df = (
    Activities_df
    .drop(columns="Unnamed: 0")
    .groupby(["assay_id"])
    .first()
)

# Assays: keep one row per (assay_id, chembl_id) pair; both keys end up as
# index levels of the result.
Assay_df = (
    Assay_df
    .drop(columns="Unnamed: 0")
    .groupby(["assay_id", "chembl_id"])
    .first()
)

2. Converting canonical SMILES strings to molecular fingerprints using the RDKit package

In [None]:
def MolecularFingerPrint(Final_df, fp_len=1024, radius=2):
    """Compute a Morgan fingerprint bit array for every SMILES string.

    Parameters
    ----------
    Final_df : pandas.DataFrame
        Table containing a ``canonical_smiles`` column.
    fp_len : int, optional
        Number of bits in each fingerprint (default 1024).
    radius : int, optional
        Morgan fingerprint radius (default 2).

    Returns
    -------
    list of numpy.ndarray
        One 1-D array of length ``fp_len`` per row of ``Final_df``, in row
        order.  Rows whose SMILES is missing, is not a string, cannot be
        parsed by RDKit, or cannot be fingerprinted get an all-NaN array, so
        every entry has the same shape and the list can be stacked directly
        into a 2-D table.
    """
    np_fps = []
    for smiles in Final_df['canonical_smiles']:
        arr = None
        # Missing values arrive as NaN/None; only hand real strings to RDKit.
        # MolFromSmiles returns None for a string it cannot parse.
        mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
        if mol is not None:
            try:
                fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=fp_len)
                # ConvertToNumpyArray resizes the target array to the
                # fingerprint length.
                arr = np.zeros((0,), dtype=np.int8)
                DataStructs.ConvertToNumpyArray(fp, arr)
            except Exception:
                # Best effort: one problematic molecule must not abort the
                # whole batch; it is reported as an all-NaN row instead.
                arr = None
        if arr is None:
            arr = np.full((fp_len,), np.nan)
        np_fps.append(arr)
    return np_fps

3. Target component sequence data pre-processing

A two-dimensional PCA (2DPCA) approach was used to convert the target protein sequences to numeric data
(Two-Dimensional PCA: A New Approach to Appearance-Based Face Representation and Recognition,
IEEE Transactions on Pattern Analysis and Machine Intelligence 26(1), January 2004, DOI: 10.1109/TPAMI.2004.1261097)

In [None]:
# --- Select the protein target sequences ---------------------------------
# Keep PROTEIN components only and drop every row that has a missing value.
Protein_seq = Target_comp_seq_df[Target_comp_seq_df.component_type == "PROTEIN"]
Protein_seq = Protein_seq.dropna()
if Protein_seq.empty:
    raise ValueError("No complete PROTEIN rows in Target_comp_seq_df; cannot run 2DPCA")

# --- Sequence length statistics ------------------------------------------
seq_size = [len(seq) for seq in Protein_seq['sequence']]
print(np.max(seq_size))
print(np.min(seq_size))
print(np.median(seq_size))
med_seq_len = np.median(seq_size)
img_width = int(med_seq_len)  # every sequence image has this many columns

# --- Encode each sequence as a 21 x img_width binary "image" -------------
# NOTE(review): `seq_imgs` was used below but never defined in this notebook.
# The encoding here is a reconstruction that matches the (21, med_seq_len)
# shape the rest of the cell requires -- confirm it against the original
# intent.  Assumed layout: rows 0-19 are the 20 standard amino acids, row 20
# collects any other symbol; sequences longer than the median are truncated
# and shorter ones are left zero-padded on the right.
N_SEQ_ROWS = 21
AMINO_ACIDS = "ACDEFGHIKLMNPQRSTVWY"
_AA_ROW = {aa: row for row, aa in enumerate(AMINO_ACIDS)}


def _sequence_to_image(seq, width):
    """Return the (N_SEQ_ROWS, width) one-hot image of an amino-acid string."""
    img = np.zeros((N_SEQ_ROWS, width), dtype=np.int8)
    for col, residue in enumerate(seq[:width]):
        img[_AA_ROW.get(residue, N_SEQ_ROWS - 1), col] = 1
    return img


seq_imgs = [_sequence_to_image(seq, img_width) for seq in Protein_seq['sequence']]

# --- 2DPCA: mean image and image covariance matrix -----------------------
seq_img_avg = np.zeros((N_SEQ_ROWS, img_width))
for seq_img in seq_imgs:
    seq_img_avg += seq_img
seq_img_avg /= len(seq_imgs)

# G = (1/M) * sum_j (A_j - mean)^T (A_j - mean), accumulated image by image
# so that only one centred image is held in memory at a time.
G = np.zeros((img_width, img_width))
for seq_img in seq_imgs:
    centered = seq_img - seq_img_avg
    G += np.dot(centered.T, centered)
G /= len(seq_imgs)

# G is symmetric by construction, so use eigh: it guarantees real eigenvalues
# and eigenvectors (np.linalg.eig may return complex output from round-off).
eigval, eigvec = np.linalg.eigh(G)
idx = np.argsort(eigval)[::-1]  # largest eigenvalue first
eigval = eigval[idx]
eigvec = eigvec[:, idx]
print(eigval[:10])

# --- Project every image onto the leading eigenvectors -------------------
nr = 2  # reduced dimensions
projection = eigvec[:, :nr]
# Each image yields a (21, nr) matrix that is flattened row-major into one
# feature row of length 21 * nr.
red_feature_vec_list = np.array(
    [np.dot(seq_img, projection).reshape(N_SEQ_ROWS * nr) for seq_img in seq_imgs]
)
print(red_feature_vec_list.shape)
print(red_feature_vec_list[0, :])

# --- Attach component ids and save ---------------------------------------
df = pd.DataFrame(red_feature_vec_list)
# Reset the index so the feature rows line up with the component ids.
Protein_seq = Protein_seq.reset_index(drop=True)
Reduced_Seq = pd.concat([df, Protein_seq["component_id"]], axis=1, ignore_index=False)
print(Reduced_Seq.loc[0:10, "component_id"])
Reduced_Seq.to_csv('Reduced_Target_Sequence.csv')

ChEMBL database schema (chembl_tables_schema.docx) was used to merge different data tables.

4. Preparing dataset for Solubility Prediction Modeling

In [12]:
# --- Join activities with compound structure, properties, assay and target data
Temp_df1 = pd.merge(Activities_df, Compd_struct_df, on='molregno', how='inner')
Temp_df2 = pd.merge(Temp_df1, Compd_prop_df, on='molregno')
Temp_df3 = pd.merge(Activities_df, Assay_df, on='assay_id', how='inner')
activity_keys_1 = ['molregno', 'record_id', 'standard_value',
                   'standard_units', 'standard_type', 'pchembl_value']
# Keep one row per (molregno, standard_type) pair.
# NOTE(review): the grouping keys become index levels here and the following
# column-based merges build a fresh index, so they may not survive into
# Final_df_1 -- confirm whether they are needed downstream.
Temp_df4 = (
    pd.merge(Temp_df2, Temp_df3, on=activity_keys_1, how='inner')
    .groupby(['molregno', 'standard_type'])
    .first()
)
Temp_df5 = pd.merge(Temp_df4, Target_comp_df, on='tid')
Final_df_1 = pd.merge(Temp_df5, Target_comp_seq_df, on='component_id')

fp_col_names_1 = ["fp_" + str(i) for i in range(1024)]

# Drop rows without a structure or a target value BEFORE fingerprinting, so
# that missing SMILES values are never handed to RDKit.
model_rows_1 = Final_df_1.dropna(subset=['canonical_smiles', 'pchembl_value'])

np_fps_1 = MolecularFingerPrint(model_rows_1)
# Flatten every fingerprint to 1-D so that a placeholder of a different shape
# cannot break the stacking into a 2-D table.
if np_fps_1:
    fp_matrix_1 = np.vstack([np.ravel(fp) for fp in np_fps_1])
else:
    fp_matrix_1 = np.empty((0, len(fp_col_names_1)))
# Reuse the index of the source rows so that the column-wise concat pairs each
# fingerprint with the molecule it was computed from.
fps_df_1 = pd.DataFrame(fp_matrix_1, columns=fp_col_names_1, index=model_rows_1.index)
Combined_df_1 = pd.concat([model_rows_1, fps_df_1], axis=1)
Combined_df_1.to_csv('Combined_data_with_FP_solubility_modeling.csv')

5. Preparing dataset for Affinity Prediction Modeling

In [5]:
# --- Join biotherapeutic compounds with activities, properties, assay and target data
Temp_df6 = pd.merge(Compd_struct_df, Biotherapeutic_df, on='molregno', how='inner')
Temp_df7 = pd.merge(Activities_df, Temp_df6, on='molregno', how='inner')
Temp_df8 = pd.merge(Temp_df7, Compd_prop_df, on='molregno')
Temp_df9 = pd.merge(Activities_df, Assay_df, on='assay_id', how='inner')
activity_keys_2 = ['molregno', 'record_id', 'standard_value',
                   'standard_units', 'standard_type', 'pchembl_value']
# Keep one row per (molregno, standard_type) pair.
Temp_df10 = (
    pd.merge(Temp_df8, Temp_df9, on=activity_keys_2, how='inner')
    .groupby(['molregno', 'standard_type'])
    .first()
)
Temp_df11 = pd.merge(Temp_df10, Target_comp_df, on='tid')
# Attach the 2DPCA-reduced target sequence features.
Final_df_2 = pd.merge(Temp_df11, Reduced_Seq, on='component_id')

# Drop incomplete rows before fingerprinting.  Reduced_Seq is built from the
# feature columns plus 'component_id' only, so a 'sequence' column may be
# absent here; restrict the check to columns that exist to avoid a KeyError.
required_cols_2 = ['pchembl_value', 'canonical_smiles', 'sequence']
Final_df_2 = Final_df_2.dropna(
    subset=[col for col in required_cols_2 if col in Final_df_2.columns]
)

fp_col_names_2 = ["fp_" + str(i) for i in range(1024)]

np_fps_2 = MolecularFingerPrint(Final_df_2)
# Flatten every fingerprint to 1-D so that a placeholder of a different shape
# cannot break the stacking into a 2-D table.
if np_fps_2:
    fp_matrix_2 = np.vstack([np.ravel(fp) for fp in np_fps_2])
else:
    fp_matrix_2 = np.empty((0, len(fp_col_names_2)))
# dropna leaves gaps in Final_df_2's index; reuse that index so the
# column-wise concat pairs each fingerprint with its own molecule instead of
# aligning against a fresh 0..n-1 index.
fps_df_2 = pd.DataFrame(fp_matrix_2, columns=fp_col_names_2, index=Final_df_2.index)
Combined_df_2 = pd.concat([Final_df_2, fps_df_2], axis=1)
Combined_df_2 = Combined_df_2.dropna(subset=['canonical_smiles', 'pchembl_value'])
# The frame built in this cell is Combined_df_2 (the name `Combined_df` is
# never defined).
Combined_df_2.to_csv('Combined_data_with_FP_2PCA_affinity_modeling.csv')