<a href="https://colab.research.google.com/github/Promilasharan/A-Deep-Learning-Approach-CNN-for-Predicting-BRD4-Inhibitors/blob/main/Deep_learning_for_predicting_BRD4_inhibitors.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [175]:
import tensorflow as tf

In [176]:
#BRD4 - Bromo-domain containing 4 inhibitors

# steps:
## Fetch Data from CheMBL database
## Exploratory Data Analysis
## Exploratory Molecular Data analysis
## Feature Engineering (Genearting molecular fingerprints)
## Dataset preparation for CNN
## Build model
## Train model
## save model

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [177]:
import pandas as pd
import numpy as np


In [179]:
# Read the dataset
df = pd.read_csv("brd4.csv", sep=";", engine="python",encoding='utf-8', on_bad_lines='skip')

In [180]:
df.head(5)

Unnamed: 0,Molecule ChEMBL ID,Molecule Name,Molecule Max Phase,Molecular Weight,#RO5 Violations,AlogP,Compound Key,Smiles,Standard Type,Standard Relation,...,Target Organism,Target Type,Document ChEMBL ID,Source ID,Source Description,Document Journal,Document Year,Cell ChEMBL ID,Properties,Action Type
0,CHEMBL1232461,MOLIBRESIB,2.0,423.9,0,3.66,"I-BET, GSK525762A",CCNC(=O)C[C@@H]1N=C(c2ccc(Cl)cc2)c2cc(OC)ccc2-...,IC50,'=',...,Homo sapiens,SINGLE PROTEIN,CHEMBL1955852,1,Scientific Literature,Bioorg Med Chem,2012.0,,,
1,CHEMBL1738926,,,423.48,0,4.35,GW-841819,Cc1nnc2n1-c1ccccc1C(c1ccccc1)=N[C@H]2NC(=O)OCc...,Kd,'=',...,Homo sapiens,SINGLE PROTEIN,CHEMBL1955852,1,Scientific Literature,Bioorg Med Chem,2012.0,,,
2,CHEMBL2430877,,,472.94,0,4.55,28c,COc1ccc2c(c1)C(c1ccc(Cl)cc1)=N[C@@H](CC(=O)Nc1...,IC50,'=',...,Homo sapiens,SINGLE PROTEIN,CHEMBL2429783,1,Scientific Literature,J Med Chem,2013.0,,,
3,CHEMBL2430894,,,453.5,0,4.36,18f,COc1cccc(C2=NC(NC(=O)OCc3ccccc3)c3nnc(C)n3-c3c...,IC50,'=',...,Homo sapiens,SINGLE PROTEIN,CHEMBL2429783,1,Scientific Literature,J Med Chem,2013.0,,,
4,CHEMBL3356139,,,380.45,0,3.12,5,COc1ccccc1C(=O)Nc1cc2c(cc1N1CCCCC1)[nH]c(=O)n2C,IC50,'=',...,Homo sapiens,SINGLE PROTEIN,CHEMBL3352488,1,Scientific Literature,ACS Med Chem Lett,2014.0,,,


In [181]:
df.shape

(4907, 46)

In [182]:
df.columns

Index(['Molecule ChEMBL ID', 'Molecule Name', 'Molecule Max Phase',
       'Molecular Weight', '#RO5 Violations', 'AlogP', 'Compound Key',
       'Smiles', 'Standard Type', 'Standard Relation', 'Standard Value',
       'Standard Units', 'pChEMBL Value', 'Data Validity Comment', 'Comment',
       'Uo Units', 'Ligand Efficiency BEI', 'Ligand Efficiency LE',
       'Ligand Efficiency LLE', 'Ligand Efficiency SEI', 'Potential Duplicate',
       'Assay ChEMBL ID', 'Assay Description', 'Assay Type', 'BAO Format ID',
       'BAO Label', 'Assay Organism', 'Assay Tissue ChEMBL ID',
       'Assay Tissue Name', 'Assay Cell Type', 'Assay Subcellular Fraction',
       'Assay Parameters', 'Assay Variant Accession', 'Assay Variant Mutation',
       'Target ChEMBL ID', 'Target Name', 'Target Organism', 'Target Type',
       'Document ChEMBL ID', 'Source ID', 'Source Description',
       'Document Journal', 'Document Year', 'Cell ChEMBL ID', 'Properties',
       'Action Type'],
      dtype='object')

In [183]:
df=df[df["pChEMBL Value"]>0]

In [184]:
df=df[['Molecule ChEMBL ID','Smiles','Standard Type', 'Standard Relation', 'Standard Value','Standard Units']]

In [185]:
df=df[df["Smiles"].notna()]

In [186]:
df=df[df["Standard Value"].notna()]

In [187]:
df.shape

(3270, 6)

In [188]:
df['Molecule ChEMBL ID'].value_counts()

CHEMBL1957266    66
CHEMBL1232461    27
CHEMBL2393130    18
CHEMBL2017291    16
CHEMBL513909     10
                 ..
CHEMBL4103140     1
CHEMBL2179388     1
CHEMBL2179378     1
CHEMBL4577486     1
CHEMBL4083840     1
Name: Molecule ChEMBL ID, Length: 2455, dtype: int64

In [189]:
df["newValue"]=df[['Molecule ChEMBL ID','Smiles','Standard Type', 'Standard Relation', 'Standard Value','Standard Units']].groupby(["Molecule ChEMBL ID"]).transform("mean")

  X["newValue"]=X[['Molecule ChEMBL ID','Smiles','Standard Type', 'Standard Relation', 'Standard Value','Standard Units']].groupby(["Molecule ChEMBL ID"]).transform("mean")


In [190]:
df.head()

Unnamed: 0,Molecule ChEMBL ID,Smiles,Standard Type,Standard Relation,Standard Value,Standard Units,newValue
0,CHEMBL1232461,CCNC(=O)C[C@@H]1N=C(c2ccc(Cl)cc2)c2cc(OC)ccc2-...,IC50,'=',36.1,nM,183.661481
1,CHEMBL1738926,Cc1nnc2n1-c1ccccc1C(c1ccccc1)=N[C@H]2NC(=O)OCc...,Kd,'=',52.5,nM,139.4
2,CHEMBL2430877,COc1ccc2c(c1)C(c1ccc(Cl)cc1)=N[C@@H](CC(=O)Nc1...,IC50,'=',398.11,nM,398.11
3,CHEMBL2430894,COc1cccc(C2=NC(NC(=O)OCc3ccccc3)c3nnc(C)n3-c3c...,IC50,'=',1000.0,nM,1000.0
4,CHEMBL3356139,COc1ccccc1C(=O)Nc1cc2c(cc1N1CCCCC1)[nH]c(=O)n2C,IC50,'=',1584.89,nM,1584.89


In [191]:
df['Standard Units'].value_counts()

nM    3270
Name: Standard Units, dtype: int64

In [192]:
df=df.sort_values("newValue", ascending=True)

In [193]:
df=df.drop_duplicates("Molecule ChEMBL ID")

In [194]:
df.shape

(2455, 7)

In [195]:
actives=df.loc[df["newValue"]<=5000]

In [196]:
actives.shape


(1961, 7)

In [197]:
inactives=df.loc[df["newValue"]>=10000]

In [198]:
inactives.shape


(330, 7)

In [None]:
#we have imbalanced datset corresponding to actives and inactives molecules

In [199]:
actives["label"]=1
inactives["label"]=0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  actives["label"]=1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inactives["label"]=0


In [200]:
combined=pd.concat([actives,inactives],axis=0)

In [153]:
combined

Unnamed: 0,Molecule ChEMBL ID,Smiles,Standard Type,Standard Relation,Standard Value,Standard Units,newValue,label
765,CHEMBL3895096,Cc1sc2c(c1C)C(c1ccc(Cl)cc1)=N[C@@H](CC(=O)N/N=...,IC50,'=',0.24,nM,0.24,1
3177,CHEMBL3898684,CCS(=O)(=O)Nc1ccc(Oc2ccccc2)c(-c2cn(C)c(=O)c3[...,Ki,'=',0.25,nM,0.32,1
2245,CHEMBL4166630,COc1cc2c(cc1-c1c(C)noc1C)[nH]c1nc(C)nc(Nc3cc(C...,Ki,'=',0.50,nM,0.50,1
4047,CHEMBL4435166,CCNC(=O)c1cc2c(-c3cc(NC(C)=O)ccc3Oc3c(C)cccc3C...,Ki,'=',0.50,nM,0.50,1
4234,CHEMBL4555714,Cc1cc(F)cc(C)c1Oc1ccc(C(C)(C)O)cc1-c1cn(C)c(=O...,Ki,'=',0.50,nM,0.50,1
...,...,...,...,...,...,...,...,...
1396,CHEMBL4547160,Cc1cc(OC2CCN(C(=O)C3CCC3)CC2)cc(=O)o1,Kd,'=',960000.00,nM,960000.00,0
3450,CHEMBL4788775,CN1CC(c2ccccc2)CC1=O,IC50,'=',1140000.00,nM,1140000.00,0
4905,CHEMBL4538721,COC(=O)CC1CC(=O)N(C)C1,IC50,'=',2120000.00,nM,2120000.00,0
1529,CHEMBL12543,CN1CCCC1=O,IC50,'=',2660000.00,nM,2660000.00,0


In [201]:
combined[["Smiles", "label"]].to_csv("brd4_binary.smi", sep="\t", header=None,index=None)

In [202]:
!pip install rdkit



In [203]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.Draw import IPythonConsole

In [204]:
t1=Chem.SmilesMolSupplier("brd4_binary.smi", delimiter="\t", titleLine=False)

In [205]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, callbacks
from sklearn.preprocessing import LabelEncoder

In [206]:
# Assuming you already have the FP and IDs lists as you mentioned:
FP, IDs = [], []
for i, mol in enumerate(t1):
    if mol is not None:
        FP.append(AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048))
        IDs.append(mol.GetProp("_Name"))



In [207]:
# Convert fingerprints (FP) list to a 2D array
X = np.array([list(fp.ToBitString()) for fp in FP], dtype=np.float32)
y = np.array(IDs)



In [208]:
# Encode the labels (IDs) to integers using LabelEncoder
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)



In [211]:
# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Normalize the input data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)



In [212]:
# Reshape the input data to match CNN input requirements
X_train = X_train.reshape(X_train.shape[0], 2048, 1)
X_test = X_test.reshape(X_test.shape[0], 2048, 1)


In [219]:
# Build the CNN model
model = keras.Sequential([
    layers.Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=(2048, 1)),
    layers.MaxPooling1D(pool_size=2),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(len(label_encoder.classes_), activation='sigmoid')  # Number of units matches the number of classes
])

In [220]:
# Compile the model with binary_crossentropy loss
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Define early stopping callback
early_stopping = callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)



# New section

In [221]:
model.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_13 (Conv1D)          (None, 2046, 32)          128       
                                                                 
 max_pooling1d_10 (MaxPoolin  (None, 1023, 32)         0         
 g1D)                                                            
                                                                 
 flatten_10 (Flatten)        (None, 32736)             0         
                                                                 
 dense_14 (Dense)            (None, 128)               4190336   
                                                                 
 dropout_6 (Dropout)         (None, 128)               0         
                                                                 
 dense_15 (Dense)            (None, 2)                 258       
                                                      

In [222]:
# Train the model with early stopping
model.fit(X_train, keras.utils.to_categorical(y_train), epochs=100, batch_size=32, validation_data=(X_test, keras.utils.to_categorical(y_test)), callbacks=[early_stopping])



Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100


<keras.callbacks.History at 0x7b276f58c4f0>

In [223]:
# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, keras.utils.to_categorical(y_test))
print("Test Accuracy:", accuracy)

Test Accuracy: 0.9084967374801636


In [225]:
predicted=model.predict(X_test)



In [235]:
#save the model
model.save("brd4")

