In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem so files stored there can be read.
mount_point = '/content/drive'
drive.mount(mount_point)

Mounted at /content/drive


In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Concatenate, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Bidirectional, LSTM, Flatten, GlobalAveragePooling1D, Multiply
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import csv
import random


# Load data
# Absolute path: drive.mount() above mounts at /content/drive, so this no longer
# depends on the notebook's working directory happening to be /content.
DATA_PATH = '/content/drive/MyDrive/nlp (1)/dta_df.csv'
SEED = 42  # fixed so the shuffle (and hence the train/test split) is reproducible

data = []  # list of lists of the form [smiles, sequence, pKd]

# newline='' lets the csv module handle line endings / embedded newlines itself.
with open(DATA_PATH, newline='') as csvfile:
    reader = csv.reader(csvfile)
    next(reader, None)  # skip header (tolerates an empty file)
    for row in reader:
        if not row:
            continue  # csv.reader yields [] for blank lines
        # Columns 0 and 1 are kept as text; column 2 is parsed as a float.
        data.append([row[0], row[1], float(row[2])])

# Seed immediately before shuffling so re-running this cell gives the same order.
random.seed(SEED)
random.shuffle(data)



In [None]:
import numpy as np
# Unzip the [smiles, sequence, pKd] rows into three parallel lists
smiles, proteins, labels = [], [], []
for entry in data:
    smiles.append(entry[0])
    proteins.append(entry[1])
    labels.append(entry[2])

# First 90% of the rows go to training, the remainder to testing
split = int(0.9 * len(smiles))
train_smiles, test_smiles = smiles[:split], smiles[split:]
train_proteins, test_proteins = proteins[:split], proteins[split:]
train_labels, test_labels = labels[:split], labels[split:]



In [None]:
# Tokenize smiles
# lower=False: Tokenizer lowercases text by default, which would merge
# case-sensitive SMILES characters (e.g. aliphatic 'C' and aromatic 'c')
# into a single token.
tokenizer_smiles = Tokenizer(char_level=True, lower=False)
tokenizer_smiles.fit_on_texts(train_smiles)  # vocabulary is built from training data only
# +1 because token indices start at 1; index 0 is what pad_sequences pads with.
vocab_size_smiles = len(tokenizer_smiles.word_index) + 1

# Every SMILES string is cut / zero-padded at the end to exactly 85 tokens.
train_sequences_smiles = tokenizer_smiles.texts_to_sequences(train_smiles)
train_padded_smiles = pad_sequences(train_sequences_smiles, truncating="post", padding="post", maxlen=85)

test_sequences_smiles = tokenizer_smiles.texts_to_sequences(test_smiles)
test_padded_smiles = pad_sequences(test_sequences_smiles, truncating="post", padding="post", maxlen=85)

# Tokenize proteins
tokenizer_proteins = Tokenizer(char_level=True)
tokenizer_proteins.fit_on_texts(train_proteins)  # vocabulary is built from training data only
vocab_size_proteins = len(tokenizer_proteins.word_index) + 1  # +1 for the padding index 0

# Every protein sequence is cut / zero-padded at the end to exactly 1200 tokens.
train_sequences_proteins = tokenizer_proteins.texts_to_sequences(train_proteins)
train_padded_proteins = pad_sequences(train_sequences_proteins, truncating="post", padding="post", maxlen=1200)

test_sequences_proteins = tokenizer_proteins.texts_to_sequences(test_proteins)
test_padded_proteins = pad_sequences(test_sequences_proteins, truncating="post", padding="post", maxlen=1200)

# Final numpy arrays handed to the model; labels are stored as float32.
train_smiles_array = np.array(train_padded_smiles)
test_smiles_array = np.array(test_padded_smiles)
train_proteins_array = np.array(train_padded_proteins)
test_proteins_array = np.array(test_padded_proteins)
train_labels_array = np.array(train_labels, dtype = "float32")
test_labels_array = np.array(test_labels, dtype = "float32")

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Concatenate, Dot, Activation, Flatten, Dense
from tensorflow.keras.models import Model

# Define input layers: padded token ids for the drug (85) and the protein (1200)
input_smiles = Input(shape=(85,))
input_proteins = Input(shape=(1200,))

# Embedding layers
embedding_dim = 128
# +1 because token indices start at 1 and index 0 is used for padding
vocab_size_smiles = len(tokenizer_smiles.word_index) + 1
vocab_size_proteins = len(tokenizer_proteins.word_index) + 1


embedding_smiles = Embedding(input_dim=vocab_size_smiles, output_dim=embedding_dim, input_length=85)(input_smiles)
embedding_proteins = Embedding(input_dim=vocab_size_proteins, output_dim=embedding_dim, input_length=1200)(input_proteins)

# Bi-LSTM layers (each returns the full sequence, 2 * lstm_units features per step)
lstm_units = 128

lstm_smiles = Bidirectional(LSTM(units=lstm_units, return_sequences=True))(embedding_smiles)      # (batch, 85, 256)
lstm_proteins = Bidirectional(LSTM(units=lstm_units, return_sequences=True))(embedding_proteins)  # (batch, 1200, 256)

# Attention mechanism: score every drug position against every protein position,
# then normalise the scores over the protein positions (last axis).
attention = Dot(axes=[2, 2])([lstm_smiles, lstm_proteins])  # (batch, 85, 1200)
attention = Activation('softmax')(attention)

# Weighted sum of protein states for each drug position
context = Dot(axes=[2, 1])([attention, lstm_proteins])            # (batch, 85, 256)
weighted_protein = Concatenate(axis=1)([context, lstm_smiles])    # (batch, 170, 256)

# Collapse the 170 sequence positions into one vector before regressing.
# Without this step Dense(1) is applied per position and the model emits 170
# separate predictions for every drug/protein pair instead of a single pKd.
# keepdims=True keeps the output rank-3, (batch, 1, 1), so code that indexes
# a prediction as prediction[0][0][0] continues to work.
pooled = GlobalAveragePooling1D(keepdims=True)(weighted_protein)  # (batch, 1, 256)

# Output layer: one linear unit -> one pKd value per pair
output = Dense(units=1, activation='linear')(pooled)              # (batch, 1, 1)

# Create model
model = Model(inputs=[input_smiles, input_proteins], outputs=output)

# Compile model
model.compile(optimizer='adam', loss='mean_squared_error')


In [None]:
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(
    min_delta=0.001,  # minimum amount of change to count as an improvement
    patience=10,  # how many epochs to wait before stopping
    restore_best_weights=True,
)

# Early stopping (and the restored "best" weights) must not be chosen on the
# test set, otherwise the test loss reported afterwards is optimistically
# biased. validation_split holds out the last 10% of the (already shuffled)
# training arrays for validation instead, leaving the test set untouched.
history = model.fit(
    [train_smiles_array, train_proteins_array],
    train_labels_array,
    epochs=100,
    batch_size=256,
    validation_split=0.1,
    callbacks=[early_stopping],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100


In [None]:
# Score the trained model on the held-out test arrays (loss configured in compile()).
test_inputs = [test_smiles_array, test_proteins_array]
loss = model.evaluate(test_inputs, test_labels_array)
print("Overall Loss:", loss)

Overall Loss: 0.7223649621009827


In [None]:
def predict_pKd(drug, protein):
  """Predict the pKd of a single drug/protein pair with the trained model.

  Args:
    drug: SMILES string of the drug.
    protein: protein sequence string.

  Returns:
    The first scalar of the model's prediction for the pair.
  """
  # Encode exactly as during training: character ids, cut / zero-padded at the end.
  drug_sequence = tokenizer_smiles.texts_to_sequences([drug])
  drug_padded = pad_sequences(drug_sequence, truncating="post", padding="post", maxlen=85)
  protein_sequence = tokenizer_proteins.texts_to_sequences([protein])
  protein_padded = pad_sequences(protein_sequence, truncating="post", padding="post", maxlen=1200)

  # The padded arrays are already (1, 85) and (1, 1200), which is what the
  # model's Input layers declare; adding a trailing axis would feed rank-3
  # tensors to rank-2 inputs.
  prediction = model.predict([drug_padded, protein_padded])

  # Flatten before indexing so the single value is extracted regardless of
  # how many singleton axes the model output carries.
  return np.asarray(prediction).ravel()[0]


In [None]:
# Compare the true label with the model's prediction for one held-out pair.
sample_index = 1520
drug, protein, label = (
    test_smiles[sample_index],
    test_proteins[sample_index],
    test_labels[sample_index],
)

print(label)
prediction = predict_pKd(drug=drug, protein=protein)
print(prediction)

5.721246399047171
5.438856
