In [1]:
import tensorflow as tf
import numpy as np
import csv
import random

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import regularizers
from tensorflow.keras import optimizers

# Load data
data = []  # list of lists of the form [smiles, sequence, pKd]

with open('/content/drive/MyDrive/dta_df.csv') as csvfile:
    reader = csv.reader(csvfile)
    next(reader)  # skip header
    for row in reader:
        triplet = []
        triplet.append(row[0])
        triplet.append(row[1])
        triplet.append(float(row[2]))
        data.append(triplet)

random.shuffle(data)


In [2]:

# Separate data into inputs (SMILES and proteins) and labels
smiles = [triplet[0] for triplet in data]
proteins = [triplet[1] for triplet in data]
labels = [triplet[2] for triplet in data]

# Split data into train and test sets
split = int(0.9 * len(smiles))
train_smiles = smiles[:split]
test_smiles = smiles[split:]
train_proteins = proteins[:split]
test_proteins = proteins[split:]
train_labels = labels[:split]
test_labels = labels[split:]

smiles_tokenizer = Tokenizer(char_level=True)
smiles_tokenizer.fit_on_texts(train_smiles)
train_smiles_seq = smiles_tokenizer.texts_to_sequences(train_smiles)
test_smiles_seq = smiles_tokenizer.texts_to_sequences(test_smiles)

max_smiles_len = max(len(seq) for seq in train_smiles_seq)
train_smiles_seq = pad_sequences(train_smiles_seq, maxlen=max_smiles_len)
test_smiles_seq = pad_sequences(test_smiles_seq, maxlen=max_smiles_len)

# Tokenize and pad protein sequences
protein_tokenizer = Tokenizer(char_level=True)
protein_tokenizer.fit_on_texts(train_proteins)
train_protein_seq = protein_tokenizer.texts_to_sequences(train_proteins)
test_protein_seq = protein_tokenizer.texts_to_sequences(test_proteins)

max_protein_len = max(len(seq) for seq in train_protein_seq)
train_protein_seq = pad_sequences(train_protein_seq, maxlen=max_protein_len)
test_protein_seq = pad_sequences(test_protein_seq, maxlen=max_protein_len)


In [3]:
# Convert the input data to numpy arrays
train_smiles_seq = np.array(train_smiles_seq)
train_protein_seq = np.array(train_protein_seq)
train_labels = np.array(train_labels)

test_smiles_seq = np.array(test_smiles_seq)
test_protein_seq = np.array(test_protein_seq)
test_labels = np.array(test_labels)

In [4]:
embedding_dim = 32
hidden_units = 64

smiles_input = tf.keras.Input(shape=(max_smiles_len,))
protein_input = tf.keras.Input(shape=(max_protein_len,))

smiles_embedding = tf.keras.layers.Embedding(len(smiles_tokenizer.word_index) + 1, embedding_dim)(smiles_input)
smiles_lstm = tf.keras.layers.LSTM(hidden_units)(smiles_embedding)

protein_embedding = tf.keras.layers.Embedding(len(protein_tokenizer.word_index) + 1, embedding_dim)(protein_input)
protein_lstm = tf.keras.layers.LSTM(hidden_units)(protein_embedding)

concatenated = tf.keras.layers.concatenate([smiles_lstm, protein_lstm])
dense1 = tf.keras.layers.Dense(hidden_units, activation='relu')(concatenated)
output = tf.keras.layers.Dense(1, activation='linear')(dense1)

model = tf.keras.Model(inputs=[smiles_input, protein_input], outputs=output)

# Compile and train the model
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit([train_smiles_seq, train_protein_seq], train_labels, epochs=2, batch_size=32)

# Evaluate the model on test data
test_loss = model.evaluate([test_smiles_seq, test_protein_seq], test_labels)
print("Test Loss:", test_loss)


Epoch 1/2
Epoch 2/2
Test Loss: 0.530568540096283
