In [None]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m61.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m98.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m

In [None]:
import tensorflow as tf
import numpy as np
import csv
import random

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import TFBertModel, BertTokenizer
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

# Load data
# Fixed seed so the shuffle below -- and therefore the train/test split and
# every metric computed from it -- is the same on each Restart & Run All.
SEED = 42

data = []  # list of lists of the form [smiles, sequence, pKd]

# newline='' is how the csv module expects files to be opened, so that line
# breaks embedded in quoted fields are handled by the reader itself.
with open('dta_df.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)
    next(reader)  # skip header
    for row in reader:
        if not row:
            continue  # csv.reader yields [] for blank lines; nothing to parse
        # Column order in the file: SMILES, protein sequence, pKd
        data.append([row[0], row[1], float(row[2])])

# A dedicated generator keeps the shuffle reproducible without touching the
# global `random` state.
random.Random(SEED).shuffle(data)

# Separate data into inputs (SMILES and proteins) and labels
smiles = [triplet[0] for triplet in data]
proteins = [triplet[1] for triplet in data]
labels = [triplet[2] for triplet in data]

# Split data into train and test sets (first 90% train, remaining 10% test)
split = int(0.9 * len(smiles))
train_smiles = smiles[:split]
test_smiles = smiles[split:]
train_proteins = proteins[:split]
test_proteins = proteins[split:]
train_labels = labels[:split]
test_labels = labels[split:]



In [None]:
# Load the pre-trained BERT model and its matching tokenizer from one checkpoint
# NOTE(review): an uncased checkpoint lower-cases text by default, which would
# merge SMILES symbols that differ only by case (e.g. 'C' vs 'c') -- confirm
# this is intended.
BERT_CHECKPOINT = 'bert-base-uncased'
bert_model = TFBertModel.from_pretrained(BERT_CHECKPOINT)
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Downloading (…)solve/main/vocab.txt: 0.00B [00:00, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [None]:
# Tokenize and encode the inputs
# Both splits use identical settings: each (SMILES, protein) pair is encoded
# together and padded or truncated to exactly 128 token ids.
encode_options = {'padding': 'max_length', 'truncation': True, 'max_length': 128}

train_pairs = list(zip(train_smiles, train_proteins))
train_encoded = tokenizer.batch_encode_plus(train_pairs, **encode_options)

test_pairs = list(zip(test_smiles, test_proteins))
test_encoded = tokenizer.batch_encode_plus(test_pairs, **encode_options)



In [None]:
# Pull the two model inputs out of each encoding and turn the label lists
# into NumPy arrays.
train_input_ids, train_attention_masks = (
    train_encoded[field] for field in ('input_ids', 'attention_mask')
)
train_labels = np.array(train_labels)

test_input_ids, test_attention_masks = (
    test_encoded[field] for field in ('input_ids', 'attention_mask')
)
test_labels = np.array(test_labels)




In [None]:
# Define the model architecture: BERT encoder followed by a single linear unit
input_ids = Input(shape=(128,), dtype=tf.int32)
attention_masks = Input(shape=(128,), dtype=tf.int32)

bert_output = bert_model(input_ids, attention_mask=attention_masks)[0]
cls_vector = bert_output[:, 0, :]  # position 0 of every sequence: the [CLS] token
output = Dense(1)(cls_vector)

model = Model(inputs=[input_ids, attention_masks], outputs=output)

# Compile the model (regression: MSE is both the loss and the reported metric)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=['mean_squared_error'],
)


def _batched_dataset(ids, masks, targets, batch_size=32):
    """Wrap ((ids, masks), targets) in a tf.data.Dataset split into batches."""
    return tf.data.Dataset.from_tensor_slices(((ids, masks), targets)).batch(batch_size)


# Convert input arrays to TensorFlow Dataset
train_dataset = _batched_dataset(train_input_ids, train_attention_masks, train_labels)
test_dataset = _batched_dataset(test_input_ids, test_attention_masks, test_labels)

# Train the model; the held-out split is only used to report validation metrics
model.fit(train_dataset, epochs=5, validation_data=test_dataset)

# Evaluate the model
loss, mse = model.evaluate(test_dataset)
for caption, value in (("Test Loss:", loss), ("Test Mean Squared Error:", mse)):
    print(caption, value)

Epoch 1/5




Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Loss: 0.5027590990066528
Test Mean Squared Error: 0.5027590990066528


In [None]:
# Function to predict drug-protein pKd
# NOTE(review): these character-level tokenizers are not used by predict_pKd
# (the model consumes BERT token ids); they stay defined only in case other
# cells reference them -- remove if nothing does.
smiles_tokenizer = Tokenizer(char_level=True)
smiles_tokenizer.fit_on_texts(train_smiles)

protein_tokenizer = Tokenizer(char_level=True)
protein_tokenizer.fit_on_texts(train_proteins)


def predict_pKd(drug, protein):
    """Predict the pKd of one drug/protein pair with the fine-tuned model.

    The pair is encoded with the BERT `tokenizer` using the same settings as
    the training data (pair encoding, padded/truncated to 128 tokens), so the
    model receives real BERT token ids and a real attention mask.

    Args:
        drug: SMILES string of the compound.
        protein: Amino-acid sequence of the target.

    Returns:
        The predicted pKd as a Python float.
    """
    encoded = tokenizer.batch_encode_plus(
        [(drug, protein)],
        padding='max_length',
        truncation=True,
        max_length=128,
    )
    # Shape (1, 128) each, int32 to match the model's Input layers.
    ids = np.array(encoded['input_ids'], dtype=np.int32)
    mask = np.array(encoded['attention_mask'], dtype=np.int32)
    prediction = model.predict([ids, mask])
    return prediction.item()

# Example usage: compare the model's prediction with the true label of one
# held-out sample.
sample_index = 1531
drug, protein, label = (
    test_smiles[sample_index],
    test_proteins[sample_index],
    test_labels[sample_index],
)

print("True label:", label)
prediction = predict_pKd(drug=drug, protein=protein)
print("Predicted pKd:", prediction)

True label: 5.0
Predicted pKd: 5.649565696716309
