In [1]:
pip install kagglehub

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import kagglehub
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Download latest version
path = kagglehub.dataset_download("christang0002/davis-and-kiba")

print("Path to dataset files:", path)

Path to dataset files: C:\Users\NongNam\.cache\kagglehub\datasets\christang0002\davis-and-kiba\versions\3


In [4]:
df = pd.read('davis.txt', sep='\t')
train , test = train_test_split(df, test_size=0.2, random_state=42)

train.to_csv('train.csv', index=False)
test.to_csv('test.csv', index=False)

AttributeError: module 'pandas' has no attribute 'read'

In [None]:
df = pd.read_csv('test.csv')

df.columns  = ['drug_id', 'prot_id', 'drug_smile', 'prot_seq', 'label']
df.head()

Unnamed: 0,drug_id,prot_id,drug_smile,prot_seq,label
0,16722836,ARK5,CC1=CN=C(N=C1NC2=CC(=CC=C2)S(=O)(=O)NC(C)(C)C)...,MEGAAAPVAGDRPDLGLGAPGSPREAVAGATAALEPRKPHGVKRHH...,7.468521
1,4908365,PIK3CA(E545A),CN1CCN(CC1)C(=O)C2=CC3=C(N2)C=CC(=C3)Cl,TMPPRPSSGELWGIHLMPPRILVECLLPNGMIVTLECLREATLITI...,5.0
2,5291,RIPK5,CC1=C(C=C(C=C1)NC(=O)C2=CC=C(C=C2)CN3CCN(CC3)C...,MEGDGVPWGSEPVSGPGPGGGGMIRELCRGFGRYRRYLGRLRQNLR...,5.0
3,5328940,FLT3(D835H),CN1CCN(CC1)CCCOC2=C(C=C3C(=C2)N=CC(=C3NC4=CC(=...,MPALARDGGQLPLLVVFSAMIFGTITNQDLPVIKCVLINHKNNDSS...,6.0
4,24779724,PIP5K2C,CN1C=C(C=N1)C2=NN3C(=NN=C3SC4=CC5=C(C=C4)N=CC=...,MASSSVPPATVSAATAGPGPGFGFASKTKKKHFVQQKVKVFRAADP...,5.481486


In [None]:
df = df.drop(columns=['drug_id', 'prot_id'])
df = df.rename(columns={'label' : 'Kd','drug_smile' : 'SMILES', 'prot_seq' : 'target_sequence'})

In [None]:
df.head()

Unnamed: 0,SMILES,target_sequence,Kd
0,CC1=CN=C(N=C1NC2=CC(=CC=C2)S(=O)(=O)NC(C)(C)C)...,MEGAAAPVAGDRPDLGLGAPGSPREAVAGATAALEPRKPHGVKRHH...,7.468521
1,CN1CCN(CC1)C(=O)C2=CC3=C(N2)C=CC(=C3)Cl,TMPPRPSSGELWGIHLMPPRILVECLLPNGMIVTLECLREATLITI...,5.0
2,CC1=C(C=C(C=C1)NC(=O)C2=CC=C(C=C2)CN3CCN(CC3)C...,MEGDGVPWGSEPVSGPGPGGGGMIRELCRGFGRYRRYLGRLRQNLR...,5.0
3,CN1CCN(CC1)CCCOC2=C(C=C3C(=C2)N=CC(=C3NC4=CC(=...,MPALARDGGQLPLLVVFSAMIFGTITNQDLPVIKCVLINHKNNDSS...,6.0
4,CN1C=C(C=N1)C2=NN3C(=NN=C3SC4=CC5=C(C=C4)N=CC=...,MASSSVPPATVSAATAGPGPGFGFASKTKKKHFVQQKVKVFRAADP...,5.481486


In [None]:
df.isna().sum()

SMILES             0
target_sequence    0
Kd                 0
dtype: int64

In [None]:
# 3. Encode SMILES
smiles_chars = sorted(set(''.join(df['SMILES'])))
char_to_int_smiles = {c: i + 1 for i, c in enumerate(smiles_chars)}

def encode_smiles(smiles):
    return [char_to_int_smiles[c] for c in smiles]

df['smiles_encoded'] = df['SMILES'].apply(encode_smiles)

# 4. Encode protein sequences
aa_list = 'ACDEFGHIKLMNPQRSTVWY'
aa_to_int = {aa: i + 1 for i, aa in enumerate(aa_list)}

def encode_protein(seq):
    return [aa_to_int.get(aa, 0) for aa in seq]

df['protein_encoded'] = df['target_sequence'].apply(encode_protein)

In [None]:
max_len_smiles = 100
max_len_protein = 1000

X_smiles = pad_sequences(df['smiles_encoded'], maxlen=max_len_smiles, padding='post')
X_protein = pad_sequences(df['protein_encoded'], maxlen=max_len_protein, padding='post')
y = df['Kd'].values

In [None]:
# First split into train + test
X_s_train, X_s_val, X_p_train, X_p_val, y_train, y_val = train_test_split(
    X_smiles, X_protein, y, test_size=0.2, random_state=42
)


In [None]:
len(X_s_train), len(X_s_val)

(4809, 1203)

In [None]:
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping

model = load_model("model_epoch_94.h5")

# Recompile before training again
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Fine-tune the model
history = model.fit(
    [X_s_train, X_p_train], y_train,
    validation_data=([X_s_val, X_p_val], y_val),
    epochs=50,
    batch_size=32,
)



Epoch 1/50
[1m151/151[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 334ms/step - loss: 0.7048 - mae: 0.5494 - val_loss: 0.6260 - val_mae: 0.4407
Epoch 2/50
[1m151/151[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 318ms/step - loss: 0.5716 - mae: 0.4926 - val_loss: 0.4814 - val_mae: 0.3951
Epoch 3/50
[1m151/151[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 355ms/step - loss: 0.4884 - mae: 0.4165 - val_loss: 0.5100 - val_mae: 0.3926
Epoch 4/50
[1m151/151[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 342ms/step - loss: 0.4628 - mae: 0.4139 - val_loss: 0.4480 - val_mae: 0.3829
Epoch 5/50
[1m151/151[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 339ms/step - loss: 0.4250 - mae: 0.4035 - val_loss: 0.4249 - val_mae: 0.3912
Epoch 6/50
[1m151/151[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 342ms/step - loss: 0.3929 - mae: 0.3812 - val_loss: 0.4266 - val_mae: 0.3744
Epoch 7/50
[1m151/151[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

In [None]:
model.save("model_finetuned.h5")



In [None]:
test = pd.read_csv('test.csv')
test.columns = ['drug_id', 'prot_id', 'drug_smile', 'prot_seq','label']


test = test.drop(columns=['drug_id', 'prot_id'])
test = test.rename(columns={'label' : 'Kd','drug_smile' : 'SMILES', 'prot_seq' : 'target_sequence'})

In [None]:
# 3. Encode SMILES
smiles_chars = sorted(set(''.join(test['SMILES'])))
char_to_int_smiles = {c: i + 1 for i, c in enumerate(smiles_chars)}

def encode_smiles(smiles):
    return [char_to_int_smiles[c] for c in smiles]

test['smiles_encoded'] = test['SMILES'].apply(encode_smiles)

# 4. Encode protein sequences
aa_list = 'ACDEFGHIKLMNPQRSTVWY'
aa_to_int = {aa: i + 1 for i, aa in enumerate(aa_list)}

def encode_protein(seq):
    return [aa_to_int.get(aa, 0) for aa in seq]

test['protein_encoded'] = test['target_sequence'].apply(encode_protein)

In [None]:
max_len_smiles = 100
max_len_protein = 1000

x_smiles = pad_sequences(test['smiles_encoded'], maxlen=max_len_smiles, padding='post')
x_protein = pad_sequences(test['protein_encoded'], maxlen=max_len_protein, padding='post')
y = test['Kd'].values

In [None]:
model = load_model("model_finetuned.h5", compile=False)
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

In [None]:
loss, mse = model.evaluate([x_smiles, x_protein], y)
print(f"Test MSE: {mse:.4f}")

[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 84ms/step - loss: 0.2639 - mae: 0.3046
Test MSE: 0.3062
