In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Install PyTorch Geometric and dependencies for PyTorch 2.0.1 + CUDA 11.8
!pip install torch==2.0.1+cu118 torchvision==0.15.2+cu118 torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cu118
!pip install torch-geometric torch-scatter torch-sparse torch-cluster torch-spline-conv -f https://data.pyg.org/whl/torch-2.0.1+cu118.html


Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting torch==2.0.1+cu118
  Downloading https://download.pytorch.org/whl/cu118/torch-2.0.1%2Bcu118-cp311-cp311-linux_x86_64.whl (2267.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 GB[0m [31m918.1 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchvision==0.15.2+cu118
  Downloading https://download.pytorch.org/whl/cu118/torchvision-0.15.2%2Bcu118-cp311-cp311-linux_x86_64.whl (6.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.1/6.1 MB[0m [31m123.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchaudio==2.0.2
  Downloading https://download.pytorch.org/whl/cu118/torchaudio-2.0.2%2Bcu118-cp311-cp311-linux_x86_64.whl (4.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.4/4.4 MB[0m [31m114.4 MB/s[0m eta [36m0:00:00[0m
Collecting triton==2.0.0 (from torch==2.0.1+cu118)
  Downloading https://download.pytorch.org/whl/triton-2.0.0-1-cp311-cp311-

In [3]:
import torch
import pandas as pd

#Set Paths

project_root = "/content/drive/MyDrive/Rebuilding_and_Modifying_GraphDTA"
data_path = f"{project_root}/data"
drug_graph_path = f"{data_path}/davis_drugs_graph.pt"
protein_graph_path = f"{data_path}/davis_protein_graphs.pt"
affinity_csv_path = f"{data_path}/drug_protein_affinity.csv"

#Load .pt graph dictionaries
drug_graphs = torch.load(drug_graph_path)
protein_graphs = torch.load(protein_graph_path)

#Load affinity table
affinity_df = pd.read_csv(affinity_csv_path)

valid_protein_ids = set(protein_graphs.keys())
print(f"✅ Available protein graphs: {len(valid_protein_ids)}")


# Filter rows where protein graph exists
filtered_affinity_df = affinity_df[affinity_df["Protein_Index"].apply(lambda x: int(x) in valid_protein_ids)].reset_index(drop=True)
print(f"✅ Filtered affinity entries: {len(filtered_affinity_df)} (from {len(affinity_df)})")

# Update affinity DataFrame
affinity_df = filtered_affinity_df

#Display
print(f"Loaded {len(drug_graphs)} drug graphs")
print(f"Loaded {len(protein_graphs)} protein graphs")
print(f"Loaded {len(affinity_df)} affinity entries")
affinity_df.head()


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py", line 37, in <module>
    ColabKernelApp.launch_instance()
  File "/usr/local/lib/python3.11/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelapp.py", line 712, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.11/dist-package

✅ Available protein graphs: 255
✅ Filtered affinity entries: 16728 (from 29444)
Loaded 68 drug graphs
Loaded 255 protein graphs
Loaded 16728 affinity entries


Unnamed: 0,Drug_Index,Protein_Index,Affinity
0,0,0,7.366532
1,0,1,5.0
2,0,3,5.0
3,0,5,5.0
4,0,7,5.0


In [4]:
import random
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence
from torch_geometric.data import Batch

def set_seed(seed=42):
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

In [5]:
#Define dataset class

from torch.utils.data import Dataset

class DTADataset(Dataset):
    def __init__(self, affinity_df, drug_graphs, protein_graphs):
        self.data = []
        for _, row in affinity_df.iterrows():
            d_idx = row["Drug_Index"]
            p_idx = row["Protein_Index"]
            y = row["Affinity"]

            if d_idx in drug_graphs and p_idx in protein_graphs:
                drug_graph = drug_graphs[d_idx]
                protein_graph = protein_graphs[p_idx]
                self.data.append((drug_graph, protein_graph, torch.tensor([y], dtype=torch.float)))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]





In [6]:
#Define the collate function

from torch_geometric.data import Batch

def collate_fn(batch):
  drug_graphs, prot_graphs, affinities = zip(*batch)
  drug_batch = Batch.from_data_list(drug_graphs)
  protein_batch = Batch.from_data_list(prot_graphs)
  labels = torch.stack(affinities).squeeze()
  return drug_batch, protein_batch, labels


In [7]:
split_generator = torch.Generator().manual_seed(42)

In [8]:
#Create dataset splits

from torch.utils.data import DataLoader, random_split

full_dataset = DTADataset(affinity_df, drug_graphs, protein_graphs)

#train/val/test split
n = len(full_dataset)
train_size = int(0.8*n)
val_size = int(0.1*n)
test_size = n - train_size - val_size

train_set, val_set, test_set = random_split(full_dataset, [train_size, val_size, test_size], generator=split_generator)

#Dataloaders

train_loader = DataLoader(train_set, batch_size=512, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_set, batch_size=512, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_set, batch_size=512, shuffle=False, collate_fn=collate_fn)

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_max_pool

class GraphDTA3D_Configurable(nn.Module):
    def __init__(self,
                 atom_dim=83,          # Input dimension for drug atom features
                 residue_dim=20,       # Input dimension for protein residue features (e.g., one-hot AA)
                 # Drug GCN layers configuration
                 drug_gcn_hidden_dims=[64, 128, 128],
                 # Protein GCN layers configuration
                 protein_gcn_hidden_dims=[64, 128, 128],
                 # MLP (fusion) layers configuration
                 mlp_hidden_dims=[1024, 512],
                 dropout_rate=0.3,
                 output_dim_final=1):  # Final output dimension (1 for regression)
        super(GraphDTA3D_Configurable, self).__init__()

        # Drug Encoder (GCN)
        self.drug_gcn_layers = nn.ModuleList()
        current_dim_drug = atom_dim
        for hidden_dim in drug_gcn_hidden_dims:
            self.drug_gcn_layers.append(GCNConv(current_dim_drug, hidden_dim))
            current_dim_drug = hidden_dim
        # The embedding dimension for the drug after pooling will be drug_gcn_hidden_dims[-1]

        # Protein Encoder (GCN)
        self.protein_gcn_layers = nn.ModuleList()
        current_dim_protein = residue_dim
        for hidden_dim in protein_gcn_hidden_dims:
            self.protein_gcn_layers.append(GCNConv(current_dim_protein, hidden_dim))
            current_dim_protein = hidden_dim
        # The embedding dimension for the protein after pooling will be protein_gcn_hidden_dims[-1]

        # Fusion MLP
        # Input to fc1_combined is the sum of the output dimensions from the encoders
        combined_input_dim = drug_gcn_hidden_dims[-1] + protein_gcn_hidden_dims[-1]

        self.fc1_combined = nn.Linear(combined_input_dim, mlp_hidden_dims[0])
        self.dropout1 = nn.Dropout(dropout_rate)
        self.fc2_combined = nn.Linear(mlp_hidden_dims[0], mlp_hidden_dims[1])
        self.dropout2 = nn.Dropout(dropout_rate)
        self.out_layer = nn.Linear(mlp_hidden_dims[1], output_dim_final)

        self.relu = nn.ReLU()

    def forward(self, drug_data, protein_data):
        # Drug encoder
        x_drug, edge_index_drug, batch_drug = drug_data.x, drug_data.edge_index, drug_data.batch
        for layer in self.drug_gcn_layers:
            x_drug = self.relu(layer(x_drug, edge_index_drug))

        drug_emb = global_max_pool(x_drug, batch_drug)

        # Protein encoder
        x_protein, edge_index_protein, batch_protein = protein_data.x, protein_data.edge_index, protein_data.batch
        for layer in self.protein_gcn_layers:
            x_protein = self.relu(layer(x_protein, edge_index_protein))
            # Optionally, add dropout here as well
        protein_emb = global_max_pool(x_protein, batch_protein)

        # Fusion and output
        combined_emb = torch.cat([drug_emb, protein_emb], dim=1)

        x = self.relu(self.fc1_combined(combined_emb))
        x = self.dropout1(x)
        x = self.relu(self.fc2_combined(x))
        x = self.dropout2(x)
        output = self.out_layer(x)

        # Squeeze the last dimension if it's 1 (for regression tasks)
        if output.shape[-1] == 1:
            output = output.squeeze(-1)
        return output

In [10]:
#Define valuation metrics
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import mean_squared_error
import math

#Define RMSE (PyTorch)
def rmse_torch(pred, true):
    return torch.sqrt(torch.mean((pred - true) ** 2)).item()

#Define CI(PyTorch)
def concordance_index_torch(y_true, y_pred):
    """Returns CI (pure PyTorch)"""
    concordant = 0.0
    permissible = 0.0
    n = len(y_true)

    for i in range(n):
        for j in range(i + 1, n):
            if y_true[i] != y_true[j]:
                permissible += 1
                if (y_pred[i] - y_pred[j]) * (y_true[i] - y_true[j]) > 0:
                    concordant += 1
    return concordant / permissible if permissible != 0 else 0.0


In [11]:
#Define training and evaluation functions
def train(model, device, loader, optimizer, criterion):
    model.train()
    total_loss = 0
    for drug_batch, prot_batch, y in loader:
        drug_batch = drug_batch.to(device)
        prot_batch = prot_batch.to(device)
        y = y.to(device)

        optimizer.zero_grad()
        output = model(drug_batch, prot_batch)
        loss = criterion(output.squeeze(), y.squeeze())
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

def evaluate(model, device, loader):
    model.eval()
    y_true, y_pred = [], []
    total_val_loss = 0
    num_samples = 0
    with torch.no_grad():
        for drug_batch, prot_batch, y in loader:
            drug_batch, prot_batch = drug_batch.to(device), prot_batch.to(device)
            y = y.to(device)
            output = model(drug_batch, prot_batch)

            loss = criterion(output, y)

            total_val_loss += loss.item() * y.size(0)
            num_samples += y.size(0)

            y_pred.extend(output.detach().cpu().tolist())
            y_true.extend(y.detach().squeeze().tolist())

    y_pred_tensor = torch.tensor(y_pred)
    y_true_tensor = torch.tensor(y_true)

    avg_val_loss = total_val_loss / num_samples

    metrics =  {
        'val_loss': avg_val_loss,
        'rmse': rmse_torch(y_pred_tensor, y_true_tensor),
        'ci': concordance_index_torch(y_true, y_pred)
    }
    return metrics


In [12]:
# Check for GPU (e.g., Colab with CUDA)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"✅ Using device: {device}")


✅ Using device: cuda


In [13]:
#Training runs
model = GraphDTA3D_Configurable(atom_dim=83,
                 residue_dim=20,
                 drug_gcn_hidden_dims=[64, 128, 128],
                 protein_gcn_hidden_dims=[64, 128, 128],
                 mlp_hidden_dims=[1024, 512],
                 dropout_rate=0.3,
                 output_dim_final=1).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.0005)
criterion = nn.MSELoss()

EPOCHS = 500
best_val_ci = 0.0
best_epoch = 0

for epoch in range(1,EPOCHS + 1):
    train_loss = train(model, device, train_loader, optimizer, criterion)
    val_metrics = evaluate(model, device, val_loader)
    print(f"Epoch {epoch:02d} | Loss: {train_loss:.4f} | Val RMSE: {val_metrics['rmse']:.4f} | CI: {val_metrics['ci']:.4f}")
    if val_metrics['ci'] > best_val_ci:
        best_val_ci = val_metrics['ci']
        best_epoch = epoch

        # Save the model's state dictionary (weights)
        torch.save(model.state_dict(), f"{data_path}/davis_GraphDTA_3D_best_model_weights.pt")

        # Save the entire model
        torch.save(model, f"{data_path}/davis_GraphDTA_3D_best_model.pt")


print("\nGraphDTA-3D Training complete.")
print(f"Best GraphDTA-3D model on Davis saved from Epoch {best_epoch} with Val CI: {best_val_ci:.4f}")


Epoch 01 | Loss: 11.6007 | Val RMSE: 1.5633 | CI: 0.5557
Epoch 02 | Loss: 1.4609 | Val RMSE: 0.9479 | CI: 0.5670
Epoch 03 | Loss: 1.0658 | Val RMSE: 0.9189 | CI: 0.5814
Epoch 04 | Loss: 1.0122 | Val RMSE: 0.9034 | CI: 0.6043
Epoch 05 | Loss: 0.9758 | Val RMSE: 0.8894 | CI: 0.6355
Epoch 06 | Loss: 0.9912 | Val RMSE: 0.8750 | CI: 0.6670
Epoch 07 | Loss: 0.9466 | Val RMSE: 0.8673 | CI: 0.6890
Epoch 08 | Loss: 0.9129 | Val RMSE: 0.8539 | CI: 0.7052
Epoch 09 | Loss: 0.9097 | Val RMSE: 0.8486 | CI: 0.7196
Epoch 10 | Loss: 0.8706 | Val RMSE: 0.8340 | CI: 0.7268
Epoch 11 | Loss: 0.8496 | Val RMSE: 0.8229 | CI: 0.7357
Epoch 12 | Loss: 0.8409 | Val RMSE: 0.8175 | CI: 0.7439
Epoch 13 | Loss: 0.8430 | Val RMSE: 0.8079 | CI: 0.7499
Epoch 14 | Loss: 0.8284 | Val RMSE: 0.8105 | CI: 0.7602
Epoch 15 | Loss: 0.8097 | Val RMSE: 0.8059 | CI: 0.7649
Epoch 16 | Loss: 0.8079 | Val RMSE: 0.7849 | CI: 0.7715
Epoch 17 | Loss: 0.7814 | Val RMSE: 0.7775 | CI: 0.7759
Epoch 18 | Loss: 0.7992 | Val RMSE: 0.7797 | CI

In [14]:
#Testing the performance
best_model = torch.load(f"{data_path}/davis_GraphDTA_3D_best_model.pt", map_location=device)
test_metrics = evaluate(best_model, device, test_loader)

print(f"Davis Test Set Performance of Best GraphDTA-3D Model: \n"
      f"  Val Loss (on test): {test_metrics['val_loss']:.4f}\n" # evaluate_model_3d returns 'val_loss'
      f"  RMSE: {test_metrics['rmse']:.4f}\n"
      f"  CI:   {test_metrics['ci']:.4f}")

Davis Test Set Performance of Best GraphDTA-3D Model: 
  Val Loss (on test): 0.2990
  RMSE: 0.5468
  CI:   0.8810


In [15]:
import json
import os

def save_metrics(metrics: dict, output_path: str = "/content/drive/MyDrive/Rebuilding_and_Modifying_GraphDTA/data/davis_graphdta_3d_test_metrics.json"):
    """
    Saves the evaluation metrics to a JSON file.

    Parameters:
    - metrics (dict): Dictionary of performance metrics (e.g., RMSE, CI, Loss)
    - output_path (str): File path where metrics will be saved
    """
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    with open(output_path, "w") as f:
        json.dump(metrics, f, indent=4)

    print(f"[✓] Metrics saved to {output_path}")

In [16]:
# After computing your test set metrics
test_metrics = {
    "Dataset": "Davis",
    "Epoch": 498,
    "Loss (on val)": 0.1974,
    "Val RMSE": 0.5109,
    "Val CI": 0.8874,
    "Loss (on test)": 0.2990,
    "Test RMSE": 0.5468,
    "Test CI": 0.8810
}

save_metrics(test_metrics, "/content/drive/MyDrive/Rebuilding_and_Modifying_GraphDTA/data/davis_graphdta_3d_test_metrics.json")

[✓] Metrics saved to /content/drive/MyDrive/Rebuilding_and_Modifying_GraphDTA/data/davis_graphdta_3d_test_metrics.json
