In [2]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so edits to imported source files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Fused Training

In [3]:
import os

import torch
import torch.nn as nn
from torch.nn.utils import clip_grad_norm_
from torch.optim import Adam
from torch.utils.data import DataLoader
from torch_geometric.data import Batch
from tqdm import tqdm

from druxai.models.NN_flexible import GNN_Interaction_Model
from druxai.utils.data_gnn import DrugResponseDataset, DataloaderSampler
from druxai.utils.dataframe_utils import split_data_by_cell_line_ids
from druxai.utils.dataframe_utils import standardize_molecular_data_inplace
# Directory passed to DrugResponseDataset below. Set the DRUXAI_DATA_DIR
# environment variable to point the notebook at a different location; when it
# is unset, the original hard-coded directory is used as the fallback.
file_path = os.environ.get(
    "DRUXAI_DATA_DIR",
    "/Users/niklaskiermeyer/Desktop/Codespace/DruxAI/data/preprocessed",
)

In [4]:
# Build the dataset and derive the train / validation / test ID sets from its targets
data = DrugResponseDataset(file_path)
train_id, val_id, test_id = split_data_by_cell_line_ids(data.targets)

# Standardize the molecular data in place, handing over all three ID sets
standardize_molecular_data_inplace(
    data,
    train_id=train_id,
    val_id=val_id,
    test_id=test_id,
)

[34mINFO    [0m Loaded targets with shape: [1m([0m[1;36m556840[0m, [1;36m9[0m[1m)[0m                                                                    
[34mINFO    [0m Loaded molecular data with shape: [1m([0m[1;36m1479[0m, [1;36m19193[0m[1m)[0m                                                           


In [5]:
# Display the first sample to see the tuple layout the dataset yields.
data[0]

(tensor([ 0.6155, -0.2172, -0.4931,  ...,  0.1874,  0.1539, -0.3254]),
 Data(x=[17, 79], edge_index=[2, 36], edge_attr=[36, 10]),
 tensor([2.3674]),
 0)

In [6]:
def custom_collate(batch):
    """Merge a list of dataset samples into one batch dictionary.

    Each sample is indexed positionally: 0 = gene expression tensor,
    1 = drug graph, 2 = target tensor, 3 = sample index.

    Args:
        batch: Sequence of samples as produced by indexing the dataset.

    Returns:
        dict with the keys
            "gene_expression_values": the position-0 tensors stacked along a new first axis,
            "smile_graph": the position-1 graphs merged via ``Batch.from_data_list``,
            "target": the position-2 tensors stacked along a new first axis,
            "idx": the position-3 indices gathered into a single tensor.
    """

    def field(position):
        # Collect one positional field across every sample of the batch
        return [sample[position] for sample in batch]

    return {
        "gene_expression_values": torch.stack(field(0)),
        "smile_graph": Batch.from_data_list(field(1)),
        "target": torch.stack(field(2)),
        "idx": torch.tensor(field(3)),
    }

In [7]:
# Settings common to both loaders. shuffle stays False because each loader
# is given an explicit sampler.
_loader_options = dict(batch_size=256, shuffle=False, collate_fn=custom_collate)

# Training split
train_sampler = DataloaderSampler(train_id)
train_loader = DataLoader(data, sampler=train_sampler, **_loader_options)

# Validation split
val_sampler = DataloaderSampler(val_id)
val_loader = DataLoader(data, sampler=val_sampler, **_loader_options)

In [8]:
# Instantiate the interaction model, put it in training mode and place it on the CPU
model = GNN_Interaction_Model(
    data,
    nfeatures_product=10,
    hidden_dims_gene_expression_nn=[32],
    dropout_gene_expression_nn=0.2,
)
model.train()
# .to() hands back the module, so leaving it as the last expression shows the architecture
model.to(torch.device("cpu"))

GNN_Interaction_Model(
  (drug_gnn): Drug_GAT(
    (conv1): GATConv(79, 128, heads=8)
    (fc): Linear(in_features=1024, out_features=10, bias=True)
  )
  (gene_expression_nn): Model(
    (model): Sequential(
      (0): Linear(in_features=19193, out_features=32, bias=True)
      (1): ReLU()
      (2): Dropout(p=0.2, inplace=False)
      (3): Linear(in_features=32, out_features=10, bias=True)
    )
  )
)

In [9]:
# One Adam optimizer per sub-network, both configured identically
_adam_options = {"lr": 0.08, "weight_decay": 1e-5}
optimizer1 = Adam(model.drug_gnn.parameters(), **_adam_options)
optimizer2 = Adam(model.gene_expression_nn.parameters(), **_adam_options)

In [10]:
# Training loop: both sub-networks are updated together on the Huber loss.
N_EPOCHS = 15
device = torch.device("cpu")
criterion = nn.HuberLoss()  # built once; the same instance is reused for every step

# Epochs are counted from 1 so the progress-bar label needs no offset and
# `epoch` equals N_EPOCHS once the loop has finished.
for epoch in range(1, N_EPOCHS + 1):
    model.train()

    with tqdm(train_loader, desc=f"Epoch {epoch}") as progress:
        for batch in progress:
            molecular = batch["gene_expression_values"].to(device)
            smile_graphs = batch["smile_graph"].to(device)
            outcome = batch["target"].to(device)

            optimizer1.zero_grad()
            optimizer2.zero_grad()

            # Call the module itself rather than .forward() so that any
            # registered hooks are executed as well.
            prediction = model(smile_graphs, molecular)
            loss = criterion(prediction, outcome)
            loss.backward()

            # Clip the gradient norm over all model parameters to 1.0
            # before either optimizer applies its update.
            clip_grad_norm_(model.parameters(), 1.0)

            optimizer1.step()
            optimizer2.step()

            # Show the current batch loss next to the progress bar
            progress.set_postfix(loss=loss.item())

Epoch 1: 100%|██████████| 1520/1520 [03:47<00:00,  6.69it/s, loss=0.749]
Epoch 2: 100%|██████████| 1520/1520 [04:14<00:00,  5.97it/s, loss=0.72]   
Epoch 3: 100%|██████████| 1520/1520 [04:13<00:00,  5.99it/s, loss=0.447] 
Epoch 4:  25%|██▌       | 382/1520 [01:06<03:25,  5.54it/s, loss=0.507]Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x106dc2160>>
Traceback (most recent call last):
  File "/Users/niklaskiermeyer/anaconda3/envs/DruxAI/lib/python3.9/site-packages/ipykernel/ipkernel.py", line 770, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 
Epoch 4:  31%|███       | 468/1520 [01:19<02:29,  7.03it/s, loss=24.6] 

In [1]:
import numpy as np
import scipy.stats

# Validation pass: record the Huber loss of every batch and gather all
# predictions and targets so the rank correlation covers the whole split.
device = torch.device("cpu")
criterion = nn.HuberLoss()

model.eval()
predictions = []
outcomes = []
losses = []
with torch.no_grad():
    for X in val_loader:
        molecular = X["gene_expression_values"].to(device)
        smile_graphs = X["smile_graph"].to(device)
        outcome = X["target"].to(device)

        # Call the module itself rather than .forward() so registered hooks run
        prediction = model(smile_graphs, molecular)
        loss = criterion(prediction, outcome)

        # Convert tensors to flat NumPy arrays, one pair per batch
        prediction_np = prediction.detach().numpy().flatten()
        outcome_np = outcome.detach().numpy().flatten()

        losses.append(loss.item())
        predictions.append(prediction_np)
        outcomes.append(outcome_np)

# Join the per-batch arrays: correlating only the final batch's arrays would
# ignore every earlier validation batch.
all_predictions = np.concatenate(predictions)
all_outcomes = np.concatenate(outcomes)

# Calculate and print Spearman's rank correlation coefficient
spearman_r = scipy.stats.spearmanr(all_predictions, all_outcomes)
print(f"Spearman's rank correlation coefficient: {spearman_r}")
# Unweighted mean of the per-batch losses
print(f"Mean loss: {sum(losses) / len(losses)}")

NameError: name 'torch' is not defined