## CSC413 - research project
---

In [3]:
# Mount Google Drive inside the Colab VM so the project folder is reachable.
from google.colab import drive
drive.mount('/content/drive')
# Switch to the project folder: later cells open dataset files through relative paths.
%cd /content/drive/MyDrive/DeepPROTACs

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/.shortcut-targets-by-id/1_2em3v0TLxpcrRipRJlWxC6PKaqk5biD/DeepPROTACs


In [4]:
! pip install torch
! pip install torch_geometric
! pip install rdkit
! pip install optuna
! pip install tqdm
! pip install torch-scatter -f https://pytorch-geometric.com/whl/cu111/torch-2.0.0.html

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://pytorch-geometric.com/whl/cu111/torch-2.0.0.html


In [6]:
# Report the installed PyTorch build and the CUDA version it was built against.
# torch is imported here because this cell precedes the main import cell, so a
# fresh "Restart & Run All" would otherwise fail with a NameError.
import torch

print("PyTorch version:", torch.__version__)
print("CUDA version:", torch.version.cuda)

PyTorch version: 2.0.0+cu118
CUDA version: 11.8


In [5]:
import sys
import numpy as np
import torch
import os
import pickle
import logging
import torch_scatter
from pathlib import  Path
from torch import nn
from torch.utils.data import DataLoader, WeightedRandomSampler
from torch.utils.tensorboard import SummaryWriter
from protacloader2 import PROTACSet, collater
from model_Evianne2 import GraphConv, SmilesNet, ProtacModel
from train_and_test2 import train
from prepare_data2 import GraphData

In [7]:
# NOTE(review): Drive is already mounted by the first cell of the notebook, so this
# call only reports "already mounted" — this cell is a candidate for removal.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
# Training configuration. The hyperparameter values are the ones reported as
# "Best trial" by the Optuna search cell of this notebook.
BATCH_SIZE = 149
EPOCH = 897
TRAIN_RATE = 0.8966428653293104
LEARNING_RATE = 0.0005452702173718947
WEIGHT_DECAY = 0.00014473607817389365
TRAIN_NAME = "optuna_large_data_model_Evianne2"
root = "large_dataset/data"

# The log directory must exist before logging is configured: basicConfig opens
# the log file immediately and raises FileNotFoundError if the folder is missing.
LOG_DIR = Path("/content/drive/MyDrive/DeepPROTACs/log")
LOG_DIR.mkdir(exist_ok=True)

# force=True removes (and closes) any handlers already attached to the root
# logger, so re-running this cell does not duplicate log output.
logging.basicConfig(filename=str(LOG_DIR / (TRAIN_NAME + ".log")), filemode="a", level=logging.DEBUG, force=True)
# NOTE(review): this configures a logger *named* 'RootLogger', not the root logger
# itself (the root logger level is already set to DEBUG by basicConfig above).
logging.getLogger('RootLogger').setLevel(logging.DEBUG)

In [9]:
# Ensure the output folders for log files and model checkpoints exist.
for output_dir in (
    '/content/drive/MyDrive/DeepPROTACs/log',
    '/content/drive/MyDrive/DeepPROTACs/model',
):
    Path(output_dir).mkdir(exist_ok=True)

In [None]:
import optuna
from tqdm.notebook import tqdm


def objective(trial):
    """Optuna objective: train a ProtacModel with sampled hyperparameters.

    Samples batch size, epoch count, train/validation split ratio, learning
    rate and weight decay from ``trial``, trains a fresh model on a sequential
    split of the dataset and reports the final validation loss.

    Args:
        trial: the Optuna trial used to sample the hyperparameters.

    Returns:
        The last entry of the validation losses returned by ``train``.
    """
    batch_size = trial.suggest_int("batch_size", 1, 512)
    epoch = trial.suggest_int("epoch", 100, 1000)
    train_rate = trial.suggest_float("train_rate", 0.7, 0.9)
    # suggest_float(..., log=True) is the replacement for the deprecated
    # suggest_loguniform; it samples from the same log-uniform range.
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-2, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-2, log=True)

    ligase_pocket = GraphData("ligase_pocket", root)
    target_pocket = GraphData("target_pocket", root)
    PROTAC = GraphData("PROTAC", root)
    # NOTE(review): the names are read from small_dataset/ while the graphs come
    # from `root` — confirm that both describe the same samples.
    # pickle.load executes arbitrary code; only load files produced by this project.
    with open('small_dataset/name.pkl', 'rb') as f:
        name_list = pickle.load(f)
    label = torch.load(os.path.join(target_pocket.processed_dir, "label.pt"))

    protac_set = PROTACSet(
        name_list,
        ligase_pocket,
        target_pocket,
        PROTAC,
        label,
    )

    # Sequential split: the first `train_size` samples train, the rest validate.
    data_size = len(protac_set)
    train_size = int(data_size * train_rate)
    train_dataset = torch.utils.data.Subset(protac_set, range(train_size))
    test_dataset = torch.utils.data.Subset(protac_set, range(train_size, data_size))
    trainloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collater, drop_last=False)
    testloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, collate_fn=collater, drop_last=False)

    # A fresh model per trial so that trials do not share weights.
    ligase_pocket_model = GraphConv(num_embeddings=5)
    target_pocket_model = GraphConv(num_embeddings=5)
    PROTAC_model = GraphConv(num_embeddings=10)
    model = ProtacModel(
        ligase_pocket_model,
        target_pocket_model,
        PROTAC_model,
    )
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model, train_loss, val_loss = train(
        model,
        train_loader=trainloader,
        valid_loader=testloader,
        device=device,
        batch_size=batch_size,
        epoch=epoch,
        lr=learning_rate,
        weight_decay=weight_decay
    )

    return val_loss[-1]


# Number of Optuna trials. Shared by the progress bar and the optimizer so the
# two values cannot drift apart.
N_TRIALS = 100

study = optuna.create_study(direction="minimize")

with tqdm(total=N_TRIALS, desc="Optimizing", ncols=80) as progress_bar:
    def callback(study, trial):
        """Advance the progress bar by one step; passed to Optuna as a callback."""
        progress_bar.update(1)

    study.optimize(objective, n_trials=N_TRIALS, callbacks=[callback])

# Summarize the best trial found by the search.
print("Best trial:")
trial = study.best_trial
print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[32m[I 2023-04-19 01:11:08,602][0m A new study created in memory with name: no-name-94fe3385-378c-440f-9b24-70b2e83c6d2d[0m


Optimizing:   0%|                                       | 0/100 [00:00<?, ?it/s]

  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1e-2)
  weight_decay = trial.suggest_loguniform("weight_decay", 1e-6, 1e-2)
[32m[I 2023-04-19 01:11:39,009][0m Trial 0 finished with value: 0.6526243388652802 and parameters: {'batch_size': 228, 'epoch': 543, 'train_rate': 0.7549937076773736, 'learning_rate': 1.8590177409263038e-05, 'weight_decay': 4.339180614535149e-06}. Best is trial 0 with value: 0.6526243388652802.[0m
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1e-2)
  weight_decay = trial.suggest_loguniform("weight_decay", 1e-6, 1e-2)
[32m[I 2023-04-19 01:12:06,932][0m Trial 1 finished with value: 0.6092149913311005 and parameters: {'batch_size': 149, 'epoch': 497, 'train_rate': 0.7641621880642507, 'learning_rate': 2.0425762035117108e-05, 'weight_decay': 0.005057733689640255}. Best is trial 1 with value: 0.6092149913311005.[0m
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1e-2)
  weight_decay = trial.suggest_loguniform

Best trial:
  Value: 0.3106837868690491
  Params: 
    batch_size: 149
    epoch: 897
    train_rate: 0.8966428653293104
    learning_rate: 0.0005452702173718947
    weight_decay: 0.00014473607817389365


In [None]:
def main():
    """Train a ProtacModel on a sequential train/validation split.

    Reads the module-level settings ``root``, ``TRAIN_RATE``, ``BATCH_SIZE``,
    ``EPOCH``, ``LEARNING_RATE``, ``WEIGHT_DECAY`` and ``TRAIN_NAME``.

    Side effects:
        Logs the dataset sizes, writes TensorBoard events to
        ``runs/<TRAIN_NAME>`` and, when finished (successfully or not), closes
        the TensorBoard writer and detaches/closes all root-logger handlers.

    Returns:
        tuple: ``(train_loss, val_loss)`` as returned by ``train``.
    """
    ligase_pocket = GraphData("ligase_pocket", root)
    target_pocket = GraphData("target_pocket", root)
    PROTAC = GraphData("PROTAC", root)
    # NOTE(review): the names are read from small_dataset/ while the graphs come
    # from `root` — confirm that both describe the same samples.
    # pickle.load executes arbitrary code; only load files produced by this project.
    with open('small_dataset/name.pkl', 'rb') as f:
        name_list = pickle.load(f)
    label = torch.load(os.path.join(target_pocket.processed_dir, "label.pt"))

    protac_set = PROTACSet(
        name_list,
        ligase_pocket,
        target_pocket,
        PROTAC,
        label,
    )

    # Sequential split: the first `train_size` samples train, the rest validate.
    data_size = len(protac_set)
    train_size = int(data_size * TRAIN_RATE)
    test_size = data_size - train_size
    logging.info(f"all data: {data_size}")
    logging.info(f"train data: {train_size}")
    logging.info(f"test data: {test_size}")
    train_dataset = torch.utils.data.Subset(protac_set, range(train_size))
    test_dataset = torch.utils.data.Subset(protac_set, range(train_size, data_size))
    trainloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collater, drop_last=False)
    testloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collater, drop_last=False)

    ligase_pocket_model = GraphConv(num_embeddings=5)
    target_pocket_model = GraphConv(num_embeddings=5)
    PROTAC_model = GraphConv(num_embeddings=10)
    model = ProtacModel(
        ligase_pocket_model,
        target_pocket_model,
        PROTAC_model,
    )
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    writer = SummaryWriter(f'runs/{TRAIN_NAME}')
    try:
        model, train_loss, val_loss = train(
            model,
            train_loader=trainloader,
            valid_loader=testloader,
            device=device,
            writer=writer,
            LOSS_NAME=TRAIN_NAME,
            batch_size=BATCH_SIZE,
            epoch=EPOCH,
            lr=LEARNING_RATE,
            weight_decay=WEIGHT_DECAY
        )
    finally:
        # Always release resources, even when training fails: flush/close the
        # TensorBoard event file and close the log-file handlers.
        writer.close()
        for handler in logging.root.handlers[:]:
            logging.root.removeHandler(handler)
            handler.close()
    return train_loss, val_loss

In [10]:
def _split_two_blocks(items, train_size, class_boundary):
    """Split ``items`` into train/test parts, taking samples from both blocks.

    The first ``train_size`` items of each block (``[0, class_boundary)`` and
    ``[class_boundary, end)``) go to the train part, the rest to the test part.
    Tensors are joined with ``torch.cat`` because ``+`` on tensors adds
    element-wise instead of concatenating; any other sliceable is joined with ``+``.
    """
    train_parts = (items[:train_size], items[class_boundary:class_boundary + train_size])
    test_parts = (items[train_size:class_boundary], items[class_boundary + train_size:])
    if torch.is_tensor(items):
        return torch.cat(train_parts), torch.cat(test_parts)
    return train_parts[0] + train_parts[1], test_parts[0] + test_parts[1]


def main_large(class_boundary=949):
    """Train a ProtacModel on the large dataset with a per-block split.

    The dataset is treated as two consecutive blocks separated at
    ``class_boundary`` (presumably one block per label class — TODO confirm
    against the data preparation). From each block the first ``train_size``
    samples are used for training and the remainder for validation.

    Reads the module-level settings ``TRAIN_RATE``, ``BATCH_SIZE``, ``EPOCH``,
    ``LEARNING_RATE``, ``WEIGHT_DECAY`` and ``TRAIN_NAME``.

    Args:
        class_boundary: index of the first sample of the second block.
            Defaults to 949, the value previously hard-coded in this function.

    Side effects:
        Logs the dataset sizes, writes TensorBoard events to
        ``runs/<TRAIN_NAME>`` and, when finished (successfully or not), closes
        the TensorBoard writer and detaches/closes all root-logger handlers.

    Returns:
        tuple: ``(train_loss, val_loss)`` as returned by ``train``.

    Raises:
        ValueError: if the requested split does not fit the dataset.
    """
    root = "large_dataset/data"
    # pickle.load executes arbitrary code; only load files produced by this project.
    with open('large_dataset/name.pkl', 'rb') as f:
        name_list = pickle.load(f)

    # Number of training samples taken from *each* block.
    train_size = int(len(name_list) / 2 * TRAIN_RATE)
    if not 0 < train_size <= class_boundary <= len(name_list):
        raise ValueError(
            f"invalid split: train_size={train_size}, class_boundary={class_boundary}, "
            f"samples={len(name_list)}"
        )

    # Build each graph dataset once and slice it, instead of re-creating it per slice.
    train_ligase_pocket, test_ligase_pocket = _split_two_blocks(
        GraphData("ligase_pocket", root), train_size, class_boundary)
    train_target_pocket, test_target_pocket = _split_two_blocks(
        GraphData("target_pocket", root), train_size, class_boundary)
    train_PROTAC, test_PROTAC = _split_two_blocks(
        GraphData("PROTAC", root), train_size, class_boundary)

    train_name, test_name = _split_two_blocks(name_list, train_size, class_boundary)

    label = torch.load(root+"/processed/label.pt")
    train_label, test_label = _split_two_blocks(label, train_size, class_boundary)

    train_set = PROTACSet(
        train_name,
        train_ligase_pocket,
        train_target_pocket,
        train_PROTAC,
        train_label,
    )

    valid_set = PROTACSet(
        test_name,
        test_ligase_pocket,
        test_target_pocket,
        test_PROTAC,
        test_label,
    )

    logging.info(f"all data: {len(train_set) + len(valid_set)}")
    logging.info(f"train data: {len(train_set)}")
    logging.info(f"test data: {len(valid_set)}")
    trainloader = DataLoader(train_set, batch_size=BATCH_SIZE, collate_fn=collater, drop_last=False, shuffle=True)
    testloader = DataLoader(valid_set, batch_size=BATCH_SIZE, collate_fn=collater, drop_last=False, shuffle=True)

    ligase_pocket_model = GraphConv(num_embeddings=5)
    target_pocket_model = GraphConv(num_embeddings=5)
    PROTAC_model = GraphConv(num_embeddings=10)

    model = ProtacModel(
        ligase_pocket_model,
        target_pocket_model,
        PROTAC_model,
    )
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    writer = SummaryWriter(f'runs/{TRAIN_NAME}')
    try:
        model, train_loss, val_loss = train(
            model,
            train_loader=trainloader,
            valid_loader=testloader,
            device=device,
            writer=writer,
            LOSS_NAME=TRAIN_NAME,
            batch_size=BATCH_SIZE,
            epoch=EPOCH,
            lr=LEARNING_RATE,
            weight_decay=WEIGHT_DECAY
        )
    finally:
        # Always release resources, even when training fails: flush/close the
        # TensorBoard event file and close the log-file handlers.
        writer.close()
        for handler in logging.root.handlers[:]:
            logging.root.removeHandler(handler)
            handler.close()
    return train_loss, val_loss

In [11]:
# Run the large-dataset training and keep the train/validation losses it returns.
train_loss, val_loss = main_large()

Processing...


FileNotFoundError: ignored

In [None]:
# Detach every handler that is currently attached to the root logger.
root_logger = logging.root
while root_logger.handlers:
    root_logger.removeHandler(root_logger.handlers[0])



```
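# Unused draft, kept for reference: class-balanced oversampling with WeightedRandomSampler.
# Expects `train_dataset` / `test_dataset` as built inside main(); indexing the class weights
# by label value assumes the labels are the integers 0..K-1.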
  label_train = []
  for data_sample in train_dataset:
        label_train.append(data_sample["label"])
  lu_train, count_train = np.unique(label_train,return_counts=True)
  cw_train = [sum(count_train) / c for c in count_train]
  ew_train = [cw_train[e] for e in label_train]
  sampler_train = WeightedRandomSampler(weights=ew_train, num_samples = int(2*len(label_train)), replacement=True)

  label_test = []
  for data_sample in test_dataset:
        label_test.append(data_sample["label"])
  lu_test, count_test = np.unique(label_test,return_counts=True)
  cw_test = [sum(count_test) / c for c in count_test]
  ew_test = [cw_test[e] for e in label_test]
  sampler_test = WeightedRandomSampler(weights=ew_test, num_samples = int(2*len(label_test)), replacement=True)
```