## CSC413 - research project
---

In [1]:
# Mount Google Drive and make the shared DeepPROTACs folder the working
# directory. Later cells open data and checkpoints through relative paths
# (e.g. "small_dataset/data", "model/...").
# NOTE(review): the local modules imported below (protacloader2, model_Evianne2,
# ...) presumably live in this folder as well - confirm.
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/DeepPROTACs

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/.shortcut-targets-by-id/1_2em3v0TLxpcrRipRJlWxC6PKaqk5biD/DeepPROTACs


In [2]:
! pip install torch
! pip install torch_geometric
! pip install rdkit
! pip install torch-scatter -f https://pytorch-geometric.com/whl/cu111/torch-2.0.0.html

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://pytorch-geometric.com/whl/cu111/torch-2.0.0.html


In [3]:
import sys
import numpy as np
import torch
import os
import pickle
import logging
from pathlib import  Path
from torch import nn
from torch.utils.data import DataLoader, WeightedRandomSampler
from torch.utils.tensorboard import SummaryWriter
from protacloader2 import PROTACSet, collater
from model_Evianne2 import GraphConv, SmilesNet, ProtacModel
from train_and_test2 import train
from prepare_data2 import GraphData
import torch_scatter

In [4]:
# Repeats the Drive mount from the first cell. The saved output below shows the
# drive was already mounted, so in that run this cell changed nothing.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Training hyperparameters.
BATCH_SIZE = 149
EPOCH = 897
TRAIN_RATE = 0.8  # fraction of the samples that goes into the training split
LEARNING_RATE = 0.0005452702173718947
WEIGHT_DECAY = 0.00014473607817389365
# Run identifier used for the log file and the TensorBoard run directory.
# NOTE(review): the "l0.0054" / "w0.0014" tags are ten times larger than
# LEARNING_RATE / WEIGHT_DECAY above. The string is left untouched because the
# checkpoint loaded in the evaluation cell uses exactly this name.
TRAIN_NAME = "final_model_b149_e897_t0.8_l0.0054_w0.0014_weighted_sample"

# basicConfig opens the log file immediately, so the directory has to exist
# before logging is configured.
LOG_DIR = Path("/content/drive/MyDrive/DeepPROTACs/log")
LOG_DIR.mkdir(exist_ok=True)

# force=True removes *and closes* any handlers already attached to the root
# logger, so re-running this cell does not duplicate log lines.
logging.basicConfig(filename=str(LOG_DIR / (TRAIN_NAME + ".log")), filemode="a", level=logging.DEBUG, force=True)
# NOTE(review): this configures a logger literally named 'RootLogger', not the
# root logger itself - confirm whether another module logs through that name.
logging.getLogger('RootLogger').setLevel(logging.DEBUG)

In [6]:
# Make sure the output folders for logs and model checkpoints exist
# (no error if they are already there).
for required_dir in (
    '/content/drive/MyDrive/DeepPROTACs/log',
    '/content/drive/MyDrive/DeepPROTACs/model',
):
    Path(required_dir).mkdir(exist_ok=True)

In [7]:
def _balanced_sampler(dataset, num_samples):
  """Build a WeightedRandomSampler that draws each class equally often.

  Every sample is weighted by ``total / count(its class)``. Weights are looked
  up by label *value*, so the sampler still works when a class is missing from
  ``dataset`` or labels are not the contiguous range 0..K-1.

  Args:
    dataset: iterable of samples exposing ``sample["label"]``; labels are
      assumed to be integer-like (int or single-element tensor).
    num_samples: number of indices drawn per epoch (with replacement).
  """
  labels = [int(sample["label"]) for sample in dataset]
  classes, counts = np.unique(labels, return_counts=True)
  total = counts.sum()
  weight_of = {int(cls): total / cnt for cls, cnt in zip(classes, counts)}
  sample_weights = [weight_of[lbl] for lbl in labels]
  return WeightedRandomSampler(weights=sample_weights, num_samples=num_samples, replacement=True)


def main_small(train_samples=1800, test_samples=500):
  """Train the PROTAC model on the small dataset with class-balanced sampling.

  The dataset is split sequentially (no shuffling): the first TRAIN_RATE
  fraction is used for training, the remainder for validation.

  Args:
    train_samples: indices drawn per epoch by the training sampler.
    test_samples: indices drawn per epoch by the validation sampler.

  Returns:
    (train_loss, val_loss) exactly as returned by ``train``.
  """
  root = "small_dataset/data"
  ligase_pocket = GraphData("ligase_pocket", root)
  target_pocket = GraphData("target_pocket", root)
  PROTAC = GraphData("PROTAC", root)
  with open('small_dataset/name.pkl','rb') as f:
    name_list = pickle.load(f)
  label = torch.load(os.path.join(target_pocket.processed_dir, "label.pt"))

  protac_set = PROTACSet(
      name_list,
      ligase_pocket,
      target_pocket,
      PROTAC,
      label,
  )
  data_size = len(protac_set)
  train_size = int(data_size * TRAIN_RATE)

  train_dataset = torch.utils.data.Subset(protac_set, range(train_size))
  test_dataset = torch.utils.data.Subset(protac_set, range(train_size, data_size))

  sampler_train = _balanced_sampler(train_dataset, train_samples)
  # NOTE(review): the validation split is resampled with replacement as well,
  # so validation metrics are computed on a class-balanced resample rather
  # than on every held-out sample exactly once - confirm this is intended.
  sampler_test = _balanced_sampler(test_dataset, test_samples)

  trainloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=sampler_train, collate_fn=collater, drop_last=False)
  testloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, sampler=sampler_test, collate_fn=collater, drop_last=False)

  logging.info(f"all data: {data_size}")
  logging.info(f"train data: {len(trainloader.dataset)}")
  logging.info(f"test data: {len(testloader.dataset)}")

  ligase_pocket_model = GraphConv(num_embeddings=5)
  target_pocket_model = GraphConv(num_embeddings=5)
  PROTAC_model = GraphConv(num_embeddings=10)
  model = ProtacModel(
      ligase_pocket_model,
      target_pocket_model,
      PROTAC_model,
  )
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  writer = SummaryWriter(f'runs/{TRAIN_NAME}')
  try:
    model, train_loss, val_loss = train(
        model,
        train_loader=trainloader,
        valid_loader=testloader,
        device=device,
        writer=writer,
        LOSS_NAME=TRAIN_NAME,
        batch_size=BATCH_SIZE,
        epoch=EPOCH,
        lr=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY,
    )
  finally:
    # Flush pending TensorBoard events even if training aborts; closing an
    # already closed writer is a no-op.
    writer.close()

  # Detach the log handlers and close them so the log file is released.
  for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)
    handler.close()
  return train_loss, val_loss

def _join_parts(head, tail):
  """Concatenate two slices of the same container end to end.

  Tensors are joined with ``torch.cat`` because ``+`` on tensors is
  element-wise addition; every other container uses ``+`` as before.
  """
  if torch.is_tensor(head):
    return torch.cat([head, tail])
  return head + tail


def main_large(second_half_start=949):
  """Train the PROTAC model on the large dataset.

  The data is treated as two consecutive segments, ``[0, second_half_start)``
  and ``[second_half_start, end)``. The first ``train_size`` entries of each
  segment form the training set and the rest of each segment the validation
  set, where ``train_size = int(len(name_list) / 2 * TRAIN_RATE)``.

  Args:
    second_half_start: index at which the second segment begins.
      NOTE(review): presumably len(name_list) // 2 for this dataset - confirm.

  Returns:
    (train_loss, val_loss) exactly as returned by ``train``.
  """
  root = "large_dataset/data"
  with open('large_dataset/name.pkl','rb') as f:
    name_list = pickle.load(f)
  train_size = int(len(name_list) / 2 * TRAIN_RATE)

  # The four index ranges that define the split, shared by every container.
  train_head = slice(None, train_size)
  train_tail = slice(second_half_start, second_half_start + train_size)
  test_head = slice(train_size, second_half_start)
  test_tail = slice(second_half_start + train_size, None)

  def split(container):
    """Return the (train, test) parts of ``container`` for the ranges above."""
    train_part = _join_parts(container[train_head], container[train_tail])
    test_part = _join_parts(container[test_head], container[test_tail])
    return train_part, test_part

  # Each graph dataset is loaded once and sliced, instead of once per slice.
  train_ligase_pocket, test_ligase_pocket = split(GraphData("ligase_pocket", root))
  train_target_pocket, test_target_pocket = split(GraphData("target_pocket", root))
  train_PROTAC, test_PROTAC = split(GraphData("PROTAC", root))
  train_name, test_name = split(name_list)
  train_label, test_label = split(torch.load(root+"/processed/label.pt"))

  train_set = PROTACSet(
    train_name,
    train_ligase_pocket,
    train_target_pocket,
    train_PROTAC,
    train_label,
  )

  valid_set = PROTACSet(
    test_name,
    test_ligase_pocket,
    test_target_pocket,
    test_PROTAC,
    test_label,
  )

  data_size = len(train_set) + len(valid_set)
  train_size = len(train_set)
  test_size = len(valid_set)
  logging.info(f"all data: {data_size}")
  logging.info(f"train data: {train_size}")
  logging.info(f"test data: {test_size}")
  trainloader = DataLoader(train_set, batch_size=BATCH_SIZE, collate_fn=collater, drop_last=False, shuffle=True)
  testloader = DataLoader(valid_set, batch_size=BATCH_SIZE, collate_fn=collater, drop_last=False, shuffle=True)

  ligase_pocket_model = GraphConv(num_embeddings=5)
  target_pocket_model = GraphConv(num_embeddings=5)
  PROTAC_model = GraphConv(num_embeddings=10)

  model = ProtacModel(
      ligase_pocket_model,
      target_pocket_model,
      PROTAC_model,
  )
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  writer = SummaryWriter(f'runs/{TRAIN_NAME}')
  try:
    model, train_loss, val_loss = train(
        model,
        train_loader=trainloader,
        valid_loader=testloader,
        device=device,
        writer=writer,
        LOSS_NAME=TRAIN_NAME,
        batch_size=BATCH_SIZE,
        epoch=EPOCH,
        lr=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY,
    )
  finally:
    # Flush pending TensorBoard events even if training aborts; closing an
    # already closed writer is a no-op.
    writer.close()

  # Detach the log handlers and close them so the log file is released.
  for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)
    handler.close()
  return train_loss, val_loss

In [8]:
# Run exactly one training entry point: keep the call you want active and
# comment out the one you do NOT want to run.
train_loss, val_loss = main_small()
#train_loss, val_loss = main_large()

In [9]:
# Detach every handler from the root logger and close it, so the training log
# file is released before the evaluation cell runs.
for handler in logging.root.handlers[:]:
      logging.root.removeHandler(handler)
      handler.close()

In [10]:
from train_and_test2 import test

# Evaluate the saved model on the held-out test dataset.
# A dedicated constant is used so the training BATCH_SIZE from the config cell
# is not overwritten (re-running the training cell afterwards would otherwise
# silently train with a batch size of 1).
TEST_BATCH_SIZE = 1
root = "test_dataset/data"
ligase_pocket = GraphData("ligase_pocket", root)
target_pocket = GraphData("target_pocket", root)
PROTAC = GraphData("PROTAC", root)
with open('test_dataset/name.pkl','rb') as f:
    name_list = pickle.load(f)
label = torch.load(os.path.join(target_pocket.processed_dir, "label.pt"))

test_set = PROTACSet(
    name_list,
    ligase_pocket,
    target_pocket,
    PROTAC,
    label,
)
testloader = DataLoader(test_set, batch_size=TEST_BATCH_SIZE, collate_fn=collater, drop_last=False)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# The checkpoint holds the whole pickled model object, so no fresh model needs
# to be constructed first. map_location lets a GPU-trained checkpoint load on a
# CPU-only runtime; weights_only=False is required for full-object checkpoints
# on newer torch releases. Unpickling executes code - only load checkpoints
# produced by this project.
# The file name matches TRAIN_NAME from the config cell.
model = torch.load(
    "model/final_model_b149_e897_t0.8_l0.0054_w0.0014_weighted_sample.pt",
    map_location=device,
    weights_only=False,
)
val_loss, val_acc, auroc, y_pred, y_true = test(model, testloader, device=device)

In [11]:
# Print accuracy, AUROC, predictions and ground-truth labels, one per line.
# NOTE(review): in the saved output every prediction is 1, so the reported
# accuracy (0.6875) is just the share of positive labels in y_true (11 of 16).
for metric in (val_acc, auroc, y_pred, y_true):
    print(metric)

0.6875
0.6363636363636365
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0]




```
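# Alternative sampler setup kept for reference (not executed): it draws twice the split size instead of the fixed 1800 / 500 samples used in main_small.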
  label_train = []
  for data_sample in train_dataset:
        label_train.append(data_sample["label"])
  lu_train, count_train = np.unique(label_train,return_counts=True)
  cw_train = [sum(count_train) / c for c in count_train]
  ew_train = [cw_train[e] for e in label_train]
  sampler_train = WeightedRandomSampler(weights=ew_train, num_samples = int(2*len(label_train)), replacement=True)

  label_test = []
  for data_sample in test_dataset:
        label_test.append(data_sample["label"])
  lu_test, count_test = np.unique(label_test,return_counts=True)
  cw_test = [sum(count_test) / c for c in count_test]
  ew_test = [cw_test[e] for e in label_test]
  sampler_test = WeightedRandomSampler(weights=ew_test, num_samples = int(2*len(label_test)), replacement=True)
```