In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# project files stored on Drive can be read and written by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Install PyTorch Geometric and dependencies for PyTorch 2.0.1 + CUDA 11.8
!pip install torch==2.0.1+cu118 torchvision==0.15.2+cu118 torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cu118
!pip install torch-geometric torch-scatter torch-sparse torch-cluster torch-spline-conv -f https://data.pyg.org/whl/torch-2.0.1+cu118.html


Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting torch==2.0.1+cu118
  Downloading https://download.pytorch.org/whl/cu118/torch-2.0.1%2Bcu118-cp311-cp311-linux_x86_64.whl (2267.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 GB[0m [31m563.3 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchvision==0.15.2+cu118
  Downloading https://download.pytorch.org/whl/cu118/torchvision-0.15.2%2Bcu118-cp311-cp311-linux_x86_64.whl (6.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.1/6.1 MB[0m [31m69.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchaudio==2.0.2
  Downloading https://download.pytorch.org/whl/cu118/torchaudio-2.0.2%2Bcu118-cp311-cp311-linux_x86_64.whl (4.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.4/4.4 MB[0m [31m46.6 MB/s[0m eta [36m0:00:00[0m
Collecting triton==2.0.0 (from torch==2.0.1+cu118)
  Downloading https://download.pytorch.org/whl/triton-2.0.0-1-cp311-cp311-ma

In [3]:
#Setting up paths

import os
import torch
import pandas as pd
from tqdm import tqdm
from torch_geometric.data import Data

# Pick the compute device: the GPU when CUDA is available, otherwise the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using Device: {device}")

# Project locations on the mounted Drive; the data folder sits directly under the root
project_root = "/content/drive/MyDrive/Rebuilding_and_Modifying_GraphDTA"
data_path = os.path.join(project_root, "data")



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py", line 37, in <module>
    ColabKernelApp.launch_instance()
  File "/usr/local/lib/python3.11/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelapp.py", line 712, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.11/dist-package

Using Device: cpu


In [5]:
#Preprocessing protein sequences

import torch
from tqdm import tqdm

# Read the KIBA affinity table (CSV) from the project data directory
kiba_df = pd.read_csv(filepath_or_buffer=f"{data_path}/kiba_affinity_df.csv")

# Amino-acid vocabulary: one-letter residue codes, in the order used for numbering
AMINO_ACIDS = 'ACDEFGHIKLMNPQRSTVWY'

# Residue letter -> integer code, numbered from 1; 0 is used for padding and unknown residues
AA_TO_INDEX = dict(zip(AMINO_ACIDS, range(1, len(AMINO_ACIDS) + 1)))
MAX_LEN = 1000

# Sequence encoder function
def encode_sequence(seq, max_len=MAX_LEN):
  """Encode a residue string as a fixed-length tensor of integer codes.

  Args:
    seq: Sequence of one-letter residue codes. Only the first ``max_len``
      characters are used.
    max_len: Target length of the encoding (defaults to ``MAX_LEN``).

  Returns:
    A 1-D ``torch.long`` tensor. Each residue found in ``AA_TO_INDEX`` is
    replaced by its code; any other character (including lowercase letters)
    becomes 0. Sequences shorter than ``max_len`` are right-padded with 0.
  """
  codes = []
  for residue in seq[:max_len]:
    codes.append(AA_TO_INDEX.get(residue, 0))
  # Right-pad with zeros up to max_len (adds nothing when already long enough)
  codes.extend([0] * (max_len - len(codes)))
  return torch.tensor(codes, dtype=torch.long)

# Build protein tensor dict: one encoded tensor per Protein_Index.
# Only the first row seen for each index is encoded; later duplicates are skipped.
protein_seqs = {}

for _, row in tqdm(kiba_df.iterrows(), total=kiba_df.shape[0]):
  prot_idx, seq = row["Protein_Index"], row["Sequence"]
  if prot_idx in protein_seqs:
    continue
  protein_seqs[prot_idx] = encode_sequence(seq)

# Save to drive: write the {Protein_Index: encoded tensor} dict into the data directory
torch.save(protein_seqs, f"{data_path}/kiba_protein_seqs.pt")
# Report how many distinct proteins were encoded (fixes the "unqiue" typo in the message)
print(f"Saved {len(protein_seqs)} unique protein sequence tensors.")

100%|██████████| 118254/118254 [00:07<00:00, 16708.46it/s]

Saved 229 unqiue protein sequence tensors.



