In [None]:
#Mount Google Drive into the Colab filesystem so project files are reachable
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import pandas as pd

#Project folder on the mounted Drive and the data folder inside it
project_root = "/content/drive/MyDrive/Rebuilding_and_Modifying_GraphDTA"
data_path = f"{project_root}/data"

#Read proteins.csv into a dataframe and preview its first rows
proteins_csv = f"{data_path}/proteins.csv"
proteins_df = pd.read_csv(proteins_csv)
proteins_df.head()

Unnamed: 0,Protein_Index,Accession_Number,Gene_Name,Sequence
0,0,NP_055726.3,AAK1,MKKFFDSRREQGGSGLGSGSSGGGGSTSGLGSGYIGRVFGIGRQQV...
1,1,NP_005148.2,ABL1(E255K)-phosphorylated,MLEICLKLVGCKSKKGLSSSSSCYLEEALQRPVASDFEPQGLSEAA...
2,3,NP_005148.2,ABL1(F317I)-phosphorylated,MLEICLKLVGCKSKKGLSSSSSCYLEEALQRPVASDFEPQGLSEAA...
3,5,NP_005148.2,ABL1(F317L)-phosphorylated,MLEICLKLVGCKSKKGLSSSSSCYLEEALQRPVASDFEPQGLSEAA...
4,7,NP_005148.2,ABL1(H396P)-phosphorylated,MLEICLKLVGCKSKKGLSSSSSCYLEEALQRPVASDFEPQGLSEAA...


In [None]:
#Character index mapping
import torch #Imported here so this cell does not rely on a later cell's import

#Amino acid vocabulary (one-letter codes)
AMINO_ACIDS = 'ACDEFGHIKLMNPQRSTVWY'
#Indices start at 1; 0 is used for any character that is not in AMINO_ACIDS
AA_TO_IDX = {aa: idx+1 for idx, aa in enumerate(AMINO_ACIDS)}

def sequence_to_tensor(seq, max_len=1000):
  """Converts a protein sequence to a 1-D tensor of residue indices.

  Args:
    seq: Protein sequence as a string of one-letter residue codes.
    max_len: Maximum number of residues to keep; anything beyond it is
      cut off. ``None`` keeps the whole sequence.

  Returns:
    A ``torch.long`` tensor with one entry per kept residue: the 1-based
    position of the residue in ``AMINO_ACIDS``, or 0 for any character
    not in that vocabulary. The result is not padded to ``max_len``.

  Raises:
    ValueError: If ``max_len`` is negative.
    TypeError: If ``seq`` cannot be sliced (e.g. a NaN read from a
      missing CSV value).
  """
  #A negative max_len would make seq[:max_len] drop residues from the tail
  #instead of capping the length, so reject it explicitly.
  if max_len is not None and max_len < 0:
    raise ValueError(f"max_len must be non-negative, got {max_len!r}")
  try:
    trimmed = seq[:max_len] #Trimming
  except TypeError as exc:
    raise TypeError(
      f"cannot take seq[:max_len] with seq of type {type(seq).__name__} "
      f"and max_len={max_len!r}; seq must be a string of residue letters"
    ) from exc
  idxs = [AA_TO_IDX.get(aa, 0) for aa in trimmed] #Unknown AAs become 0
  return torch.tensor(idxs, dtype = torch.long)

In [None]:
#Convert protein sequences from dataframe to indexed tensors
import torch
from tqdm import tqdm

#protein_tensors is keyed by Protein_Index, so a repeated index would silently
#overwrite an earlier protein. Fail loudly instead of losing entries.
duplicated_idx = proteins_df['Protein_Index'].duplicated()
if duplicated_idx.any():
  repeated = proteins_df.loc[duplicated_idx, 'Protein_Index'].unique().tolist()
  raise ValueError(f"Duplicate Protein_Index values in proteins_df: {repeated}")

protein_tensors = {}

#Walk only the two columns that are needed instead of building a full row
#Series per protein with iterrows(); .tolist() hands back plain Python values.
index_seq_pairs = zip(
  proteins_df['Protein_Index'].tolist(),
  proteins_df['Sequence'].tolist(),
)
for idx, seq in tqdm(index_seq_pairs, total = len(proteins_df)):
  protein_tensors[idx] = sequence_to_tensor(seq)

100%|██████████| 433/433 [00:00<00:00, 3714.62it/s]


In [None]:
#Write the Protein_Index -> tensor dictionary to the data folder
tensor_file = f"{data_path}/davis_protein_tensors.pt"
torch.save(protein_tensors, tensor_file)
print(f"Saved {len(protein_tensors)} protein tensors.")

Saved 433 protein tensors.


In [None]:
#Sanity check: read the saved file back and show the entry stored under key 0
saved_tensor_file = f"{data_path}/davis_protein_tensors.pt"
loaded_proteins = torch.load(saved_tensor_file)
print(f"Example: Protein 0 \n{loaded_proteins[0]}")

Example: Protein 0 
tensor([11,  9,  9,  5,  5,  3, 16, 15, 15,  4, 14,  6,  6, 16,  6, 10,  6, 16,
         6, 16, 16,  6,  6,  6,  6, 16, 17, 16,  6, 10,  6, 16,  6, 20,  8,  6,
        15, 18,  5,  6,  8,  6, 15, 14, 14, 18, 17, 18,  3,  4, 18, 10,  1,  4,
         6,  6,  5,  1,  8, 18,  5, 10, 18, 15, 17, 16, 12,  6, 11,  9,  2,  1,
        10,  9, 15, 11,  5, 18, 12, 12,  4,  7,  3, 10, 14, 18,  2,  9, 15,  4,
         8, 14,  8, 11, 15,  3, 10, 16,  6,  7,  9, 12,  8, 18,  6, 20,  8,  3,
        16, 16,  8, 12, 12, 18, 16, 16,  6,  3, 18, 19,  4, 18, 10,  8, 10, 11,
         3,  5,  2, 15,  6,  6, 14, 18, 18, 12, 10, 11, 12, 14, 15, 10, 14, 17,
         6,  5, 17,  4, 12,  4, 18, 10, 14,  8,  5,  2,  3, 17,  2,  4,  1, 18,
         1, 15, 10,  7, 14,  2,  9, 17, 13,  8,  8,  7, 15,  3, 10,  9, 18,  4,
        12,  8, 10, 10,  7,  3, 15,  6,  7, 20, 18, 10,  2,  3,  5,  6, 16,  1,
        17, 12,  9,  5, 14, 12, 13, 14, 17,  4,  6, 18, 12,  1, 18,  4,  3,  4,
         8,  9,  9, 