In [1]:
from torch import nn
import torch

# Choose the compute backend: CUDA when present, otherwise Apple's MPS,
# otherwise fall back to the CPU. `device` is kept as a plain string.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Last expression of the cell, so the resolved torch.device is displayed.
torch.device(device)

device(type='mps')

In [2]:
import pandas as pd

# Read the RCSB PDB structure table; the path is resolved relative to the
# kernel's current working directory.
rcsb_data = pd.read_csv(
    filepath_or_buffer="data/rcsb/RCSB_PDB_Macromolecular_Structure_Dataset_with_Structural_Features.csv",
)

In [3]:
# Keep only the columns used downstream, then discard every row that is
# missing a value in any of the Helix / Sheet / Coil columns.
filtered_data = (
    rcsb_data[
        [
            "Sequence",
            "Number of Residues",
            "Molecular Weight per Deposited Model",
            "Molecular Weight (Entity)",
            "R Free",
            "R Work",
            "Helix",
            "Sheet",
            "Coil",
        ]
    ]
    .dropna(subset=["Helix", "Sheet", "Coil"])
)

In [4]:
# Integer id for each supported one-letter residue code. Built once at
# definition time instead of on every call. No residue is assigned id 0;
# the notebook uses 0 as the padding value when equalising sequence lengths.
_AMINO_ACID_TOKENS = {
    "A": 1,
    "C": 2,
    "D": 3,
    "E": 4,
    "F": 5,
    "G": 6,
    "H": 7,
    "I": 8,
    "K": 9,
    "L": 10,
    "M": 11,
    "N": 12,
    "P": 13,
    "Q": 14,
    "R": 15,
    "S": 16,
    "T": 17,
    "V": 18,
    "W": 19,
    "Y": 20,
    "X": 21,
    "U": 22,
    "O": 23,
}


def amino_acid_tokenizer(amino_acid: str) -> torch.Tensor:
    """Convert a residue string into a 1-D tensor of integer token ids.

    Args:
        amino_acid: Sequence of one-letter residue codes. Codes are matched
            case-sensitively against the table above.

    Returns:
        A ``torch.int`` tensor with one id per character of ``amino_acid``
        (an empty tensor for an empty string).

    Raises:
        KeyError: If the string contains a character that has no token id.
            The message names the offending character and its index.
    """
    token_ids = []
    for position, residue in enumerate(amino_acid):
        token = _AMINO_ACID_TOKENS.get(residue)
        if token is None:
            # Still a KeyError (as a plain dict lookup would raise), but with
            # enough context to locate the bad residue in a long sequence.
            raise KeyError(f"unknown residue code {residue!r} at index {position}")
        token_ids.append(token)
    return torch.tensor(token_ids, dtype=torch.int)

In [5]:
# Swap each raw sequence string for its integer token tensor.
filtered_data["Sequence"] = filtered_data["Sequence"].apply(amino_acid_tokenizer)

# Right-pad every token tensor with zeros up to the longest sequence so that
# all rows end up with the same length.
max_sequence_length = filtered_data["Sequence"].map(len).max()
filtered_data["Sequence"] = filtered_data["Sequence"].apply(
    lambda tokens: torch.nn.functional.pad(
        tokens,
        (0, max_sequence_length - len(tokens)),
        mode="constant",
        value=0,
    )
)

In [6]:
# Fixed seed so the shuffle, and therefore which rows land in the train /
# validation / test partitions, is the same on every Restart & Run All.
SPLIT_SEED = 42

# Shuffle all rows, then renumber them 0..n-1.
scrambled_data = filtered_data.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)

data_size = len(scrambled_data)
# NOTE: `train_size` is the combined size of the training and validation
# partitions (80% of the rows); the validation rows are carved out of it.
train_size = int(data_size * 0.8)
test_size = data_size - train_size
validation_size = int(train_size * 0.2)

# Layout of the shuffled frame: [ train | validation | test ].
train_data = scrambled_data.iloc[:train_size - validation_size].reset_index(drop=True)
validation_data = scrambled_data.iloc[train_size - validation_size:train_size].reset_index(drop=True)
test_data = scrambled_data.iloc[train_size:].reset_index(drop=True)

# The three partitions must cover every row exactly once.
assert len(train_data) + len(validation_data) + len(test_data) == data_size

In [7]:
# Columns fed to the model versus columns it is asked to produce.
input_columns = [
    "Sequence",
    "Number of Residues",
    "Molecular Weight per Deposited Model",
    "Molecular Weight (Entity)",
    "R Free",
    "R Work",
]
output_columns = ["Helix", "Sheet", "Coil"]

# Split each partition into its input frame and its output frame.
train_input_df = train_data[input_columns]
train_output_df = train_data[output_columns]
validation_input_df = validation_data[input_columns]
validation_output_df = validation_data[output_columns]
test_input_df = test_data[input_columns]
test_output_df = test_data[output_columns]

In [17]:
def create_dataset(input_df, output_df, dtype=torch.float64):
    """Bundle one data partition into a ``TensorDataset``.

    Args:
        input_df: DataFrame with a ``"Sequence"`` column holding one token
            tensor per row, plus numeric feature columns. All sequence
            tensors must share the same shape so they can be stacked.
        output_df: DataFrame of numeric target columns, row-aligned with
            ``input_df``.
        dtype: Floating-point dtype for the numeric features and the targets.
            Defaults to ``torch.float64`` (the previous hard-coded value).
            Pass ``torch.float32`` when the tensors will be moved to the
            ``"mps"`` device, which does not support float64.

    Returns:
        ``torch.utils.data.TensorDataset`` whose items are
        ``(numeric_features, sequence_tokens, targets)``.
    """
    # Every column except the token tensors is treated as a numeric feature.
    numeric_features = input_df.drop(columns=["Sequence"]).to_numpy(dtype="float64")
    input_tensors = torch.tensor(numeric_features, dtype=dtype)

    # Stack the per-row token tensors into one (rows, sequence_length) tensor.
    input_tensor_sequences = torch.stack(list(input_df["Sequence"]), 0)

    output_tensors = torch.tensor(output_df.to_numpy(dtype="float64"), dtype=dtype)

    return torch.utils.data.TensorDataset(input_tensors, input_tensor_sequences, output_tensors)

In [18]:
# Build one TensorDataset per partition, in train / validation / test order.
train_dataset, validation_dataset, test_dataset = (
    create_dataset(partition_inputs, partition_outputs)
    for partition_inputs, partition_outputs in (
        (train_input_df, train_output_df),
        (validation_input_df, validation_output_df),
        (test_input_df, test_output_df),
    )
)

In [24]:
# Single source of truth for the mini-batch size used by all three loaders.
BATCH_SIZE = 32

# Only the training loader reshuffles each epoch. The validation and test
# loaders keep a fixed order so evaluation batches are reproducible.
training_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
validation_dataloader = torch.utils.data.DataLoader(validation_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)