In [21]:
import numpy as np
import torch
import torch.nn as nn
import pandas as pd
import torch.optim as optim

In [22]:
# Define the alphabet of amino acids and secondary structure
alphabet = "ACDEFGHIKLMNPQRSTVWXYZ*"
struct_alphabet = "CEH"

# Define a function to encode the sequences and structures as one-hot vectors
def one_hot_encode(seq, struct):
    seq_enc = np.zeros((len(seq), len(alphabet)), dtype=np.float32)
    struct_enc = np.zeros((len(seq), len(struct_alphabet)), dtype=np.float32)
    for i, (aa, ss) in enumerate(zip(seq, struct)):
        seq_enc[i, alphabet.index(aa)] = 1
        struct_enc[i, struct_alphabet.index(ss)] = 1
    return seq_enc, struct_enc

one_hot_encode("ACDEFGHIKLMNPQRSTVWXYZ*", "CEH")

(array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],
 

In [23]:
# Define a function to split the data into training, validation, and test sets
def split_data(data, ratio=(0.8, 0.1, 0.1)):
    n = len(data)
    n_train = int(n * ratio[0])
    n_val = int(n * ratio[1])
    n_test = n - n_train - n_val
    idx = np.random.permutation(n)
    train_idx = idx[:n_train]
    val_idx = idx[n_train:n_train+n_val]
    test_idx = idx[n_train+n_val:]
    train_data = [data[i] for i in train_idx]
    val_data = [data[i] for i in val_idx]
    test_data = [data[i] for i in test_idx]
    return train_data, val_data, test_data


In [24]:
# Define the model architecture
class SecondaryStructureModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SecondaryStructureModel, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, output_size),
        )

    def forward(self, x):
        return self.net(x)

In [25]:
data = pd.read_csv("./Data/2018-06-06-ss.cleaned.csv")
sequences = data.iloc[:, 2].values
structures = data.iloc[:, 4].values

encoded_data = []
for i in range(len(sequences)):
    seq_enc, struct_enc = one_hot_encode(sequences[i], structures[i])
    encoded_data.append((seq_enc, struct_enc))

print(encoded_data)
    