In [2]:
import torch
from transformers import AutoTokenizer
import re

# Reading and Preprocessing the Dataset
fasta_file = r'C:\Users\Sukirtha\Downloads\astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.fa.txt'

# Define valid amino acids for cleaning
valid_amino_acids = set('ACDEFGHIKLMNPQRSTVWYBXZ')  # Use uppercase

def clean_sequence(sequence):
    # Convert sequence to uppercase before filtering
    sequence = sequence.upper()
    
    # Only keep valid amino acids in the sequence
    cleaned_sequence = ''.join([aa for aa in sequence if aa in valid_amino_acids])
    
    if len(cleaned_sequence) == 0:
        print(f"Warning: Sequence cleaned to empty string. Original: {sequence}")
        print(f"Unique characters in problematic sequence: {set(sequence)}")  # Print unique characters
    
    return cleaned_sequence

def read_fasta(fasta_file):
    sequences = []
    labels = []
    cleaned_sequences = []
    
    with open(fasta_file, 'r') as file:
        sequence = ''
        label = ''
        
        for line in file:
            if line.startswith('>'):
                if sequence:  # Append previous sequence before reading a new one
                    cleaned_sequence = clean_sequence(sequence)
                    if len(cleaned_sequence) == 0:
                        print(f"Problematic sequence: {sequence[:50]}...")  # Print first 50 chars of problematic sequences
                    sequences.append(sequence)
                    cleaned_sequences.append(cleaned_sequence)
                    fold_class = extract_fold_class(label)  # Extract fold class as label
                    labels.append(fold_class)
                label = line.strip()
                sequence = ''
            else:
                sequence += line.strip()
        
        if sequence:  # Append the last sequence
            cleaned_sequence = clean_sequence(sequence)
            if len(cleaned_sequence) == 0:
                print(f"Problematic sequence: {sequence[:50]}...")
            sequences.append(sequence)
            cleaned_sequences.append(cleaned_sequence)
            fold_class = extract_fold_class(label)  # Extract fold class as label
            labels.append(fold_class)
    
    return cleaned_sequences, labels

# Extract fold class from label
def extract_fold_class(label):
    match = re.search(r'([a-z]\.\d+\.\d+\.\d+)', label)
    if match:
        return match.group(1)
    return None

# Insert spaces between amino acids in the cleaned sequence
def add_spaces_to_sequence(sequence):
    return ' '.join(list(sequence))

cleaned_sequences, labels = read_fasta(fasta_file)

# Preview cleaned sequences and labels
print("Total Cleaned Sequences:", len(cleaned_sequences))
if len(cleaned_sequences) > 0:
    print("First Cleaned Sequence:", cleaned_sequences[0])
    print("First Label (Fold Class):", labels[0])

# Tokenize the sequences using ProtBERT
tokenizer = AutoTokenizer.from_pretrained('Rostlab/prot_bert', do_lower_case=False)

# Set max length for tokenization
MAX_LEN = 512  # Adjust this based on your GPU memory capacity

# Add spaces between amino acids for tokenization
spaced_sequence = add_spaces_to_sequence(cleaned_sequences[0])

# Tokenize the spaced sequence
if len(spaced_sequence) > 0:
    encoded_input = tokenizer(spaced_sequence, return_tensors='pt', padding=True, truncation=True, max_length=MAX_LEN)
    print("Encoded Input for First Cleaned Sequence:", encoded_input)
    decoded_sequence = tokenizer.decode(encoded_input['input_ids'][0], skip_special_tokens=True)
    print("Decoded First Cleaned Sequence:", decoded_sequence)
else:
    print("Warning: First cleaned sequence is empty after cleaning.")

# Testing with a valid sequence of standard amino acids
test_sequence = "ACDEFGHIKLMNPQRSTVWY"  # Example of a valid sequence
spaced_test_sequence = add_spaces_to_sequence(test_sequence)
encoded_input = tokenizer(spaced_test_sequence, return_tensors='pt', padding=True, truncation=True, max_length=MAX_LEN)
print("Encoded Input for Valid Test Sequence:", encoded_input)
decoded_test_sequence = tokenizer.decode(encoded_input['input_ids'][0], skip_special_tokens=True)
print("Decoded Valid Test Sequence:", decoded_test_sequence)

# Display Tokenizer Vocabulary Size
print("Tokenizer Vocabulary Size:", len(tokenizer.get_vocab()))


Total Cleaned Sequences: 15177
First Cleaned Sequence: SLFEQLGGQAAVQAVTAQFYANIQADATVATFFNGIDMPNQTNKTAAFLCAALGGPNAWTGRNLKEVHANMGVSNAQFTTVIGHLRSALTGAGVAAALVEQTVAVAETVRGDVVTV
First Label (Fold Class): a.1.1.1
Encoded Input for First Cleaned Sequence: {'input_ids': tensor([[ 2, 10,  5, 19,  9, 18,  5,  7,  7, 18,  6,  6,  8, 18,  6,  8, 15,  6,
         18, 19, 20,  6, 17, 11, 18,  6, 14,  6, 15,  8,  6, 15, 19, 19, 17,  7,
         11, 14, 21, 16, 17, 18, 15, 17, 12, 15,  6,  6, 19,  5, 23,  6,  6,  5,
          7,  7, 16, 17,  6, 24, 15,  7, 13, 17,  5, 12,  9,  8, 22,  6, 17, 21,
          7,  8, 10, 17,  6, 18, 19, 15, 15,  8, 11,  7, 22,  5, 13, 10,  6,  5,
         15,  7,  6,  7,  8,  6,  6,  6,  5,  8,  9, 18, 15,  8,  6,  8,  6,  9,
         15,  8, 13,  7, 14,  8,  8, 15,  8,  3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0,

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer
import re
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

# Reading and Preprocessing the Dataset
fasta_file = r'C:\Users\Sukirtha\Downloads\astral-scopedom-seqres-gd-sel-gs-bib-40-2.08.fa.txt'
valid_amino_acids = set('ACDEFGHIKLMNPQRSTVWYBXZ')

def clean_sequence(sequence):
    sequence = sequence.upper()
    cleaned_sequence = ''.join([aa for aa in sequence if aa in valid_amino_acids])
    return cleaned_sequence

def read_fasta(fasta_file):
    sequences, labels, cleaned_sequences = [], [], []
    with open(fasta_file, 'r') as file:
        sequence, label = '', ''
        for line in file:
            if line.startswith('>'):
                if sequence:
                    cleaned_sequence = clean_sequence(sequence)
                    sequences.append(sequence)
                    cleaned_sequences.append(cleaned_sequence)
                    fold_class = extract_fold_class(label)
                    labels.append(fold_class)
                label = line.strip()
                sequence = ''
            else:
                sequence += line.strip()
        if sequence:
            cleaned_sequence = clean_sequence(sequence)
            sequences.append(sequence)
            cleaned_sequences.append(cleaned_sequence)
            fold_class = extract_fold_class(label)
            labels.append(fold_class)
    return cleaned_sequences, labels

def extract_fold_class(label):
    match = re.search(r'([a-z]\.\d+\.\d+\.\d+)', label)
    return match.group(1) if match else None

def add_spaces_to_sequence(sequence):
    return ' '.join(list(sequence))

# Load data
cleaned_sequences, labels = read_fasta(fasta_file)
tokenizer = AutoTokenizer.from_pretrained('Rostlab/prot_bert', do_lower_case=False)
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)
label_to_id = {label: idx for idx, label in enumerate(label_encoder.classes_)}
id_to_label = {idx: label for label, idx in label_to_id.items()}

class ProteinDataset(Dataset):
    def __init__(self, sequences, labels, tokenizer, max_len):
        self.sequences = sequences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        sequence = add_spaces_to_sequence(self.sequences[idx])
        inputs = self.tokenizer(sequence, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_len)
        input_ids = inputs['input_ids'].squeeze()
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return input_ids, label

# Model Parameters
input_size = len(tokenizer)
hidden_size = 512
num_layers = 3
num_classes = len(label_to_id)
dropout_rate = 0.3
num_epochs = 150
learning_rate = 0.0001
batch_size = 32
max_len = 512

# Data Loaders
dataset = ProteinDataset(cleaned_sequences, encoded_labels, tokenizer, max_len)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Define Embedding Layer Size based on Tokenizer Vocabulary
embedding_dim = 128  # Adjust as needed

# Define Bidirectional LSTM Model
class BiLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers, num_classes, dropout_rate):
        super(BiLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers, bidirectional=True, dropout=dropout_rate, batch_first=True)
        self.fc = nn.Linear(hidden_size * 2, num_classes)  # *2 for bidirectional

    def forward(self, x):
        x = self.embedding(x)  # Shape: (batch_size, seq_length, embedding_dim)
        batch_size = x.size(0)
        h0 = torch.zeros(num_layers * 2, batch_size, hidden_size).to(x.device)
        c0 = torch.zeros(num_layers * 2, batch_size, hidden_size).to(x.device)
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

# Define Bidirectional GRU Model
class BiGRU(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers, num_classes, dropout_rate):
        super(BiGRU, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_size, num_layers, bidirectional=True, dropout=dropout_rate, batch_first=True)
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, x):
        x = self.embedding(x)  # Shape: (batch_size, seq_length, embedding_dim)
        batch_size = x.size(0)
        h0 = torch.zeros(num_layers * 2, batch_size, hidden_size).to(x.device)
        out, _ = self.gru(x, h0)
        out = self.fc(out[:, -1, :])
        return out

# Training Function
def train_model(model, data_loader, num_epochs, learning_rate):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    for epoch in range(num_epochs):
        total_loss, correct, total = 0, 0, 0
        for inputs, labels in data_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
        accuracy = correct / total
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss:.4f}, Accuracy: {accuracy:.4f}")

# Initialize and Train Models
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
lstm_model = BiLSTM(input_size, embedding_dim, hidden_size, num_layers, num_classes, dropout_rate).to(device)
gru_model = BiGRU(input_size, embedding_dim, hidden_size, num_layers, num_classes, dropout_rate).to(device)

print("Training Bidirectional LSTM...")
train_model(lstm_model, data_loader, num_epochs, learning_rate)

print("\nTraining Bidirectional GRU...")
train_model(gru_model, data_loader, num_epochs, learning_rate)


Training Bidirectional LSTM...
Epoch 1/150, Loss: 0.6242, Accuracy: 0.5000
Epoch 2/150, Loss: 0.6047, Accuracy: 0.5024
Epoch 3/150, Loss: 0.6784, Accuracy: 0.5048
Epoch 4/150, Loss: 0.6914, Accuracy: 0.5072
Epoch 5/150, Loss: 0.5684, Accuracy: 0.5096
Epoch 6/150, Loss: 0.5402, Accuracy: 0.5120
Epoch 7/150, Loss: 0.6390, Accuracy: 0.5144
Epoch 8/150, Loss: 0.5358, Accuracy: 0.5168
Epoch 9/150, Loss: 0.5050, Accuracy: 0.5192
Epoch 10/150, Loss: 0.6867, Accuracy: 0.5216
Epoch 11/150, Loss: 0.5473, Accuracy: 0.5240
Epoch 12/150, Loss: 0.5149, Accuracy: 0.5264
Epoch 13/150, Loss: 0.6840, Accuracy: 0.5288
Epoch 14/150, Loss: 0.6036, Accuracy: 0.5312
Epoch 15/150, Loss: 0.6830, Accuracy: 0.5336
Epoch 16/150, Loss: 0.5251, Accuracy: 0.5360
Epoch 17/150, Loss: 0.5093, Accuracy: 0.5384
Epoch 18/150, Loss: 0.6385, Accuracy: 0.5408
Epoch 19/150, Loss: 0.6253, Accuracy: 0.5432
Epoch 20/150, Loss: 0.6639, Accuracy: 0.5456
Epoch 21/150, Loss: 0.6532, Accuracy: 0.5480
Epoch 22/150, Loss: 0.6772, Accur