In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import os
import random

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


# Generate synthetic DNA-protein sequence pairs using the standard genetic code

In [None]:
# STEP 1: SYNTHETIC DATA GENERATION (Complete Standard Genetic Code)
# ============================================================================

def get_standard_genetic_code():
    """
    Returns the standard genetic code mapping (64 codons ‚Üí amino acids)

    Returns:
    --------
    genetic_code : dict
        Mapping from DNA codon (3 nucleotides) to amino acid (1 letter code)
        '*' represents stop codons
    """
    genetic_code = {
        # TTT, TTC -> Phenylalanine (F)
        'TTT': 'F', 'TTC': 'F',
        # TTA, TTG, CTT, CTC, CTA, CTG -> Leucine (L)
        'TTA': 'L', 'TTG': 'L', 'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
        # ATT, ATC, ATA -> Isoleucine (I)
        'ATT': 'I', 'ATC': 'I', 'ATA': 'I',
        # ATG -> Methionine (M) - Start codon
        'ATG': 'M',
        # GTT, GTC, GTA, GTG -> Valine (V)
        'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
        # TCT, TCC, TCA, TCG, AGT, AGC -> Serine (S)
        'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'AGT': 'S', 'AGC': 'S',
        # CCT, CCC, CCA, CCG -> Proline (P)
        'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
        # ACT, ACC, ACA, ACG -> Threonine (T)
        'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
        # GCT, GCC, GCA, GCG -> Alanine (A)
        'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
        # TAT, TAC -> Tyrosine (Y)
        'TAT': 'Y', 'TAC': 'Y',
        # TAA, TAG, TGA -> Stop codons (*)
        'TAA': '*', 'TAG': '*', 'TGA': '*',
        # CAT, CAC -> Histidine (H)
        'CAT': 'H', 'CAC': 'H',
        # CAA, CAG -> Glutamine (Q)
        'CAA': 'Q', 'CAG': 'Q',
        # AAT, AAC -> Asparagine (N)
        'AAT': 'N', 'AAC': 'N',
        # AAA, AAG -> Lysine (K)
        'AAA': 'K', 'AAG': 'K',
        # GAT, GAC -> Aspartic acid (D)
        'GAT': 'D', 'GAC': 'D',
        # GAA, GAG -> Glutamic acid (E)
        'GAA': 'E', 'GAG': 'E',
        # TGT, TGC -> Cysteine (C)
        'TGT': 'C', 'TGC': 'C',
        # TGG -> Tryptophan (W)
        'TGG': 'W',
        # CGT, CGC, CGA, CGG, AGA, AGG -> Arginine (R)
        'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', 'AGA': 'R', 'AGG': 'R',
        # GGT, GGC, GGA, GGG -> Glycine (G)
        'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G',
    }
    return genetic_code

def translate_dna_to_protein(dna_seq, genetic_code):
    """
    Translate DNA sequence to protein using standard genetic code

    Parameters:
    -----------
    dna_seq : str
        DNA sequence (A, T, G, C)
    genetic_code : dict
        Codon to amino acid mapping

    Returns:
    --------
    protein_seq : str
        Translated protein sequence (amino acids)
    """
    protein_seq = ''
    # Process in groups of 3 (codons)
    for i in range(0, len(dna_seq) - 2, 3):
        codon = dna_seq[i:i+3].upper()
        if len(codon) == 3:
            if codon in genetic_code:
                aa = genetic_code[codon]
                if aa == '*':  # Stop codon
                    break
                protein_seq += aa
            else:
                # Unknown codon (shouldn't happen with standard code)
                protein_seq += 'X'  # Unknown amino acid
    return protein_seq

def generate_dna_protein_pairs(n_sequences=5000, min_codons=10, max_codons=100):
    """
    Generate synthetic DNA-protein sequence pairs using standard genetic code

    Parameters:
    -----------
    n_sequences : int
        Number of sequence pairs to generate
    min_codons : int
        Minimum number of codons per sequence
    max_codons : int
        Maximum number of codons per sequence

    Returns:
    --------
    df : pd.DataFrame
        DataFrame with columns: dna_sequence, protein_sequence, dna_length, protein_length
    """
    print("\n" + "=" * 70)
    print("STEP 1: GENERATING SYNTHETIC DNA-PROTEIN PAIRS")
    print("=" * 70)

    nucleotides = ['A', 'T', 'G', 'C']
    genetic_code = get_standard_genetic_code()

    # Get all codons that code for amino acids (exclude stop codons)
    coding_codons = [codon for codon, aa in genetic_code.items() if aa != '*']

    sequences = []

    print(f"   Generating {n_sequences} sequence pairs...")
    print(f"   Using standard genetic code ({len(coding_codons)} coding codons)")

    for i in range(n_sequences):
        # Random number of codons
        n_codons = np.random.randint(min_codons, max_codons + 1)

        # Generate DNA sequence by selecting random codons
        # This ensures valid codon structure
        dna_seq = ''
        for _ in range(n_codons):
            codon = random.choice(coding_codons)
            dna_seq += codon

        # Translate to protein
        protein_seq = translate_dna_to_protein(dna_seq, genetic_code)

        # Only keep sequences with valid proteins
        if len(protein_seq) > 0:
            sequences.append({
                'dna_sequence': dna_seq,
                'protein_sequence': protein_seq,
                'dna_length': len(dna_seq),
                'protein_length': len(protein_seq)
            })

        if (i + 1) % 1000 == 0:
            print(f"   Generated {i + 1}/{n_sequences} sequences...")

    df = pd.DataFrame(sequences)
    print(f"\n   ‚úÖ Generated {len(df)} valid sequence pairs")
    print(f"   DNA length range: {df['dna_length'].min()} - {df['dna_length'].max()} nucleotides")
    print(f"   Protein length range: {df['protein_length'].min()} - {df['protein_length'].max()} amino acids")

    # Show examples
    print(f"\n   üìã Example sequences:")
    for idx in range(min(3, len(df))):
        dna_example = df.iloc[idx]['dna_sequence'][:60]
        protein_example = df.iloc[idx]['protein_sequence'][:20]
        print(f"      DNA: {dna_example}...")
        print(f"      Protein: {protein_example}...")
        print()

    return df

# Generate data - Increased to 20,000 sequences for better learning
df = generate_dna_protein_pairs(n_sequences=20000, min_codons=10, max_codons=100)


# save data

os.makedirs('data', exist_ok=True)
df.to_csv('data/dna_protein_pairs.csv', index=False)
print('Data saved to data/dna_protein_pairs.csv')



STEP 1: GENERATING SYNTHETIC DNA-PROTEIN PAIRS
   Generating 20000 sequence pairs...
   Using standard genetic code (61 coding codons)
   Generated 1000/20000 sequences...
   Generated 2000/20000 sequences...
   Generated 3000/20000 sequences...
   Generated 4000/20000 sequences...
   Generated 5000/20000 sequences...
   Generated 6000/20000 sequences...
   Generated 7000/20000 sequences...
   Generated 8000/20000 sequences...
   Generated 9000/20000 sequences...
   Generated 10000/20000 sequences...
   Generated 11000/20000 sequences...
   Generated 12000/20000 sequences...
   Generated 13000/20000 sequences...
   Generated 14000/20000 sequences...
   Generated 15000/20000 sequences...
   Generated 16000/20000 sequences...
   Generated 17000/20000 sequences...
   Generated 18000/20000 sequences...
   Generated 19000/20000 sequences...
   Generated 20000/20000 sequences...

   ‚úÖ Generated 20000 valid sequence pairs
   DNA length range: 30 - 300 nucleotides
   Protein length range: 1

In [4]:
df

Unnamed: 0,dna_sequence,protein_sequence,dna_length,protein_length
0,ACTATTGATTGCGCAGGAGTCACTATTGAGGGCAACGAACAATGGG...,TIDCAGVTIEGNEQWALITVNNAGIHLICASVH,99,33
1,TCGGTAAATCCGGAGTGTTCCGGCTGTACAGCGAGCACCTTGTCGC...,SVNPECSGCTASTLSPIYLNTYSTGPRGLCQIKMRGISQWAIPYGR...,270,90
2,CAGGTAAGGCACATCATTTACACCACCTTATGGTGC,QVRHIIYTTLWC,36,12
3,TTCAAGGACCGACTCCTAGGTGCTAGATGTGCGACTAATCCCACAG...,FKDRLLGARCATNPTGEHWSPKVLG,75,25
4,AACGCGATCATCATATTCACATGGAGCACAAAA,NAIIIFTWSTK,33,11
...,...,...,...,...
19995,AGACGCCATGTGATATCCGAGAATGTTAGACGGGAGCTATTTACAC...,RRHVISENVRRELFTHYIRVSSVSKALNPIRYASLNNFKAVHISQT...,204,68
19996,CGCAATGCAGTACTAGTACATAATGCTGGGACCCTCGCA,RNAVLVHNAGTLA,39,13
19997,CCCATTAACACCTCGCGACAGCATTGGTGTGCCAGAGCCCTATCGC...,PINTSRQHWCARALSPTSRLLRSALLMFIRVALTASYAFPPQHLIL...,297,99
19998,CGATCAATCCTCGCTACTATTTCTCCACCTGCTTCGCGCAGACGGG...,RSILATISPPASRRREIQPMVLPDQSRHGRRDFDTHFHEPENLGGD...,237,79


In [None]:
print("\n" + "=" * 70)
print("STEP 2: DATA PREPROCESSING AND ENCODING")
print("=" * 70)

def encode_sequences(sequences, vocab, max_length, add_start_end=False):
    """
    Encode sequences to integers

    Parameters:
    -----------
    sequences : list
        List of sequences (DNA or protein)
    vocab : dict
        Vocabulary mapping (char -> int)
    max_length : int
        Maximum sequence length (for padding)
    add_start_end : bool
        If True, add <START> at beginning and <END> at end (for protein sequences)

    Returns:
    --------
    encoded : np.array
        Encoded sequences (n_samples, max_length)
    """
    encoded = []
    for seq in sequences:
        seq_encoded = [vocab.get(char, vocab['<UNK>']) for char in seq]

        if add_start_end and '<START>' in vocab and '<END>' in vocab:
            seq_encoded = [vocab['<START>']] + seq_encoded + [vocab['<END>']]

        if len(seq_encoded) < max_length:
            seq_encoded += [vocab['<PAD>']] * (max_length - len(seq_encoded))
        else:
            seq_encoded = seq_encoded[:max_length]
        encoded.append(seq_encoded)
    return np.array(encoded)

print("\n   Creating vocabularies...")

dna_vocab = {'A': 0, 'T': 1, 'G': 2, 'C': 3, '<PAD>': 4, '<UNK>': 5}
print(f"   DNA vocab size: {len(dna_vocab)}")

amino_acids = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I',
               'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']
protein_vocab = {aa: i for i, aa in enumerate(amino_acids)}
protein_vocab['<PAD>'] = len(amino_acids)
protein_vocab['<UNK>'] = len(amino_acids) + 1
protein_vocab['<START>'] = len(amino_acids) + 2
protein_vocab['<END>'] = len(amino_acids) + 3
print(f"   Protein vocab size: {len(protein_vocab)}")

# Use 95th percentile to handle most sequences while avoiding outliers
max_dna_length = int(df['dna_length'].quantile(0.95))
max_protein_length = int(df['protein_length'].quantile(0.95))

# Round DNA length to nearest multiple of 3 to preserve codon boundaries
max_dna_length = ((max_dna_length // 3) + 1) * 3

max_protein_length_with_tokens = max_protein_length + 2

print(f"\n   Max DNA length (95th percentile): {max_dna_length}")
print(f"   Max protein length (95th percentile): {max_protein_length}")
print(f"   Max protein length (with START/END tokens): {max_protein_length_with_tokens}")

print("\n   Encoding sequences...")
X = encode_sequences(df['dna_sequence'].tolist(), dna_vocab, max_dna_length, add_start_end=False)
y = encode_sequences(df['protein_sequence'].tolist(), protein_vocab, max_protein_length_with_tokens, add_start_end=True)

print(f"   ‚úÖ Encoded {len(X)} sequences")
print(f"   DNA shape: {X.shape}")
print(f"   Protein shape: {y.shape}")

print("\n   Splitting data...")
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.2, random_state=42, shuffle=True
)

print(f"   ‚úÖ Training set: {len(X_train)} samples")
print(f"   ‚úÖ Validation set: {len(X_val)} samples")
print(f"   ‚úÖ Test set: {len(X_test)} samples")



STEP 2: DATA PREPROCESSING AND ENCODING

   Creating vocabularies...
   DNA vocab size: 6
   Protein vocab size: 24

   Max DNA length (95th percentile): 291
   Max protein length (95th percentile): 96
   Max protein length (with START/END tokens): 98

   Encoding sequences...
   ‚úÖ Encoded 20000 sequences
   DNA shape: (20000, 291)
   Protein shape: (20000, 98)
   Note: Protein sequences now include <START> and <END> tokens

   Splitting data...
   ‚úÖ Training set: 12800 samples
   ‚úÖ Validation set: 3200 samples
   ‚úÖ Test set: 4000 samples


In [6]:
print("\n" + "=" * 70)
print("CREATING DATASET AND DATALOADER")
print("=" * 70)

class DNAToProteinDataset(Dataset):
    """PyTorch Dataset for DNA-Protein pairs"""
    def __init__(self, dna_sequences, protein_sequences):
        self.dna_sequences = torch.LongTensor(dna_sequences)
        self.protein_sequences = torch.LongTensor(protein_sequences)

    def __len__(self):
        return len(self.dna_sequences)

    def __getitem__(self, idx):
        return self.dna_sequences[idx], self.protein_sequences[idx]

train_dataset = DNAToProteinDataset(X_train, y_train)
val_dataset = DNAToProteinDataset(X_val, y_val)
test_dataset = DNAToProteinDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

print(f"   ‚úÖ Created datasets")
print(f"   Training batches: {len(train_loader)}")
print(f"   Validation batches: {len(val_loader)}")
print(f"   Test batches: {len(test_loader)}")


CREATING DATASET AND DATALOADER
   ‚úÖ Created datasets
   Training batches: 400
   Validation batches: 100
   Test batches: 125


In [7]:
print("\n" + "=" * 70)
print("STEP 4: DEFINING ENCODER-DECODER MODEL")
print("=" * 70)


STEP 4: DEFINING ENCODER-DECODER MODEL


In [None]:

# ============================================================================
# MODEL 1: LSTM Encoder-Decoder
# ============================================================================
class DNAProteinTranslatorLSTM(nn.Module):
    """
    LSTM-based Encoder-Decoder model for DNA to Protein translation

    Architecture:
    - Encoder: Bidirectional LSTM that processes DNA sequence
    - Decoder: LSTM that generates protein sequence
    """
    def __init__(self, dna_vocab_size, protein_vocab_size, embedding_dim=64,
                 hidden_dim=128, num_layers=2):
        super(DNAProteinTranslatorLSTM, self).__init__()

        self.protein_vocab_size = protein_vocab_size

        # Encoder
        self.dna_embedding = nn.Embedding(dna_vocab_size, embedding_dim)
        self.encoder_lstm = nn.LSTM(
            embedding_dim, hidden_dim, num_layers,
            batch_first=True, bidirectional=True
        )

        # Decoder
        self.protein_embedding = nn.Embedding(protein_vocab_size, embedding_dim)
        self.decoder_lstm = nn.LSTM(
            embedding_dim, hidden_dim * 2, num_layers,  # *2 because encoder is bidirectional
            batch_first=True
        )
        self.fc = nn.Linear(hidden_dim * 2, protein_vocab_size)

    def forward(self, dna_seq, protein_seq=None, max_length=100):
        """
        Forward pass

        Parameters:
        -----------
        dna_seq : torch.Tensor
            DNA sequence (batch, seq_len)
        protein_seq : torch.Tensor, optional
            Target protein sequence (for training)
        max_length : int
            Maximum protein length (for inference)
        """
        batch_size = dna_seq.size(0)

        # Encode DNA
        dna_embedded = self.dna_embedding(dna_seq)
        encoder_out, (hidden, cell) = self.encoder_lstm(dna_embedded)

        # Prepare decoder hidden state
        # For bidirectional LSTM: hidden shape is (num_layers * 2, batch, hidden_dim)
        # We need to concatenate forward and backward for each layer
        # Then reshape to (num_layers, batch, hidden_dim * 2)
        num_layers = hidden.size(0) // 2
        batch_size = hidden.size(1)
        hidden_dim = hidden.size(2)

        # Reshape: (num_layers * 2, batch, hidden_dim) -> (num_layers, batch, hidden_dim * 2)
        hidden_forward = hidden[:num_layers]  # Forward direction
        hidden_backward = hidden[num_layers:]  # Backward direction
        hidden = torch.cat([hidden_forward, hidden_backward], dim=2)

        cell_forward = cell[:num_layers]
        cell_backward = cell[num_layers:]
        cell = torch.cat([cell_forward, cell_backward], dim=2)

        if self.training and protein_seq is not None:
            # Teacher forcing: use actual protein sequence during training
            protein_embedded = self.protein_embedding(protein_seq)
            decoder_out, _ = self.decoder_lstm(protein_embedded, (hidden, cell))
            output = self.fc(decoder_out)
            return output
        else:
            # Inference: generate one token at a time
            # START token is at index protein_vocab_size - 2, END token is at protein_vocab_size - 1
            start_token_id = self.protein_vocab_size - 2
            end_token_id = self.protein_vocab_size - 1

            outputs = []
            input_token = torch.full((batch_size, 1), start_token_id, dtype=torch.long).to(dna_seq.device)

            for step in range(max_length):
                protein_embedded = self.protein_embedding(input_token)
                decoder_out, (hidden, cell) = self.decoder_lstm(
                    protein_embedded, (hidden, cell)
                )
                output = self.fc(decoder_out)
                outputs.append(output)

                # Get predicted token
                predicted = torch.argmax(output, dim=-1)

                # Early stopping: check if all sequences in batch have generated END token
                if torch.all(predicted == end_token_id):
                    break

                input_token = predicted

            return torch.cat(outputs, dim=1)

In [None]:
# MODEL 2: RNN Encoder-Decoder
# ============================================================================
class DNAProteinTranslatorRNN(nn.Module):
    """
    RNN-based Encoder-Decoder model for DNA to Protein translation

    Architecture:
    - Encoder: Bidirectional RNN that processes DNA sequence
    - Decoder: RNN that generates protein sequence
    """
    def __init__(self, dna_vocab_size, protein_vocab_size, embedding_dim=64,
                 hidden_dim=128, num_layers=2):
        super(DNAProteinTranslatorRNN, self).__init__()

        self.protein_vocab_size = protein_vocab_size

        # Encoder
        self.dna_embedding = nn.Embedding(dna_vocab_size, embedding_dim)
        self.encoder_rnn = nn.RNN(
            embedding_dim, hidden_dim, num_layers,
            batch_first=True, bidirectional=True
        )

        # Decoder
        self.protein_embedding = nn.Embedding(protein_vocab_size, embedding_dim)
        self.decoder_rnn = nn.RNN(
            embedding_dim, hidden_dim * 2, num_layers,  # *2 because encoder is bidirectional
            batch_first=True
        )
        self.fc = nn.Linear(hidden_dim * 2, protein_vocab_size)

    def forward(self, dna_seq, protein_seq=None, max_length=100):
        batch_size = dna_seq.size(0)

        # Encode DNA
        dna_embedded = self.dna_embedding(dna_seq)
        encoder_out, hidden = self.encoder_rnn(dna_embedded)

        # Prepare decoder hidden state
        num_layers = hidden.size(0) // 2
        hidden_forward = hidden[:num_layers]
        hidden_backward = hidden[num_layers:]
        hidden = torch.cat([hidden_forward, hidden_backward], dim=2)

        if self.training and protein_seq is not None:
            # Teacher forcing
            protein_embedded = self.protein_embedding(protein_seq)
            decoder_out, _ = self.decoder_rnn(protein_embedded, hidden)
            output = self.fc(decoder_out)
            return output
        else:
            # Inference: generate one token at a time
            # START token is at index protein_vocab_size - 2, END token is at protein_vocab_size - 1
            start_token_id = self.protein_vocab_size - 2
            end_token_id = self.protein_vocab_size - 1

            outputs = []
            input_token = torch.full((batch_size, 1), start_token_id, dtype=torch.long).to(dna_seq.device)

            for step in range(max_length):
                protein_embedded = self.protein_embedding(input_token)
                decoder_out, hidden = self.decoder_rnn(protein_embedded, hidden)
                output = self.fc(decoder_out)
                outputs.append(output)

                predicted = torch.argmax(output, dim=-1)

                # Early stopping: check if all sequences in batch have generated END token
                if torch.all(predicted == end_token_id):
                    break

                input_token = predicted

            return torch.cat(outputs, dim=1)

In [None]:
# MODEL 3: GRU Encoder-Decoder
# ============================================================================
class DNAProteinTranslatorGRU(nn.Module):
    """
    GRU-based Encoder-Decoder model for DNA to Protein translation

    Architecture:
    - Encoder: Bidirectional GRU that processes DNA sequence
    - Decoder: GRU that generates protein sequence
    """
    def __init__(self, dna_vocab_size, protein_vocab_size, embedding_dim=64,
                 hidden_dim=128, num_layers=2):
        super(DNAProteinTranslatorGRU, self).__init__()

        self.protein_vocab_size = protein_vocab_size

        # Encoder
        self.dna_embedding = nn.Embedding(dna_vocab_size, embedding_dim)
        self.encoder_gru = nn.GRU(
            embedding_dim, hidden_dim, num_layers,
            batch_first=True, bidirectional=True
        )

        # Decoder
        self.protein_embedding = nn.Embedding(protein_vocab_size, embedding_dim)
        self.decoder_gru = nn.GRU(
            embedding_dim, hidden_dim * 2, num_layers,  # *2 because encoder is bidirectional
            batch_first=True
        )
        self.fc = nn.Linear(hidden_dim * 2, protein_vocab_size)

    def forward(self, dna_seq, protein_seq=None, max_length=100):
        batch_size = dna_seq.size(0)

        # Encode DNA
        dna_embedded = self.dna_embedding(dna_seq)
        encoder_out, hidden = self.encoder_gru(dna_embedded)

        # Prepare decoder hidden state
        num_layers = hidden.size(0) // 2
        hidden_forward = hidden[:num_layers]
        hidden_backward = hidden[num_layers:]
        hidden = torch.cat([hidden_forward, hidden_backward], dim=2)

        if self.training and protein_seq is not None:
            # Teacher forcing
            protein_embedded = self.protein_embedding(protein_seq)
            decoder_out, _ = self.decoder_gru(protein_embedded, hidden)
            output = self.fc(decoder_out)
            return output
        else:
            # Inference: generate one token at a time
            # START token is at index protein_vocab_size - 2, END token is at protein_vocab_size - 1
            start_token_id = self.protein_vocab_size - 2
            end_token_id = self.protein_vocab_size - 1

            outputs = []
            input_token = torch.full((batch_size, 1), start_token_id, dtype=torch.long).to(dna_seq.device)

            for step in range(max_length):
                protein_embedded = self.protein_embedding(input_token)
                decoder_out, hidden = self.decoder_gru(protein_embedded, hidden)
                output = self.fc(decoder_out)
                outputs.append(output)

                predicted = torch.argmax(output, dim=-1)

                # Early stopping: check if all sequences in batch have generated END token
                if torch.all(predicted == end_token_id):
                    break

                input_token = predicted

            return torch.cat(outputs, dim=1)

In [None]:
# MODEL 4: Transformer Encoder-Decoder
# ============================================================================
class DNAProteinTranslatorTransformer(nn.Module):
    """
    Transformer-based Encoder-Decoder model for DNA to Protein translation

    Architecture:
    - Encoder: Transformer Encoder with self-attention
    - Decoder: Transformer Decoder with self-attention and cross-attention
    """
    def __init__(self, dna_vocab_size, protein_vocab_size, embedding_dim=128,
                 num_heads=8, num_layers=3, dim_feedforward=512, max_seq_length=300):
        super(DNAProteinTranslatorTransformer, self).__init__()

        self.protein_vocab_size = protein_vocab_size
        self.embedding_dim = embedding_dim
        self.max_seq_length = max_seq_length

        # Encoder
        self.dna_embedding = nn.Embedding(dna_vocab_size, embedding_dim)
        self.pos_encoder = nn.Parameter(torch.randn(1, max_seq_length, embedding_dim))

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embedding_dim,
            nhead=num_heads,
            dim_feedforward=dim_feedforward,
            batch_first=True
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # Decoder
        self.protein_embedding = nn.Embedding(protein_vocab_size, embedding_dim)
        self.pos_decoder = nn.Parameter(torch.randn(1, max_seq_length, embedding_dim))

        decoder_layer = nn.TransformerDecoderLayer(
            d_model=embedding_dim,
            nhead=num_heads,
            dim_feedforward=dim_feedforward,
            batch_first=True
        )
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)

        self.fc = nn.Linear(embedding_dim, protein_vocab_size)

    def forward(self, dna_seq, protein_seq=None, max_length=100):
        batch_size = dna_seq.size(0)
        dna_len = dna_seq.size(1)

        # Encode DNA
        dna_embedded = self.dna_embedding(dna_seq)
        dna_embedded = dna_embedded + self.pos_encoder[:, :dna_len, :]
        encoder_out = self.encoder(dna_embedded)

        if self.training and protein_seq is not None:
            # Teacher forcing
            protein_len = protein_seq.size(1)
            protein_embedded = self.protein_embedding(protein_seq)
            protein_embedded = protein_embedded + self.pos_decoder[:, :protein_len, :]

            # Create causal mask for decoder (prevents looking at future tokens)
            tgt_mask = nn.Transformer.generate_square_subsequent_mask(protein_len).to(dna_seq.device)

            decoder_out = self.decoder(
                protein_embedded, encoder_out,
                tgt_mask=tgt_mask
            )
            output = self.fc(decoder_out)
            return output
        else:
            # Inference: generate one token at a time
            # START token is at index protein_vocab_size - 2, END token is at protein_vocab_size - 1
            start_token_id = self.protein_vocab_size - 2
            end_token_id = self.protein_vocab_size - 1

            outputs = []
            input_sequence = torch.full((batch_size, 1), start_token_id, dtype=torch.long).to(dna_seq.device)
            all_finished = torch.zeros(batch_size, dtype=torch.bool).to(dna_seq.device)

            for step in range(max_length):
                # Embed the current sequence
                protein_embedded = self.protein_embedding(input_sequence)
                # Add positional encoding for the current sequence length
                seq_len = input_sequence.size(1)
                protein_embedded = protein_embedded + self.pos_decoder[:, :seq_len, :]

                # Create causal mask for current sequence length
                tgt_mask = nn.Transformer.generate_square_subsequent_mask(seq_len).to(dna_seq.device)

                # Decode
                decoder_out = self.decoder(
                    protein_embedded, encoder_out,
                    tgt_mask=tgt_mask
                )
                output = self.fc(decoder_out)
                # Get prediction for the last position
                outputs.append(output[:, -1:, :])

                # Get predicted token and append to sequence
                predicted = torch.argmax(output[:, -1:, :], dim=-1)

                # Track which sequences have finished (predicted END token)
                all_finished = all_finished | (predicted.squeeze(1) == end_token_id)

                # Early stopping: if all sequences in batch have generated END token
                if torch.all(all_finished):
                    break

                input_sequence = torch.cat([input_sequence, predicted], dim=1)

            return torch.cat(outputs, dim=1)


In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"\n   Using device: {device}")


   Using device: cuda


In [None]:
print("\n" + "=" * 70)
print(" TRAINING MULTIPLE MODELS AND COMPARISON")
print("=" * 70)

def decode_sequence(encoded_seq, vocab):
    """Decode encoded sequence back to string"""
    reverse_vocab = {v: k for k, v in vocab.items()}
    seq = ''.join([reverse_vocab.get(int(idx), '') for idx in encoded_seq])
    seq = seq.replace('<PAD>', '').replace('<UNK>', '').replace('<START>', '').replace('<END>', '')
    return seq

model_configs = {
    'RNN': {
        'class': DNAProteinTranslatorRNN,
        'embedding_dim': 128,
        'hidden_dim': 256,
        'num_layers': 3
    },
    'LSTM': {
        'class': DNAProteinTranslatorLSTM,
        'embedding_dim': 128,
        'hidden_dim': 256,
        'num_layers': 3
    },
    'GRU': {
        'class': DNAProteinTranslatorGRU,
        'embedding_dim': 128,
        'hidden_dim': 256,
        'num_layers': 3
    },
    'Transformer': {
        'class': DNAProteinTranslatorTransformer,
        'embedding_dim': 256,
        'num_heads': 8,
        'num_layers': 4,
        'dim_feedforward': 1024,
        'max_seq_length': max_dna_length
    }
}



 TRAINING MULTIPLE MODELS AND COMPARISON


In [None]:
criterion = nn.CrossEntropyLoss(ignore_index=protein_vocab['<PAD>'])
num_epochs = 50
models = {}
results = {}

class EarlyStopping:
    """
    Early stopping to stop training when validation loss doesn't improve.

    Parameters:
    -----------
    patience : int
        Number of epochs to wait for improvement before stopping
    min_delta : float
        Minimum change to qualify as an improvement
    restore_best_weights : bool
        If True, restore model weights from best epoch
    verbose : bool
        If True, print messages when patience counter updates
    """
    def __init__(self, patience=10, min_delta=0.001, restore_best_weights=True, verbose=True):
        self.patience = patience
        self.min_delta = min_delta
        self.restore_best_weights = restore_best_weights
        self.verbose = verbose
        self.best_score = None
        self.counter = 0
        self.early_stop = False
        self.best_weights = None

    def __call__(self, val_loss, model):
        """
        Check if early stopping should be triggered.

        Parameters:
        -----------
        val_loss : float
            Current validation loss
        model : nn.Module
            Model to save weights from
        """
        score = -val_loss  # Negative because lower loss is better

        if self.best_score is None:
            # First validation
            self.best_score = score
            self.save_checkpoint(model)
        elif score < self.best_score + self.min_delta:
            # No improvement
            self.counter += 1
            if self.verbose:
                print(f'   ‚ö†Ô∏è  EarlyStopping counter: {self.counter}/{self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            # Improvement found
            self.best_score = score
            self.save_checkpoint(model)
            self.counter = 0

    def save_checkpoint(self, model):
        """Save model weights when improvement is found."""
        if self.restore_best_weights:
            self.best_weights = {k: v.cpu().clone() for k, v in model.state_dict().items()}

    def load_best_weights(self, model):
        """Restore model weights from best epoch."""
        if self.restore_best_weights and self.best_weights:
            model.load_state_dict(self.best_weights)
            if self.verbose:
                print(f'   ‚úÖ Restored best model weights (val_loss: {-self.best_score:.4f})')

In [None]:
# Train each model
for model_name, config in model_configs.items():
    print(f"\n{'='*70}")
    print(f"Training {model_name} Model")
    print(f"{'='*70}")

    # Initialize model
    if model_name == 'Transformer':
        model = config['class'](
            dna_vocab_size=len(dna_vocab),
            protein_vocab_size=len(protein_vocab),
            embedding_dim=config['embedding_dim'],
            num_heads=config['num_heads'],
            num_layers=config['num_layers'],
            dim_feedforward=config['dim_feedforward'],
            max_seq_length=config['max_seq_length']
        ).to(device)
    else:
        model = config['class'](
            dna_vocab_size=len(dna_vocab),
            protein_vocab_size=len(protein_vocab),
            embedding_dim=config['embedding_dim'],
            hidden_dim=config['hidden_dim'],
            num_layers=config['num_layers']
        ).to(device)

    optimizer = optim.Adam(model.parameters(), lr=0.001)

    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=3, min_lr=1e-6
    )

    early_stopping = EarlyStopping(patience=10, min_delta=0.001, verbose=True)

    print(f"   Total parameters: {sum(p.numel() for p in model.parameters()):,}")
    print(f"   Training for up to {num_epochs} epochs (with early stopping)...")

    # Training loop
    for epoch in range(num_epochs):
        # Training phase
        model.train()
        total_loss = 0

        for dna_seq, protein_seq in train_loader:
            dna_seq = dna_seq.to(device)
            protein_seq = protein_seq.to(device)

            optimizer.zero_grad()
            output = model(dna_seq, protein_seq)

            # Shift target by 1 since output[i] predicts token at position i+1
            seq_len = min(output.size(1), protein_seq.size(1))
            output_shifted = output[:, :seq_len-1, :].contiguous()
            target_shifted = protein_seq[:, 1:seq_len].contiguous()

            output_flat = output_shifted.view(-1, output_shifted.size(-1))
            target_flat = target_shifted.view(-1)

            loss = criterion(output_flat, target_flat)
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)

        model.eval()
        val_loss = 0
        with torch.no_grad():
            for dna_seq, protein_seq in val_loader:
                dna_seq = dna_seq.to(device)
                protein_seq = protein_seq.to(device)

                output = model(dna_seq, protein_seq)

                seq_len = min(output.size(1), protein_seq.size(1))
                output_shifted = output[:, :seq_len-1, :].contiguous()
                target_shifted = protein_seq[:, 1:seq_len].contiguous()

                output_flat = output_shifted.view(-1, output_shifted.size(-1))
                target_flat = target_shifted.view(-1)

                loss = criterion(output_flat, target_flat)
                val_loss += loss.item()

        avg_val_loss = val_loss / len(val_loader)

        scheduler.step(avg_val_loss)
        current_lr = optimizer.param_groups[0]['lr']

        early_stopping(avg_val_loss, model)

        print(f'   Epoch [{epoch+1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, LR: {current_lr:.6f}')

        if early_stopping.early_stop:
            print(f'   ‚õî Early stopping triggered at epoch {epoch+1}')
            break

    early_stopping.load_best_weights(model)

    print(f"\n   Evaluating {model_name} on test set...")
    model.eval()
    test_loss = 0
    correct_predictions = 0
    total_tokens = 0

    with torch.no_grad():
        for dna_seq, protein_seq in test_loader:
            dna_seq = dna_seq.to(device)
            protein_seq = protein_seq.to(device)

            output = model(dna_seq, max_length=max_protein_length_with_tokens)

            target_shifted = protein_seq[:, 1:].contiguous()

            min_len = min(output.size(1), target_shifted.size(1))
            output = output[:, :min_len, :]
            target_shifted = target_shifted[:, :min_len]

            output_flat = output.contiguous().view(-1, output.size(-1))
            target_flat = target_shifted.contiguous().view(-1)

            loss = criterion(output_flat, target_flat)
            test_loss += loss.item()

            pred_tokens = torch.argmax(output_flat, dim=-1)
            mask = (target_flat != protein_vocab['<PAD>']) & (target_flat != protein_vocab['<START>'])
            correct_predictions += (pred_tokens[mask] == target_flat[mask]).sum().item()
            total_tokens += mask.sum().item()

    avg_test_loss = test_loss / len(test_loader)
    accuracy = correct_predictions / total_tokens if total_tokens > 0 else 0

    models[model_name] = model
    results[model_name] = {
        'test_loss': avg_test_loss,
        'accuracy': accuracy,
        'correct': correct_predictions,
        'total': total_tokens
    }

    print(f"   ‚úÖ {model_name} training complete!")
    print(f"   Test Loss: {avg_test_loss:.4f}")
    print(f"   Accuracy: {accuracy*100:.2f}% ({correct_predictions}/{total_tokens} tokens)")


Training RNN Model
   Total parameters: 2,381,592
   Training for up to 50 epochs (with early stopping)...
   Epoch [1/50], Train Loss: 2.7796, Val Loss: 2.7118, LR: 0.001000
   Epoch [2/50], Train Loss: 2.6951, Val Loss: 2.6727, LR: 0.001000
   Epoch [3/50], Train Loss: 2.6688, Val Loss: 2.6595, LR: 0.001000
   ‚ö†Ô∏è  EarlyStopping counter: 1/10
   Epoch [4/50], Train Loss: 2.6587, Val Loss: 2.6639, LR: 0.001000
   Epoch [5/50], Train Loss: 2.6560, Val Loss: 2.6491, LR: 0.001000
   Epoch [6/50], Train Loss: 2.6463, Val Loss: 2.6344, LR: 0.001000
   Epoch [7/50], Train Loss: 2.6331, Val Loss: 2.6330, LR: 0.001000
   Epoch [8/50], Train Loss: 2.6260, Val Loss: 2.6226, LR: 0.001000
   Epoch [9/50], Train Loss: 2.6190, Val Loss: 2.6126, LR: 0.001000
   ‚ö†Ô∏è  EarlyStopping counter: 1/10
   Epoch [10/50], Train Loss: 2.6281, Val Loss: 2.6169, LR: 0.001000
   ‚ö†Ô∏è  EarlyStopping counter: 2/10
   Epoch [11/50], Train Loss: 2.6152, Val Loss: 2.6176, LR: 0.001000
   ‚ö†Ô∏è  EarlyStopping 

In [17]:
# Compare models
print("\n" + "=" * 70)
print("MODEL COMPARISON RESULTS")
print("=" * 70)
print(f"\n{'Model':<15} {'Test Loss':<15} {'Accuracy':<15} {'Correct/Total':<20}")
print("-" * 65)
for model_name, result in results.items():
    print(f"{model_name:<15} {result['test_loss']:<15.4f} {result['accuracy']*100:<15.2f}% {result['correct']}/{result['total']:<15}")



MODEL COMPARISON RESULTS

Model           Test Loss       Accuracy        Correct/Total       
-----------------------------------------------------------------
RNN             2.5753          20.16          % 45258/224480         
LSTM            1.3033          64.43          % 144626/224480         
GRU             2.0885          49.30          % 110677/224480         
Transformer     2.9073          9.65           % 21655/224480         


In [18]:
# Find best model
best_model_name = min(results.keys(), key=lambda x: results[x]['test_loss'])
print(f"\nüèÜ Best Model: {best_model_name}")
print(f"   Test Loss: {results[best_model_name]['test_loss']:.4f}")
print(f"   Accuracy: {results[best_model_name]['accuracy']*100:.2f}%")


üèÜ Best Model: LSTM
   Test Loss: 1.3033
   Accuracy: 64.43%


In [19]:
# Show example predictions from best model
print(f"\nüìã Example predictions from {best_model_name}:")
best_model = models[best_model_name]
best_model.eval()
with torch.no_grad():
    sample_indices = [0, 1, 2]
    for idx in sample_indices:
        test_dna = X_test[idx:idx+1]
        test_protein = y_test[idx:idx+1]
        test_dna_tensor = torch.LongTensor(test_dna).to(device)
        predictions = best_model(test_dna_tensor, max_length=max_protein_length_with_tokens)
        pred_tokens = torch.argmax(predictions, dim=-1).cpu().numpy()[0]

        true_protein = decode_sequence(test_protein[0], protein_vocab)
        pred_protein = decode_sequence(pred_tokens, protein_vocab)

        print(f"\n   Example {idx + 1}:")
        print(f"   True:  {true_protein[:60]}...")
        print(f"   Pred:  {pred_protein[:60]}...")

print("\n   ‚úÖ All models trained and compared!")


üìã Example predictions from LSTM:

   Example 1:
   True:  INRAASSFRSQYDGTAGSDDCRELSRSGFSIAIWIRLVYDLLKQRFRRPADALITQKVYY...
   Pred:  INRASSAFRSQYDGTASDANCRELSRGSGSIIAWIFLMDVLLKQRFREPANALITQKVYT...

   Example 2:
   True:  LCFPILEHPAPQVASIGATLSPEERLTACTPILFHARLNAI...
   Pred:  LCFPILEHPAPQVASIGATLSSERLETACTPILFHARANLRN...

   Example 3:
   True:  GQFRWLAEGLQSIRKYCRLTTALISKTMTLKPYTLLRSVRRWHPHYAIGNGRHRLCTQE...
   Pred:  GQFGWLAELLQSIRKRYRTLTAILSKTMTKLPYTLLRSVRWPHRYRAIGNGRHWLCTQEQ...

   ‚úÖ All models trained and compared!


In [None]:
print("\n" + "=" * 70)
print("STEP 6: SAVING BEST MODEL")
print("=" * 70)

os.makedirs('model', exist_ok=True)

best_model = models[best_model_name]
model_path = f'model/dna_protein_translator_{best_model_name.lower()}.pth'
if best_model_name == 'Transformer':
    model_config = {
        'model_type': 'Transformer',
        'embedding_dim': model_configs[best_model_name]['embedding_dim'],
        'num_heads': model_configs[best_model_name]['num_heads'],
        'num_layers': model_configs[best_model_name]['num_layers'],
        'dim_feedforward': model_configs[best_model_name]['dim_feedforward'],
        'max_seq_length': model_configs[best_model_name]['max_seq_length']
    }
else:
    model_config = {
        'model_type': best_model_name,
        'embedding_dim': model_configs[best_model_name]['embedding_dim'],
        'hidden_dim': model_configs[best_model_name]['hidden_dim'],
        'num_layers': model_configs[best_model_name]['num_layers']
    }

torch.save({
    'model_state_dict': best_model.state_dict(),
    'model_type': best_model_name,
    'dna_vocab': dna_vocab,
    'protein_vocab': protein_vocab,
    'max_dna_length': max_dna_length,
    'max_protein_length': max_protein_length,
    'max_protein_length_with_tokens': max_protein_length_with_tokens,
    'model_config': model_config,
    'results': results
}, model_path)

print(f"   ‚úÖ Best model ({best_model_name}) saved to: {model_path}")
print(f"   All model results saved in checkpoint")



STEP 6: SAVING BEST MODEL
   ‚úÖ Best model (LSTM) saved to: model/dna_protein_translator_lstm.pth
   All model results saved in checkpoint


In [None]:
# INFERENCE FUNCTION
# ============================================================================

def predict_protein(dna_sequence, model, dna_vocab, protein_vocab,
                    max_dna_length, max_protein_length_with_tokens, device='cpu'):
    """
    Predict protein sequence from DNA sequence

    Parameters:
    -----------
    dna_sequence : str
        DNA sequence (A, T, G, C)
    model : DNAProteinTranslator
        Trained model
    dna_vocab : dict
        DNA vocabulary
    protein_vocab : dict
        Protein vocabulary
    max_dna_length : int
        Maximum DNA length
    max_protein_length_with_tokens : int
        Maximum protein length (including START/END tokens)
    device : str
        Device to run inference on

    Returns:
    --------
    protein_sequence : str
        Predicted protein sequence (without START/END tokens)
    """
    model.eval()

    dna_encoded = [dna_vocab.get(char, dna_vocab['<UNK>']) for char in dna_sequence.upper()]
    if len(dna_encoded) < max_dna_length:
        dna_encoded += [dna_vocab['<PAD>']] * (max_dna_length - len(dna_encoded))
    else:
        dna_encoded = dna_encoded[:max_dna_length]

    dna_tensor = torch.LongTensor([dna_encoded]).to(device)

    with torch.no_grad():
        output = model(dna_tensor, max_length=max_protein_length_with_tokens)
        pred_tokens = torch.argmax(output, dim=-1).cpu().numpy()[0]

    reverse_protein_vocab = {v: k for k, v in protein_vocab.items()}
    protein_sequence = ''.join([reverse_protein_vocab.get(int(idx), '')
                                for idx in pred_tokens])

    protein_sequence = protein_sequence.replace('<PAD>', '').replace('<UNK>', '')
    protein_sequence = protein_sequence.replace('<START>', '').replace('<END>', '')

    return protein_sequence

print("\n" + "=" * 70)
print("STEP 7: TESTING INFERENCE")
print("=" * 70)

checkpoint = torch.load(model_path, map_location=device)

try:
    checkpoint_max_dna_length = checkpoint.get('max_dna_length', max_dna_length)
    checkpoint_max_protein_length_with_tokens = checkpoint.get('max_protein_length_with_tokens', max_protein_length_with_tokens)
    checkpoint_dna_vocab = checkpoint.get('dna_vocab', dna_vocab)
    checkpoint_protein_vocab = checkpoint.get('protein_vocab', protein_vocab)
except NameError:
    checkpoint_max_dna_length = checkpoint['max_dna_length']
    checkpoint_max_protein_length_with_tokens = checkpoint['max_protein_length_with_tokens']
    checkpoint_dna_vocab = checkpoint['dna_vocab']
    checkpoint_protein_vocab = checkpoint['protein_vocab']
model_type = checkpoint['model_type']
model_config = checkpoint['model_config']

if model_type == 'Transformer':
    inference_model = DNAProteinTranslatorTransformer(
        dna_vocab_size=len(checkpoint_dna_vocab),
        protein_vocab_size=len(checkpoint_protein_vocab),
        embedding_dim=model_config['embedding_dim'],
        num_heads=model_config['num_heads'],
        num_layers=model_config['num_layers'],
        dim_feedforward=model_config['dim_feedforward'],
        max_seq_length=model_config['max_seq_length']
    ).to(device)
elif model_type == 'LSTM':
    inference_model = DNAProteinTranslatorLSTM(
        dna_vocab_size=len(checkpoint_dna_vocab),
        protein_vocab_size=len(checkpoint_protein_vocab),
        embedding_dim=model_config['embedding_dim'],
        hidden_dim=model_config['hidden_dim'],
        num_layers=model_config['num_layers']
    ).to(device)
elif model_type == 'RNN':
    inference_model = DNAProteinTranslatorRNN(
        dna_vocab_size=len(checkpoint_dna_vocab),
        protein_vocab_size=len(checkpoint_protein_vocab),
        embedding_dim=model_config['embedding_dim'],
        hidden_dim=model_config['hidden_dim'],
        num_layers=model_config['num_layers']
    ).to(device)
elif model_type == 'GRU':
    inference_model = DNAProteinTranslatorGRU(
        dna_vocab_size=len(checkpoint_dna_vocab),
        protein_vocab_size=len(checkpoint_protein_vocab),
        embedding_dim=model_config['embedding_dim'],
        hidden_dim=model_config['hidden_dim'],
        num_layers=model_config['num_layers']
    ).to(device)

inference_model.load_state_dict(checkpoint['model_state_dict'])
inference_model.eval()

print(f"   ‚úÖ Loaded {model_type} model from checkpoint")

test_indices = [0, 1, 2]
print("\n   Testing inference on sample sequences:")
for idx in test_indices:
    test_dna = df.iloc[idx]['dna_sequence']
    true_protein = df.iloc[idx]['protein_sequence']
    pred_protein = predict_protein(
        test_dna, inference_model, checkpoint_dna_vocab, checkpoint_protein_vocab,
        checkpoint_max_dna_length, checkpoint_max_protein_length_with_tokens, device
    )

    print(f"\n   Example {idx + 1}:")
    print(f"   DNA:     {test_dna[:60]}...")
    print(f"   True:    {true_protein[:50]}...")
    print(f"   Pred:    {pred_protein[:50]}...")


STEP 7: TESTING INFERENCE
   ‚úÖ Loaded LSTM model from checkpoint

   Testing inference on sample sequences:

   Example 1:
   DNA:     ACTATTGATTGCGCAGGAGTCACTATTGAGGGCAACGAACAATGGGCTCTAATCACCGTC...
   True:    TIDCAGVTIEGNEQWALITVNNAGIHLICASVH...
   Pred:    TIDCAGVTIDEKGQWEAILVNTAGIHLICASVH...

   Example 2:
   DNA:     TCGGTAAATCCGGAGTGTTCCGGCTGTACAGCGAGCACCTTGTCGCCCATTTACTTAAAC...
   True:    SVNPECSGCTASTLSPIYLNTYSTGPRGLCQIKMRGISQWAIPYGRSTQD...
   Pred:    SVNPECSGCATSSLTPYNYYTLSGIRGLCQIKIIRGSLQMAIGGISTPQD...

   Example 3:
   DNA:     CAGGTAAGGCACATCATTTACACCACCTTATGGTGC...
   True:    QVRHIIYTTLWC...
   Pred:    QVRHIIYTHLWC...


In [22]:
print("\n" + "=" * 70)
print("‚úÖ PROJECT 5 COMPLETE!")
print("=" * 70)
print("\n   Model saved and ready for use!")
print(f"   Model path: {model_path}")
print("\n   To use the model for new predictions:")
print("   1. Load the checkpoint")
print("   2. Use predict_protein() function with your DNA sequence")


‚úÖ PROJECT 5 COMPLETE!

   Model saved and ready for use!
   Model path: model/dna_protein_translator_lstm.pth

   To use the model for new predictions:
   1. Load the checkpoint
   2. Use predict_protein() function with your DNA sequence
