# RNA Secondary Structure Prediction (EDA & Training)
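This notebook demonstrates RNA sequence data processing, exploratory data analysis, and the training of a CNN model that predicts, for every nucleotide, whether it is paired or unpaired. The training labels are derived from secondary structures written in dot-bracket notation.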

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import os


## Load RNA Data

In [None]:
def parse_three_line_txt(file_path):
    """Parse a text file of three-line RNA records.

    A record consists of a header line starting with ``>``, a sequence line
    and a dot-bracket structure line. Blank lines are ignored and the parser
    resynchronises on the next header after a malformed record, so a single
    stray line no longer causes every following record to be dropped.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of ``(sequence, structure)`` tuples. The sequence is
        upper-cased. Only records whose sequence consists solely of
        A/U/G/C and has the same length as its structure are kept.

    Raises:
        OSError: If the file cannot be opened.
    """
    valid_bases = {'A', 'U', 'G', 'C'}

    with open(file_path, 'r') as f:
        # Strip every line and drop empty ones up front: blank separators must
        # not shift the header/sequence/structure alignment, and they must not
        # be mistaken for an (empty) sequence or structure.
        lines = [text for text in (raw.strip() for raw in f) if text]

    data = []
    i = 0
    while i + 2 < len(lines):
        if lines[i].startswith('>'):
            seq = lines[i + 1].upper()
            struct = lines[i + 2]
            if set(seq) <= valid_bases and len(seq) == len(struct):
                data.append((seq, struct))
                i += 3
                continue
        # Not a header, or a header without a valid record behind it: move on
        # one line so that the next real header is still found.
        i += 1
    return data

# Load the sample records as a list of (sequence, structure) tuples.
data = parse_three_line_txt("data/sample_bprna_100.txt")
print(f"Loaded {len(data)} RNA sequences")
# Show the first two records as a quick sanity check.
print(data[:2])


## Encode RNA Sequences

In [None]:
def one_hot_encode(seq):
    """Return a ``(len(seq), 4)`` one-hot matrix for an RNA sequence.

    Columns are ordered A, U, G, C. Any symbol other than these four
    upper-case letters leaves its row filled with zeros.
    """
    column_of = {'A': 0, 'U': 1, 'G': 2, 'C': 3}
    encoded = np.zeros((len(seq), 4))
    for row, nucleotide in enumerate(seq):
        column = column_of.get(nucleotide)
        if column is None:
            # Unknown symbol: keep the all-zero row.
            continue
        encoded[row, column] = 1
    return encoded

def structure_to_labels(dotbracket):
    """Convert a dot-bracket string into a per-position 0/1 label array.

    Positions holding ``(`` or ``)`` are labelled 1; every other symbol,
    including other bracket types, is labelled 0.
    """
    paired_symbols = ('(', ')')
    flags = [int(symbol in paired_symbols) for symbol in dotbracket]
    return np.array(flags)


## Prepare Dataset

In [None]:
# One float feature tensor (length x 4) and one integer label tensor (length)
# per record; both lists follow the order of `data`.
x_tensors = [
    torch.tensor(one_hot_encode(sequence), dtype=torch.float32)
    for sequence, _ in data
]
y_tensors = [
    torch.tensor(structure_to_labels(dotbracket), dtype=torch.long)
    for _, dotbracket in data
]

# Hold out 20% of the records for validation; the fixed seed keeps the split
# reproducible.
x_train_list, x_val_list, y_train_list, y_val_list = train_test_split(
    x_tensors,
    y_tensors,
    test_size=0.2,
    random_state=42,
)

# Pad each split to its longest sequence. Features are padded with zeros
# (the pad_sequence default), labels with -1 so that padded positions can be
# told apart from real ones.
x_train = pad_sequence(x_train_list, batch_first=True)
x_val = pad_sequence(x_val_list, batch_first=True)
y_train = pad_sequence(y_train_list, batch_first=True, padding_value=-1)
y_val = pad_sequence(y_val_list, batch_first=True, padding_value=-1)


## CNN Model

In [None]:
class RNACNN(nn.Module):
    """Two-layer 1D CNN producing two class scores per sequence position.

    Input is expected as ``(batch, length, 4)``; the output has shape
    ``(batch, length, 2)``. Both convolutions use kernel size 3 with padding 1,
    so the sequence length is preserved.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv1d(in_channels=4, out_channels=16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(in_channels=16, out_channels=32, kernel_size=3, padding=1)
        self.fc = nn.Linear(in_features=32, out_features=2)

    def forward(self, x):
        """Return per-position logits for a ``(batch, length, 4)`` input."""
        # Conv1d wants channels before the length axis.
        channels_first = x.transpose(1, 2)
        hidden = F.relu(self.conv1(channels_first))
        hidden = F.relu(self.conv2(hidden))
        # Back to (batch, length, channels) so the linear layer acts per position.
        return self.fc(hidden.transpose(1, 2))

# Build the network, its loss and its optimizer.
model = RNACNN()
# Targets equal to -1 (the padding value used for the label tensors) are
# excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=-1)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


## Train Model

In [None]:
# Per-epoch history of the training loss and the validation accuracy.
train_losses, val_accuracies = [], []

for epoch in range(25):
    # --- one optimisation step on the whole padded training set ---
    model.train()
    optimizer.zero_grad()
    outputs = model(x_train)
    # Merge the batch and length axes so every position is one sample.
    loss = criterion(outputs.flatten(0, 1), y_train.flatten())
    loss.backward()
    optimizer.step()
    train_losses.append(loss.item())

    # --- validation accuracy over real (non-padded) positions only ---
    model.eval()
    with torch.no_grad():
        val_outputs = model(x_val)
        val_preds = val_outputs.argmax(dim=-1)
        mask = y_val.ne(-1)
        correct = mask & val_preds.eq(y_val)
        accuracy = correct.sum().item() / mask.sum().item()
        val_accuracies.append(accuracy)

    # Report every fifth epoch.
    if (epoch + 1) % 5 == 0:
        print(f"Epoch {epoch+1}, Train Loss: {loss.item():.4f}, Val Accuracy: {accuracy:.4f}")


## Validation Accuracy Plot

In [None]:
# Plot the validation accuracy recorded after each training epoch.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(val_accuracies, marker='o', linestyle='--', color='green')
ax.set(
    xlabel="Epoch",
    ylabel="Validation Accuracy",
    title="Validation Accuracy During Training",
)
ax.grid(True)
plt.show()
