In [4]:
!pip install datasets torch matplotlib numpy scikit-learn tqdm




In [5]:
from datasets import load_dataset

# Load dataset
dataset = load_dataset("multimolecule/bprna")

print(dataset)

# Show one example
example = dataset['train'][0]
print("\nExample Sequence:\n", example['sequence'])
print("\nExample Structure:\n", example['secondary_structure'])
print("\nLength:", len(example['sequence']))


DatasetDict({
    train: Dataset({
        features: ['id', 'sequence', 'secondary_structure', 'structural_annotation', 'functional_annotation'],
        num_rows: 102318
    })
})

Example Sequence:
 ACACAUGCAAGCGAACGUGAUCUCCAGCUUGCUGGGGGAUUAGUGGCGAACGGGUGAGUAACACGUGAGUAACCUGCCCUUGACUCUGGGAUAAGCCUGGGAAACUGGGUCUAAUACUGGAUACGACCUUCCCACGCAUGUGUGUUGGUGGAAAGCUUUUGUGGUUUUGGAUGGACUCGCGGCCUAUCAGCUUGUUGGUGGGGUAAUGGCCUACCAAGGCGACGACGGGUAGCCGGCCUGAGAGGGUGGACGGCCACACUGGGACUGAGACACGGCCCAGACUCCUACGGGAGGCAGCAGUGGGGAAUAUUGCACAAUGGGCGCAAGCUGAUGCAGCGACGCCGCGUGAGGGAUGAAGGCCUUCGGGUUGUAAACCUCUUUCAGUAGGGAAGAAGCGAAAGUGACGGUACCUGCAGAAGAAGCGCCGGCUAACUACGUGCCAGCAGCCGCGGUAAUACGUAGGGCGCAAGCGUUAUCCGGAAUUAUUGGGCGUAAAGAGCUCGUAGGCGGUUUGUCGCGUCUGCCGUGAAAGUCCGGGGCUUAACUCCGGAUCUGCGGUGGGUACGGGCAGACUUGAGUGAUGUAGGGGAGACUGGAAUUCCUGGUGUAGCGGUGAAAUGCGCAGAUAUCAGGAGGAACACCGAUGGCGAAGGCAGGUCUCUGGGCAUUAACUGACGCUGAGGAGCGAAAGCAUGGGGAGCGAACAGGAUUAGAUACCCUGGUAGUCCAUGCCGUAAACGUUGGGCACUAGGUGUGGGGGACAUUCCACGUUUUCCGCGCCGUAGCUAACGCAUUAAGU

In [6]:
example = dataset['train'][0]

print("\nExample Sequence:\n", example['sequence'])
print("\nExample Structure:\n", example['secondary_structure'])
print("\nLength:", len(example['sequence']))



Example Sequence:
 ACACAUGCAAGCGAACGUGAUCUCCAGCUUGCUGGGGGAUUAGUGGCGAACGGGUGAGUAACACGUGAGUAACCUGCCCUUGACUCUGGGAUAAGCCUGGGAAACUGGGUCUAAUACUGGAUACGACCUUCCCACGCAUGUGUGUUGGUGGAAAGCUUUUGUGGUUUUGGAUGGACUCGCGGCCUAUCAGCUUGUUGGUGGGGUAAUGGCCUACCAAGGCGACGACGGGUAGCCGGCCUGAGAGGGUGGACGGCCACACUGGGACUGAGACACGGCCCAGACUCCUACGGGAGGCAGCAGUGGGGAAUAUUGCACAAUGGGCGCAAGCUGAUGCAGCGACGCCGCGUGAGGGAUGAAGGCCUUCGGGUUGUAAACCUCUUUCAGUAGGGAAGAAGCGAAAGUGACGGUACCUGCAGAAGAAGCGCCGGCUAACUACGUGCCAGCAGCCGCGGUAAUACGUAGGGCGCAAGCGUUAUCCGGAAUUAUUGGGCGUAAAGAGCUCGUAGGCGGUUUGUCGCGUCUGCCGUGAAAGUCCGGGGCUUAACUCCGGAUCUGCGGUGGGUACGGGCAGACUUGAGUGAUGUAGGGGAGACUGGAAUUCCUGGUGUAGCGGUGAAAUGCGCAGAUAUCAGGAGGAACACCGAUGGCGAAGGCAGGUCUCUGGGCAUUAACUGACGCUGAGGAGCGAAAGCAUGGGGAGCGAACAGGAUUAGAUACCCUGGUAGUCCAUGCCGUAAACGUUGGGCACUAGGUGUGGGGGACAUUCCACGUUUUCCGCGCCGUAGCUAACGCAUUAAGUGCCCCGCCUGGGGAGUACGGCCGCAAGGCUAAAACUCAAAGGAAUUGACGGGGGCCCGCACAAGCGGCGGAGCAUGCGGAUUAAUUCGAUGCAACGCGAAGAACCUUACCAAGGCUUGACAUGAACCGGAACGCGCAGAGAUGUGUCGGCCACUUGUGGCCGUUUACAGGUGGUGCAUGGU

In [7]:
def structure_to_labels(structure):
    labels = []
    for char in structure:
        if char == '.':
            labels.append(0)
        else:
            labels.append(1)
    return labels

# Test on your example
test_labels = structure_to_labels(example['secondary_structure'])
print("First 50 labels:", test_labels[:50])
print("Length:", len(test_labels))


First 50 labels: [0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0]
Length: 1434


In [13]:
MAX_LEN = 350

processed_data = []

for row in dataset["train"]:
    seq = row["sequence"]
    struct = row["secondary_structure"]

    if len(seq) <= MAX_LEN:  # keep only shorter sequences
        labels = structure_to_labels(struct)
        processed_data.append((seq, labels))

len(processed_data)


81800

In [14]:
# Encoding dictionary
base_to_int = {
    "A": 0,
    "C": 1,
    "G": 2,
    "U": 3
}

def encode_sequence(seq):
    encoded = []
    for base in seq:
        encoded.append(base_to_int.get(base, 4))  # Unknown base → 4
    return encoded

# Test with example sequence
encoded_test = encode_sequence(example["sequence"])
print("First 50 encoded bases:", encoded_test[:50])
print("Length:", len(encoded_test))


First 50 encoded bases: [0, 1, 0, 1, 0, 3, 2, 1, 0, 0, 2, 1, 2, 0, 0, 1, 2, 3, 2, 0, 3, 1, 3, 1, 1, 0, 2, 1, 3, 3, 2, 1, 3, 2, 2, 2, 2, 2, 0, 3, 3, 0, 2, 3, 2, 2, 1, 2, 0, 0]
Length: 1434


In [15]:
final_dataset = []

for seq, labels in processed_data:
    encoded_seq = encode_sequence(seq)

    # sanity check
    if len(encoded_seq) == len(labels):
        final_dataset.append((encoded_seq, labels))

len(final_dataset)


81800

In [16]:
from sklearn.model_selection import train_test_split

# Extract sequences and labels separately
X = [seq for seq, _ in final_dataset]
y = [labels for _, labels in final_dataset]

# First: train vs temp (val+test)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.30, random_state=42)

# Second: split temp into val and test (half-half)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.50, random_state=42)

len(X_train), len(X_val), len(X_test)


(57260, 12270, 12270)

In [17]:
import torch

MAX_LEN = 350  # fixed length

def pad_sequence(seq):
    return seq + [0] * (MAX_LEN - len(seq))

def pad_labels(labels):
    return labels + [0] * (MAX_LEN - len(labels))

# Apply padding
X_train_pad = [pad_sequence(seq) for seq in X_train]
y_train_pad = [pad_labels(lbl) for lbl in y_train]

X_val_pad = [pad_sequence(seq) for seq in X_val]
y_val_pad = [pad_labels(lbl) for lbl in y_val]

X_test_pad = [pad_sequence(seq) for seq in X_test]
y_test_pad = [pad_labels(lbl) for lbl in y_test]

# Convert to tensors
X_train_tensor = torch.tensor(X_train_pad, dtype=torch.long)
y_train_tensor = torch.tensor(y_train_pad, dtype=torch.float32)

X_val_tensor = torch.tensor(X_val_pad, dtype=torch.long)
y_val_tensor = torch.tensor(y_val_pad, dtype=torch.float32)

X_test_tensor = torch.tensor(X_test_pad, dtype=torch.long)
y_test_tensor = torch.tensor(y_test_pad, dtype=torch.float32)

X_train_tensor.shape, X_val_tensor.shape, X_test_tensor.shape


(torch.Size([57260, 350]), torch.Size([12270, 350]), torch.Size([12270, 350]))

In [18]:
from torch.utils.data import DataLoader, TensorDataset

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)
test_loader = DataLoader(test_dataset, batch_size=64)

len(train_loader), len(val_loader), len(test_loader)


(895, 192, 192)

In [19]:
import torch
import torch.nn as nn

class RNA_BiLSTM(nn.Module):
    def __init__(self, vocab_size=5, embed_dim=32, hidden_dim=64, num_layers=1, max_len=512):
        super(RNA_BiLSTM, self).__init__()

        # Convert numbers (0,1,2,3,4) → embedding vectors
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

        # Bidirectional LSTM
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=num_layers,
                            batch_first=True, bidirectional=True,dropout=0.3)

        # Output layer → 1 neuron for binary prediction per position
        self.fc = nn.Linear(hidden_dim * 2, 1)

        # Sigmoid turns output into probability (0-1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.embedding(x)              # (batch, seq_len, embed_dim)
        x, _ = self.lstm(x)                # (batch, seq_len, hidden*2)
        x = self.fc(x)                     # (batch, seq_len, 1)
        return self.sigmoid(x).squeeze(-1) # (batch, seq_len)


# Initialize model
model = RNA_BiLSTM(embed_dim=64, hidden_dim=128, num_layers=2)
model


RNA_BiLSTM(
  (embedding): Embedding(5, 64, padding_idx=0)
  (lstm): LSTM(64, 128, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [20]:
from torch.optim import Adam
from tqdm import tqdm

pos_weight = torch.tensor([3.0])  # can tune 2.0–4.0
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)          # Binary cross entropy loss
optimizer = Adam(model.parameters(), lr=0.0005)

EPOCHS = 10  # we will increase later

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    for X_batch, y_batch in tqdm(train_loader):
        optimizer.zero_grad()
        outputs = model(X_batch)

        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {total_loss/len(train_loader):.4f}")


100%|██████████| 895/895 [23:36<00:00,  1.58s/it]


Epoch 1/10, Loss: 0.7979


100%|██████████| 895/895 [23:33<00:00,  1.58s/it]


Epoch 2/10, Loss: 0.7829


100%|██████████| 895/895 [23:23<00:00,  1.57s/it]


Epoch 3/10, Loss: 0.7772


100%|██████████| 895/895 [24:25<00:00,  1.64s/it]


Epoch 4/10, Loss: 0.7740


100%|██████████| 895/895 [24:19<00:00,  1.63s/it]


Epoch 5/10, Loss: 0.7714


100%|██████████| 895/895 [24:12<00:00,  1.62s/it]


Epoch 6/10, Loss: 0.7694


100%|██████████| 895/895 [24:26<00:00,  1.64s/it]


Epoch 7/10, Loss: 0.7677


100%|██████████| 895/895 [24:34<00:00,  1.65s/it]


Epoch 8/10, Loss: 0.7663


100%|██████████| 895/895 [24:17<00:00,  1.63s/it]


Epoch 9/10, Loss: 0.7649


100%|██████████| 895/895 [24:17<00:00,  1.63s/it]

Epoch 10/10, Loss: 0.7638





In [21]:
import numpy as np

model.eval()  # set to evaluation mode
val_losses = []
all_preds = []
all_labels = []

with torch.no_grad():  # no gradient calculation needed
    for X_batch, y_batch in val_loader:
        outputs = model(X_batch)

        loss = criterion(outputs, y_batch)
        val_losses.append(loss.item())

        preds = (outputs > 0.5).float()  # convert probabilities to 0/1 classes

        all_preds.extend(preds.cpu().numpy().flatten())
        all_labels.extend(y_batch.cpu().numpy().flatten())

avg_val_loss = np.mean(val_losses)

print(f"\nValidation Loss: {avg_val_loss:.4f}")



Validation Loss: 0.7635


In [22]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

accuracy = accuracy_score(all_labels, all_preds)
precision = precision_score(all_labels, all_preds, zero_division=0)
recall = recall_score(all_labels, all_preds, zero_division=0)
f1 = f1_score(all_labels, all_preds, zero_division=0)

print("\nEvaluation Metrics:")
print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")



Evaluation Metrics:
Accuracy:  0.9462
Precision: 0.7338
Recall:    0.9357
F1 Score:  0.8226


In [23]:
import random

# Pick a random sample from test set
idx = random.randint(0, len(X_test_tensor)-1)

model.eval()
with torch.no_grad():
    sample_input = X_test_tensor[idx].unsqueeze(0)
    prediction = model(sample_input).squeeze(0)

predicted_labels = (prediction > 0.5).int().tolist()
true_labels = y_test_tensor[idx].int().tolist()

print("True labels (first 200):\n", true_labels[:200])
print("\nPredicted labels (first 200):\n", predicted_labels[:200])


True labels (first 200):
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Predicted labels (first 200):
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [24]:
torch.save(model.state_dict(), "rna_bilstm_model.pt")
print("Model saved successfully!")


Model saved successfully!


In [25]:
# Example RNAcentral sequence (you can replace later with real sequence)
test_seq = "AUGGCCGAUACGUUGGCUAGCUAUGUGAAGCG"  # short test example

# Encode sequence
encoded = encode_sequence(test_seq)

# Pad to MAX_LEN
encoded_padded = pad_sequence(encoded)

# Convert to tensor
input_tensor = torch.tensor(encoded_padded).unsqueeze(0)

# Predict
model.eval()
with torch.no_grad():
    prediction = model(input_tensor).squeeze(0)

predicted_labels = (prediction > 0.5).int().tolist()

print("\nSequence:")
print(test_seq)

print("\nPredicted labels for first", len(test_seq), "bases:")
print(predicted_labels[:len(test_seq)])



Sequence:
AUGGCCGAUACGUUGGCUAGCUAUGUGAAGCG

Predicted labels for first 32 bases:
[0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0]


In [26]:
from google.colab import files
files.download("rna_bilstm_model.pt")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [27]:
from google.colab import drive
drive.mount('/content/drive')


ValueError: mount failed

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)


In [None]:
!cp rna_bilstm_model.pt /content/drive/MyDrive/
