In [1]:
pip install Biopython



In [4]:
# Mount Google Drive at /content/drive; the data paths used by later cells
# all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
import numpy as np
from torch import optim
from torch import nn
from torch.utils.data import DataLoader, random_split, Dataset, Subset
from tqdm import tqdm
from itertools import chain
from itertools import islice
import random
import zipfile
import io
import re
import os
import glob
import pickle
from torch.nn.utils.rnn import pad_sequence
from Bio import SeqIO
from Bio.PDB import PDBList, PDBParser, PPBuilder
from Bio.Seq import Seq
from Bio.PDB.Polypeptide import is_aa
from Bio.SeqUtils import seq1
from collections import defaultdict
from functools import partial
import multiprocessing
from Bio.SeqUtils.ProtParam import ProteinAnalysis

In [3]:
def parse_bpseq_file(fileobj):
    """Parse one .bpseq file into a ``(sequence, dot-bracket)`` pair.

    Each data line is read as ``<index> <base> <partner>`` with 1-based
    indices, where a partner of ``0`` marks an unpaired base. Blank lines,
    lines starting with ``#`` and lines with fewer than three fields are
    ignored.

    Args:
        fileobj: Iterable of text lines (an open text file, ``io.StringIO``...).

    Returns:
        tuple[str, str]: The upper-cased bases joined in file order, and a
        structure string of the same length made of ``.``, ``(`` and ``)``.

    Raises:
        ValueError: If the index or partner field of a data line is not an
            integer.

    Notes:
        * Bases are stored in file order, so indices are assumed to be
          contiguous and to start at 1.
        * Every pair is written with round brackets; crossing (pseudoknotted)
          pairs do not get a distinct bracket type.
    """
    bases = []
    pair_map = {}
    for raw in fileobj:
        line = raw.strip()
        if not line or line.startswith("#"):
            continue
        parts = line.split()
        if len(parts) < 3:
            continue
        idx = int(parts[0]) - 1
        pair = int(parts[2]) - 1
        bases.append(parts[1].upper())
        if pair >= 0:
            pair_map[idx] = pair

    n_bases = len(bases)
    dot = ['.'] * n_bases
    used = set()
    for i, j in pair_map.items():
        # Ignore partners that point outside the sequence.
        if not (0 <= i < n_bases and 0 <= j < n_bases):
            continue
        # A base cannot pair with itself; leave it unpaired.
        if i == j:
            continue
        if i in used or j in used:
            continue
        # Always open at the 5' position and close at the 3' position, even
        # when the pair is only listed on the 3' partner's line.
        opening, closing = (i, j) if i < j else (j, i)
        dot[opening] = '('
        dot[closing] = ')'
        used.update((i, j))
    return ''.join(bases), ''.join(dot)

def parse_three_line_txt(fileobj):
  """Collect (sequence, dot-bracket) records from a 3-lines-per-record text file.

  Blank lines are dropped first; the remaining lines are then read in
  consecutive groups of three: a header starting with ``>``, the sequence
  (upper-cased) and the structure. A group is kept only when its header
  starts with ``>`` and sequence and structure have equal length. A trailing
  incomplete group is ignored.

  Returns:
    list[tuple[str, str]]: One ``(sequence, structure)`` tuple per kept group.
  """
  rows = [raw.strip() for raw in fileobj if raw.strip()]
  records = []
  # Zipping the three strided slices yields only complete triples.
  for header, seq_line, struct in zip(rows[0::3], rows[1::3], rows[2::3]):
    if not header.startswith('>'):
      continue
    seq = seq_line.upper()
    if len(seq) != len(struct):
      continue
    records.append((seq, struct))
  return records

def parse_dbn_file(fileobj):
  """Parse a .dbn file into a ``(sequence, dot-bracket structure)`` pair.

  Blank lines and lines starting with ``#`` are skipped. Of the remaining
  lines, the first is taken as the sequence (upper-cased) and the second as
  the structure; any further lines are ignored. Lengths are not compared
  here.

  Args:
    fileobj: Iterable of text lines (an open text file, ``io.StringIO``...).

  Returns:
    tuple[str, str]: ``(sequence, structure)``.

  Raises:
    ValueError: If fewer than two content lines are present.
  """
  content_lines = []
  for raw in fileobj:
    line = raw.strip()
    if line and not line.startswith("#"):
      content_lines.append(line)

  if len(content_lines) < 2:
    # The function only receives a file object, so report its name when it
    # has one instead of referencing a path variable that does not exist here.
    source = getattr(fileobj, "name", "<file object>")
    raise ValueError(f"File {source} is not a valid .dbn file format.")

  return content_lines[0].upper(), content_lines[1]

In [5]:
def load_from_zip(zip_path, name_filter=None, allowed_exts=('.bpseq', '.txt', '.dbn')):
    """Load ``(sequence, dot-bracket)`` records from structure files in a zip.

    Archive members are selected by extension (case-insensitive) and,
    optionally, by a case-insensitive substring of their name. Each selected
    member is decoded as UTF-8 and passed to the parser for its extension
    (``.bpseq``, ``.dbn`` or ``.txt``). A record is kept only when its
    sequence uses just A/U/G/C and has the same length as its structure.

    Args:
        zip_path: Path of the zip archive.
        name_filter: Optional substring a member name must contain.
        allowed_exts: Extension, or iterable of extensions, to consider.

    Returns:
        list[tuple[str, str]]: All accepted ``(sequence, structure)`` records.

    Notes:
        Loading is best-effort: a member that fails to decode or parse is
        reported on stdout and skipped, so one bad file does not abort the
        whole archive.
    """
    rna_alphabet = {'A', 'U', 'G', 'C'}

    # str.endswith accepts a str or a tuple of str only; normalise so that a
    # list or a single extension also works, and compare in lower case.
    if isinstance(allowed_exts, str):
        allowed_exts = (allowed_exts,)
    suffixes = tuple(ext.lower() for ext in allowed_exts)
    wanted = name_filter.lower() if name_filter else None

    all_data = []
    with zipfile.ZipFile(zip_path, 'r') as zf:
        for fname in zf.namelist():
            lowered = fname.lower()
            if not lowered.endswith(suffixes):
                continue
            if wanted and wanted not in lowered:
                continue
            try:
                with zf.open(fname) as file:
                    # Read file content as text
                    content = file.read().decode('utf-8')

                # Dispatch on the lower-cased name so that e.g. "X.BPSEQ",
                # which passes the filter above, is parsed rather than dropped.
                if lowered.endswith('.bpseq'):
                    records = [parse_bpseq_file(io.StringIO(content))]
                elif lowered.endswith('.dbn'):
                    records = [parse_dbn_file(io.StringIO(content))]
                elif lowered.endswith('.txt'):
                    records = parse_three_line_txt(io.StringIO(content))
                else:
                    records = []

                for seq, struct in records:
                    if set(seq).issubset(rna_alphabet) and len(seq) == len(struct):
                        all_data.append((seq, struct))
            except Exception as e:
                print(f"Skipping {fname}: {e}")
    return all_data

In [6]:
# RNA sequences and dot-bracket structures.
# RNAdata is the list of (sequence, structure) tuples returned by load_from_zip
# for this archive; records with bases outside A/U/G/C or with mismatched
# sequence/structure lengths have already been dropped by the loader.
RNAdata = load_from_zip('/content/drive/MyDrive/Deep Learning-RNA-Peptide-Interaction/bpRNA_1m_90_bpseqFiles.zip')

In [7]:
# One-letter codes accepted as "standard" residues, in alphabetical order.
amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
# Residue letter -> position of that letter in `amino_acids`.
aa_to_int = dict(zip(amino_acids, range(len(amino_acids))))

# Regex character class matching one or more of the letters above.
pattern = '[' + amino_acids + ']+'

def is_standard_sequence(seq):
    """Return True when `seq` is non-empty and made only of letters in `pattern`.

    The whole string must match the module-level regex `pattern`; an empty
    string does not match because the pattern requires at least one letter.
    """
    # A match object is always truthy, so bool() mirrors an "is not None" test.
    return bool(re.fullmatch(pattern, seq))

In [8]:
# Peptide sequences from the FASTA file, with gap characters removed, keeping
# only those accepted by is_standard_sequence.
sequences = []
for record in SeqIO.parse("/content/drive/MyDrive/Deep Learning-RNA-Peptide-Interaction/peptideatlas.fasta", "fasta"):
    seq = str(record.seq).replace("-", "")
    if not is_standard_sequence(seq):
        continue
    sequences.append(seq)

# Length of the longest retained peptide.
max_length = max(map(len, sequences))

In [9]:
# One property vector per entry of `sequences`, in the same order:
# [molecular weight, isoelectric point, instability index, aromaticity].
targets = []
for seq in sequences:
    analysis = ProteinAnalysis(seq)
    mw, pI, instability, aromaticity = (
        analysis.molecular_weight(),
        analysis.isoelectric_point(),
        analysis.instability_index(),
        analysis.aromaticity(),
    )
    targets.append([mw, pI, instability, aromaticity])

In [10]:
def parse_single_fasta_file(path):
    """Return every non-header line of a FASTA file joined into one string.

    Lines starting with ``>`` are skipped; all other lines are stripped of
    surrounding whitespace and concatenated, regardless of how many records
    the file holds.
    """
    with open(path, 'r') as handle:
        return ''.join(row.strip() for row in handle if not row.startswith(">"))

def parse_fasta_file(filepath):
    """Return the upper-cased sequences of every record in a FASTA file.

    Parsing is best-effort: if anything goes wrong while reading the file, an
    empty list is returned (nothing is reported, and no partial result is
    kept).
    """
    parsed = []
    try:
        for record in SeqIO.parse(filepath, "fasta"):
            parsed.append(str(record.seq).strip().upper())
    except Exception:
        return []
    return parsed

# Function to gather all .fasta/.fa files recursively
def get_fasta_files_from_nested_folders(root_folder, limit=None):
    """List files below `root_folder` whose name matches ``*.fa*``.

    Args:
        root_folder: Directory to search, including all nested folders.
        limit: Maximum number of paths to return. ``None`` (default) means no
            limit; ``0`` returns an empty list.

    Returns:
        list[str]: Matching paths in sorted order, so that a truncated result
        is the same on every run (glob itself returns paths in arbitrary
        order).

    Raises:
        ValueError: If `limit` is negative (raised by ``itertools.islice``).
    """
    search_pattern = os.path.join(root_folder, "**", "*.fa*")
    files = sorted(glob.glob(search_pattern, recursive=True))
    # Compare against None explicitly: a plain truthiness test would treat
    # limit=0 as "no limit" and return every file.
    if limit is None:
        return files
    return list(islice(files, limit))

# Function to load sequences from multiple files using multiprocessing
def load_fasta_sequences_parallel(file_list, num_workers=4):
    """Parse many FASTA files in a process pool and pool their sequences.

    Args:
        file_list: Paths handed one by one to `parse_fasta_file`.
        num_workers: Number of worker processes in the pool.

    Returns:
        list: All sequences from all files. Results are consumed as workers
        finish, so the order does not follow `file_list`. An empty
        `file_list` returns ``[]`` without starting a pool.
    """
    if not file_list:
        return []

    collected = []
    with multiprocessing.Pool(processes=num_workers) as pool:
        per_file = pool.imap_unordered(parse_fasta_file, file_list, chunksize=100)
        # tqdm cannot infer the length of the lazy iterator, so pass it.
        for seq_list in tqdm(per_file, total=len(file_list)):
            collected.extend(seq_list)
    return collected

def parse_fasta_sequences(filepath):
    """Read a FASTA file and return one string per record.

    Lines between two header lines (lines starting with ``>``) are stripped
    and concatenated. Records with no sequence text are skipped. Text that
    appears before the first header is returned as a record of its own.
    Sequences are returned as written: no case change, no filtering.
    """
    peptides = []
    pieces = []

    def _flush():
        # Emit the record gathered so far, if it has any text, then reset.
        joined = ''.join(pieces)
        if joined:
            peptides.append(joined)
        pieces.clear()

    with open(filepath, 'r') as handle:
        for row in handle:
            if row.startswith('>'):
                _flush()
            else:
                pieces.append(row.strip())
    _flush()
    return peptides

fasta_files = get_fasta_files_from_nested_folders(
    "/content/drive/MyDrive/Deep Learning-RNA-Peptide-Interaction/RNA-fastaFiles",
    limit=200000,
)
rna_seqs = load_fasta_sequences_parallel(fasta_files, num_workers=8)
peptide_seqs = parse_fasta_sequences("/content/drive/MyDrive/Deep Learning-RNA-Peptide-Interaction/peptideatlas.fasta")

# De-duplicate the RNA sequences from both sources.
rna_fasta_seqs = set(rna_seqs)
bpseq_seqs = {pair[0] for pair in RNAdata}

all_unique_seqs = rna_fasta_seqs | bpseq_seqs

# Sequence -> structure lookup; for a repeated sequence the last entry wins.
RNA_struct = dict(RNAdata)

100%|██████████| 102321/102321 [07:39<00:00, 222.64it/s]


In [11]:
# Pair every unique RNA sequence with its known structure, or with None when
# the sequence has no entry in RNA_struct.
final_rna_data = []
for seq in all_unique_seqs:
    struct = RNA_struct[seq] if seq in RNA_struct else None
    final_rna_data.append((seq, struct))

In [12]:
def download_multiple_pdbs(pair_file, out_dir):
    """Download the PDB file of every protein listed in a pair file.

    `pair_file` is read line by line; a line is used only when it has exactly
    two whitespace-separated fields and the first is not the header word
    "protein". The PDB ID is the part of the first field before the first
    underscore. Each ID is fetched once from files.rcsb.org and saved as
    ``<out_dir>/<id>.pdb`` (lower-case ID); files already present are not
    downloaded again.

    Args:
        pair_file: Path of the text file with ``<protein> <rna>`` lines.
        out_dir: Directory to save into; created if missing.

    Returns:
        None. A failed download is reported on stdout and the remaining IDs
        are still processed.
    """
    import os
    import urllib.request

    os.makedirs(out_dir, exist_ok=True)

    pdb_ids = set()
    with open(pair_file, 'r') as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) != 2 or parts[0].lower() == "protein":
                continue  # Skip headers or malformed lines
            pdb_ids.add(parts[0].split("_")[0].lower())

    for pdb_id in sorted(pdb_ids):
        filepath = os.path.join(out_dir, f"{pdb_id}.pdb")
        if os.path.exists(filepath):
            continue
        url = f"https://files.rcsb.org/download/{pdb_id.upper()}.pdb"
        # Download under a temporary name and rename on success, so that an
        # interrupted transfer never leaves a truncated file that a later run
        # would mistake for a finished download.
        partial_path = filepath + ".part"
        try:
            urllib.request.urlretrieve(url, partial_path)
            os.replace(partial_path, filepath)
        except Exception as exc:
            if os.path.exists(partial_path):
                os.remove(partial_path)
            print(f"Failed to download {pdb_id.upper()} from {url}: {exc}")


#download_multiple_pdbs("RPI2241.txt", "./pdb_files")

def load_rpi2241_pairs(filepath):
    """Read ``<protein> <rna>`` lines from a file into a set of tuples.

    A line is used only when it has exactly two whitespace-separated fields
    and the first is not the header word "protein" (any case). Both fields
    are upper-cased.

    Returns:
        set[tuple[str, str]]: ``(protein, rna)`` pairs.
    """
    with open(filepath, 'r') as handle:
        rows = (raw.strip().split() for raw in handle)
        return {
            (fields[0].upper(), fields[1].upper())
            for fields in rows
            if len(fields) == 2 and fields[0].lower() != "protein"
        }

# Set of upper-cased (protein, rna) pairs read from RPI2241.txt.
# NOTE(review): the fields look like identifiers (e.g. PDB-chain IDs), not
# sequences; confirm before comparing them against sequence strings.
rpi2241_positive_pairs=load_rpi2241_pairs("/content/drive/MyDrive/Deep Learning-RNA-Peptide-Interaction/RPI2241.txt")

In [13]:
def one_hot_encodeRNA(sequence):
    """One-hot encode an RNA sequence as a float tensor of shape (len, 4).

    Columns are ordered A, U, G, C. Any other symbol gets an all-zero row.
    """
    column_of = dict(zip('AUGC', range(4)))
    matrix = np.zeros((len(sequence), 4))
    # (row, column) coordinates of every recognised base.
    known = [(row, column_of[base]) for row, base in enumerate(sequence) if base in column_of]
    if known:
        rows, cols = map(list, zip(*known))
        matrix[rows, cols] = 1
    return torch.tensor(matrix, dtype=torch.float)
def structure_to_labels(dotbracket):
    """Convert a dot-bracket string into a NumPy array of integer labels.

    ``.`` is 0, and each bracket character has its own label: ``( )`` are
    1/2, ``[ ]`` are 3/4, ``{ }`` are 5/6 and ``< >`` are 7/8. Any other
    character is labelled 0.
    """
    label_of = dict(zip('.()[]{}<>', range(9)))
    labels = []
    for symbol in dotbracket:
        labels.append(label_of.get(symbol, 0))
    return np.array(labels)
def one_hot_encodepeptide(sequence):
    """One-hot encode a peptide as a float tensor of shape (len, 20).

    Columns follow the residue order ``ARNDCEQGHILKMFPSTWYV``. Any other
    symbol gets an all-zero row.
    """
    residue_order = 'ARNDCEQGHILKMFPSTWYV'
    column_of = {residue: col for col, residue in enumerate(residue_order)}
    matrix = np.zeros((len(sequence), 20))
    for row, residue in enumerate(sequence):
        col = column_of.get(residue)
        if col is None:
            continue
        matrix[row, col] = 1
    return torch.tensor(matrix, dtype=torch.float)

In [14]:
# Encode every RNA as (one-hot sequence tensor, label array or None).
rna_inputs = []
for seq, struct in final_rna_data:
    seq_tensor = one_hot_encodeRNA(seq)
    # Sequences without a known structure keep None as a placeholder.
    struct_tensor = structure_to_labels(struct) if struct else None
    rna_inputs.append((seq_tensor, struct_tensor))

# `targets` was computed from `sequences` (gaps removed, non-standard peptides
# filtered out), one entry per sequence in the same order. `peptide_seqs` is
# the unfiltered list from the same FASTA file, so zipping it with `targets`
# pairs a peptide with another peptide's properties as soon as one record has
# been filtered out. Iterate over `sequences` so each one-hot tensor is paired
# with its own property vector.
assert len(sequences) == len(targets), "sequences and targets must be aligned"
peptide_inputs = []
for pep_seq, props in zip(sequences, targets):
    pep_tensor = one_hot_encodepeptide(pep_seq)
    pep_properties = torch.tensor(props, dtype=torch.float)
    peptide_inputs.append((pep_tensor, pep_properties))

In [None]:
#negative_pairs = []
#for rna_seq, rna_struct in final_rna_data:
    #for pep_seq in peptide_seqs:
        #if (pep_seq, rna_seq) not in rpi2241_positive_pairs:
            #negative_pairs.append((rna_seq, rna_struct, pep_seq))

In [None]:
filtered_negative_pairs = []
def rna_structure_complexity(dotbracket):
    """Count the opening brackets in a dot-bracket string.

    All four opening bracket types that `structure_to_labels` recognises are
    counted: ``(``, ``[``, ``{`` and ``<``.

    Args:
        dotbracket: Structure string, or ``None``/empty when no structure is
            known for the sequence.

    Returns:
        int: Number of opening brackets; 0 when there is no structure.
    """
    # RNAs loaded from plain FASTA carry None as their structure.
    if not dotbracket:
        return 0
    return sum(dotbracket.count(opener) for opener in '([{<')

def peptide_property_magnitude(properties):
  """Return the sum of the absolute values of `properties`."""
  magnitudes = np.absolute(properties)
  return np.sum(magnitudes)

#for rna_seq, rna_struct, pep_seq in negative_pairs:
    #rna_score = rna_structure_complexity(rna_struct)
    #pep_score = peptide_property_magnitude()
    #if rna_score > 0 and pep_score > 50:  # Example thresholds
        #filtered_negative_pairs.append((rna_seq, rna_struct, pep_seq))

In [None]:
class IntegratedRNAPeptideCNN(nn.Module):
    """Combine an RNA encoder, a peptide encoder and peptide properties.

    The three feature vectors are concatenated along dim 1 and passed through
    a small MLP that produces one value per sample.

    Args:
        rna_cnn: Module called as ``rna_cnn(rna_seq, rna_struct)``; must
            expose an ``embedding_dim`` attribute.
        pep_cnn: Module called as ``pep_cnn(pep_seq)``; must expose an
            ``embedding_dim`` attribute.
        property_dim: Number of peptide property features per sample.
    """

    def __init__(self, rna_cnn, pep_cnn, property_dim):
        super().__init__()
        # Sub-encoders (pretrained or trainable).
        self.rna_cnn = rna_cnn
        self.pep_cnn = pep_cnn
        # Projects the raw peptide properties to 16 features.
        self.property_fc = nn.Linear(property_dim, 16)
        fused_dim = rna_cnn.embedding_dim + pep_cnn.embedding_dim + 16
        # Final layer has a single output unit (binary target); no activation
        # is applied here.
        self.final_fc = nn.Sequential(
            nn.Linear(fused_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
        )

    def forward(self, rna_seq, rna_struct, pep_seq, pep_properties):
        """Return the output of the final MLP for one batch.

        Each encoder output is assumed to be 2-D (batch, features), since the
        results are concatenated along dim 1 (TODO confirm against the
        encoders).
        """
        features = [
            self.rna_cnn(rna_seq, rna_struct),
            self.pep_cnn(pep_seq),
            F.relu(self.property_fc(pep_properties)),
        ]
        return self.final_fc(torch.cat(features, dim=1))