In [1]:
# heyy

In [2]:
!pip install -q /kaggle/input/rna-wheels/wheels/biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

In [3]:
!ls "/kaggle/input/stanford-rna-3d-folding-2"

extra	 sample_submission.csv	train_sequences.csv
MSA	 test_sequences.csv	validation_labels.csv
PDB_RNA  train_labels.csv	validation_sequences.csv


In [4]:
import os
from tqdm import tqdm
from collections import Counter

files = os.listdir("/kaggle/input/stanford-rna-3d-folding-2/PDB_RNA")
print(f"Total number of files: {len(files)}")

# Get file extensions
extensions = [os.path.splitext(file)[1] for file in files if os.path.splitext(file)[1]]

# Count occurrences of each extension
extension_counts = Counter(extensions)

print(f"File types and their counts:")
for ext, count in sorted(extension_counts.items()):
    print(f"  {ext}: {count}")

Total number of files: 9566
File types and their counts:
  .cif: 9564
  .csv: 1
  .fasta: 1


In [5]:
import pandas as pd
import os

# Find the CSV file name
files = os.listdir("/kaggle/input/stanford-rna-3d-folding-2/PDB_RNA")
csv_files = [f for f in files if f.endswith('.csv')]
print(f"CSV file(s): {csv_files}")

# Read the CSV file
csv_file_path = f"/kaggle/input/stanford-rna-3d-folding-2/PDB_RNA/{csv_files[0]}"

# Skip the problematic header lines
df = pd.read_csv(csv_file_path, on_bad_lines='skip')
print(f"CSV file shape: {df.shape}")

print(f"Column names: {list(df.columns)}")
print(df.head())


CSV file(s): ['pdb_release_dates_NA.csv']
CSV file shape: (9564, 2)
Column names: ['Entry ID', 'Release Date']
  Entry ID Release Date
0     4TNA   1978-04-12
1     6TNA   1979-01-16
2     1TRA   1986-07-14
3     1TN2   1986-10-24
4     1TN1   1987-01-15


In [6]:
# Check unique counts for Entry ID
print(f"Total rows: {len(df)}")
print(f"Unique Entry IDs: {df['Entry ID'].nunique()}")
print(f"Duplicate Entry IDs: {len(df) - df['Entry ID'].nunique()}")

# Convert Release Date to datetime for better analysis
df['Release Date'] = pd.to_datetime(df['Release Date'])

# Get date range
print(f"\nRelease Date range:")
print(f"Earliest date: {df['Release Date'].min()}")
print(f"Latest date: {df['Release Date'].max()}")
print(f"Date span: {(df['Release Date'].max() - df['Release Date'].min()).days} days")

# Check unique release dates
print(f"\nUnique Release Dates: {df['Release Date'].nunique()}")

# Show some examples of duplicates if any exist
if len(df) > df['Entry ID'].nunique():
    print(f"\nExamples of duplicate Entry IDs:")
    duplicates = df[df['Entry ID'].duplicated(keep=False)].sort_values('Entry ID')
    print(duplicates.head(10))

Total rows: 9564
Unique Entry IDs: 9564
Duplicate Entry IDs: 0

Release Date range:
Earliest date: 1978-04-12 00:00:00
Latest date: 2025-12-17 00:00:00
Date span: 17416 days

Unique Release Dates: 1390


## FASTA:  
---

In [7]:
import os

# Find the FASTA file
files = os.listdir("/kaggle/input/stanford-rna-3d-folding-2/PDB_RNA")
fasta_files = [f for f in files if f.endswith('.fasta')]
print(f"FASTA file(s): {fasta_files}")

# Read and examine the FASTA file
fasta_file_path = f"/kaggle/input/stanford-rna-3d-folding-2/PDB_RNA/{fasta_files[0]}"

# Read the file and look at its structure
with open(fasta_file_path, 'r') as f:
    lines = f.readlines()

print(f"Total lines in FASTA file: {len(lines)}")
print(f"\nFirst 20 lines:")
for i, line in enumerate(lines[:20]):
    print(f"Line {i+1}: {repr(line)}")


FASTA file(s): ['pdb_seqres_NA.fasta']
Total lines in FASTA file: 52510

First 20 lines:
Line 1: '>4TNA_A\n'
Line 2: 'GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUCUGGAGGUCCUGUGUUCGAUCCACAGAAUUCGCACCA\n'
Line 3: '>6TNA_A\n'
Line 4: 'GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUCUGGAGGUCCUGUGUUCGAUCCACAGAAUUCGCACCA\n'
Line 5: '>1TRA_A\n'
Line 6: 'GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUCUGGAGGUCCUGUGUUCGAUCCACAGAAUUCGCACCA\n'
Line 7: '>1TN2_A\n'
Line 8: 'GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUCUGGAGGUCCUGUGUUCGAUCCACAGAAUUCGCACCA\n'
Line 9: '>1TN1_A\n'
Line 10: 'GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUCUGGAGGUCCUGUGUUCGAUCCACAGAAUUCGCACCA\n'
Line 11: '>2TRA_A\n'
Line 12: 'UCCGUGAUAGUUUAAUGGUCAGAAUGGGCGCUUGUCGCGUGCCAGAUCGGGGUUCAAUUCCCCGUCGCGGAGCCA\n'
Line 13: '>3TRA_A\n'
Line 14: 'UCCGUGAUAGUUUAAUGGUCAGAAUGGGCGCUUGUCGCGUGCCAGAUCGGGGUUCAAUUCCCCGUCGCGGAGCCA\n'
Line 15: '>4TRA_A\n'
Line 16: 'GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUCUGGAGGUCCUGUGUUCGAUCCACAGAAUUCGCACCA\n'
Line 17: '>2TMV_R\n'
Line 18: 'GAA

In [8]:
# Parse FASTA file to count sequences
sequences = []
current_seq = ""
headers = []

with open(fasta_file_path, 'r') as f:
    for line in f:
        line = line.strip()
        if line.startswith('>'):  # Header line
            if current_seq:  # Save previous sequence
                sequences.append(current_seq)
                current_seq = ""
            headers.append(line)
        else:  # Sequence line
            current_seq += line
    
    # Don't forget the last sequence
    if current_seq:
        sequences.append(current_seq)

print(f"\nFASTA file summary:")
print(f"Number of sequences: {len(sequences)}")
print(f"Number of headers: {len(headers)}")

if sequences:
    seq_lengths = [len(seq) for seq in sequences]
    print(f"Sequence lengths - Min: {min(seq_lengths)}, Max: {max(seq_lengths)}, Average: {sum(seq_lengths)/len(seq_lengths):.1f}")
    
    print(f"\nFirst few headers:")
    for i, header in enumerate(headers[:5]):
        print(f"  {header}")
    
    print(f"\nFirst sequence (first 100 chars):")
    print(f"  {sequences[0][:100]}...")


FASTA file summary:
Number of sequences: 26255
Number of headers: 26255
Sequence lengths - Min: 2, Max: 19000, Average: 497.4

First few headers:
  >4TNA_A
  >6TNA_A
  >1TRA_A
  >1TN2_A
  >1TN1_A

First sequence (first 100 chars):
  GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUCUGGAGGUCCUGUGUUCGAUCCACAGAAUUCGCACCA...


In [9]:
# Continue with the parsing code from before to see the complete statistics
print(f"\nFASTA file summary:")
print(f"Number of sequences: {len(sequences)}")
print(f"Sequence lengths - Min: {min(seq_lengths)}, Max: {max(seq_lengths)}")

# Count DNA vs RNA vs mixed
dna_count = sum(1 for h in headers if 'DNA' in h and 'RNA' not in h)
rna_count = sum(1 for h in headers if 'RNA' in h and 'DNA' not in h)
mixed_count = sum(1 for h in headers if 'DNA' in h and 'RNA' in h)
print(f"DNA sequences: {dna_count}")
print(f"RNA sequences: {rna_count}")  
print(f"Mixed DNA/RNA: {mixed_count}")

# Count unique PDB IDs
pdb_ids = [h.split('_')[0][1:] for h in headers]  # Remove '>' and chain part
unique_pdbs = len(set(pdb_ids))
print(f"Unique PDB structures: {unique_pdbs}")


FASTA file summary:
Number of sequences: 26255
Sequence lengths - Min: 2, Max: 19000
DNA sequences: 0
RNA sequences: 13
Mixed DNA/RNA: 0
Unique PDB structures: 9564


In [10]:
# Count RNA sequences with less than 1000 nucleotides
rna_short_count = 0
rna_lengths = []

for i, header in enumerate(headers):
   if 'RNA' in header and 'DNA' not in header:  # Pure RNA sequences only
       seq_length = len(sequences[i])
       rna_lengths.append(seq_length)
       if seq_length < 1000:
           rna_short_count += 1

print(f"RNA sequences with less than 1000 nucleotides: {rna_short_count}")
print(f"Total RNA sequences: {len(rna_lengths)}")
print(f"Percentage of RNA sequences < 1000 nt: {rna_short_count/len(rna_lengths)*100:.1f}%")

# Additional statistics for RNA sequences
if rna_lengths:
   print(f"\nRNA sequence length statistics:")
   print(f"Min length: {min(rna_lengths)}")
   print(f"Max length: {max(rna_lengths)}")
   print(f"Average length: {sum(rna_lengths)/len(rna_lengths):.1f}")
   
   # Length distribution
   length_ranges = [
       (0, 100, "1-100"),
       (100, 500, "100-500"), 
       (500, 1000, "500-1000"),
       (1000, 2000, "1000-2000"),
       (2000, float('inf'), "2000+")
   ]
   
   print(f"\nRNA sequence length distribution:")
   for min_len, max_len, label in length_ranges:
       count = sum(1 for length in rna_lengths if min_len < length <= max_len)
       print(f"  {label} nt: {count}")

RNA sequences with less than 1000 nucleotides: 13
Total RNA sequences: 13
Percentage of RNA sequences < 1000 nt: 100.0%

RNA sequence length statistics:
Min length: 3
Max length: 76
Average length: 28.7

RNA sequence length distribution:
  1-100 nt: 13
  100-500 nt: 0
  500-1000 nt: 0
  1000-2000 nt: 0
  2000+ nt: 0


## Train Sequences:  
---

In [11]:
train_sequences = pd.read_csv("/kaggle/input/stanford-rna-3d-folding-2/train_sequences.csv")

In [12]:
train_sequences.head()

Unnamed: 0,target_id,sequence,temporal_cutoff,description,stoichiometry,all_sequences,ligand_ids,ligand_SMILES
0,4TNA,GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUCUGGAGG...,1978-04-12,FURTHER REFINEMENT OF THE STRUCTURE OF YEAST T...,A:1,>4TNA_1|Chain A[auth A]|TRNAPHE|\nGCGGAUUUAGCU...,MG,[Mg+2]
1,6TNA,GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUCUGGAGG...,1979-01-16,CRYSTAL STRUCTURE OF YEAST PHENYLALANINE T-RNA...,A:1,>6TNA_1|Chain A[auth A]|TRNAPHE|\nGCGGAUUUAGCU...,MG,[Mg+2]
2,1TRA,GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUCUGGAGG...,1986-07-14,RESTRAINED REFINEMENT OF THE MONOCLINIC FORM O...,A:1,>1TRA_1|Chain A[auth A]|TRNAPHE|\nGCGGAUUUAGCU...,MG,[Mg+2]
3,1TN2,GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUCUGGAGG...,1986-10-24,CRYSTALLOGRAPHIC AND BIOCHEMICAL INVESTIGATION...,A:1,>1TN2_1|Chain A[auth A]|TRNAPHE|\nGCGGAUUUAGCU...,MG;PB;SPM,[Mg+2];[Pb+2];C(CCNCCCN)CNCCCN
4,1TN1,GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUCUGGAGG...,1987-01-15,CRYSTALLOGRAPHIC AND BIOCHEMICAL INVESTIGATION...,A:1,>1TN1_1|Chain A[auth A]|TRNAPHE|\nGCGGAUUUAGCU...,MG;PB;SPM,[Mg+2];[Pb+2];C(CCNCCCN)CNCCCN


In [13]:
import pandas as pd

try:
    train_sequences
except NameError:
    train_sequences = pd.read_csv("/kaggle/input/stanford-rna-3d-folding-2/train_sequences.csv")

try:
    validation_sequences
except NameError:
    validation_sequences = pd.read_csv("/kaggle/input/stanford-rna-3d-folding-2/validation_sequences.csv")

v1_only_targets = set(train_sequences['target_id']) - set(validation_sequences['target_id'])
print(f"Target IDs in train but not in validation: {len(v1_only_targets)}")
print(f"First 10: {list(v1_only_targets)[:10]}")


Target IDs in train but not in validation: 5716
First 10: ['1D0U', '6SPC', '6CC3', '7OMA', '5FCI', '1TFY', '7ZQ6', '7Z3O', '8FVI', '8WRQ']


In [14]:
train_labels = pd.read_csv("/kaggle/input/stanford-rna-3d-folding-2/train_labels.csv")

  train_labels = pd.read_csv("/kaggle/input/stanford-rna-3d-folding-2/train_labels.csv")


In [15]:
train_labels.head()

Unnamed: 0,ID,resname,resid,x_1,y_1,z_1,chain,copy
0,157D_1,C,1,4.843,-5.64,13.265,A,1
1,157D_2,G,2,3.385,-7.613,8.267,A,1
2,157D_3,C,3,2.158,-6.751,2.949,A,1
3,157D_4,G,4,2.669,-4.843,-1.773,A,1
4,157D_5,A,5,3.509,0.239,-4.045,A,1


In [16]:
import pandas as pd

# Ensure required dataframes exist
try:
    train_labels
except NameError:
    train_labels = pd.read_csv("/kaggle/input/stanford-rna-3d-folding-2/train_labels.csv")

try:
    validation_labels
except NameError:
    validation_labels = pd.read_csv("/kaggle/input/stanford-rna-3d-folding-2/validation_labels.csv")

# Extract target_ids from labels by removing the residue suffix
v1_label_targets = set(train_labels['ID'].astype(str).str.rsplit('_', n=1).str[0])
v2_label_targets = set(validation_labels['ID'].astype(str).str.rsplit('_', n=1).str[0])

# Check for target_ids in v1 labels that are not in v2 labels
v1_only_label_targets = v1_label_targets - v2_label_targets
print(f"Target IDs in train labels but not in validation labels: {len(v1_only_label_targets)}")
print(f"First 10: {list(v1_only_label_targets)[:10]}")


Target IDs in train labels but not in validation labels: 5716
First 10: ['1D0U', '6SPC', '6CC3', '7OMA', '5FCI', '1TFY', '7ZQ6', '7Z3O', '8FVI', '8WRQ']


In [17]:
import pandas as pd

# Ensure v1 exists
try:
    train_sequences
except NameError:
    train_sequences = pd.read_csv("/kaggle/input/stanford-rna-3d-folding-2/train_sequences.csv")

try:
    train_labels
except NameError:
    train_labels = pd.read_csv("/kaggle/input/stanford-rna-3d-folding-2/train_labels.csv")

# Define v2 as validation (since no separate v2 dataset is provided)
try:
    train_sequences_v2
except NameError:
    train_sequences_v2 = pd.read_csv("/kaggle/input/stanford-rna-3d-folding-2/validation_sequences.csv")

try:
    train_labels_v2
except NameError:
    train_labels_v2 = pd.read_csv("/kaggle/input/stanford-rna-3d-folding-2/validation_labels.csv")

# Concatenate both sequence dataframes
combined_sequences = pd.concat([train_sequences, train_sequences_v2], ignore_index=True)

# Concatenate both label dataframes
combined_labels = pd.concat([train_labels, train_labels_v2], ignore_index=True)

print("combined_sequences:", combined_sequences.shape)
print("combined_labels   :", combined_labels.shape)


combined_sequences: (5744, 8)
combined_labels   : (7804733, 126)


In [18]:
combined_sequences.head()

Unnamed: 0,target_id,sequence,temporal_cutoff,description,stoichiometry,all_sequences,ligand_ids,ligand_SMILES
0,4TNA,GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUCUGGAGG...,1978-04-12,FURTHER REFINEMENT OF THE STRUCTURE OF YEAST T...,A:1,>4TNA_1|Chain A[auth A]|TRNAPHE|\nGCGGAUUUAGCU...,MG,[Mg+2]
1,6TNA,GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUCUGGAGG...,1979-01-16,CRYSTAL STRUCTURE OF YEAST PHENYLALANINE T-RNA...,A:1,>6TNA_1|Chain A[auth A]|TRNAPHE|\nGCGGAUUUAGCU...,MG,[Mg+2]
2,1TRA,GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUCUGGAGG...,1986-07-14,RESTRAINED REFINEMENT OF THE MONOCLINIC FORM O...,A:1,>1TRA_1|Chain A[auth A]|TRNAPHE|\nGCGGAUUUAGCU...,MG,[Mg+2]
3,1TN2,GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUCUGGAGG...,1986-10-24,CRYSTALLOGRAPHIC AND BIOCHEMICAL INVESTIGATION...,A:1,>1TN2_1|Chain A[auth A]|TRNAPHE|\nGCGGAUUUAGCU...,MG;PB;SPM,[Mg+2];[Pb+2];C(CCNCCCN)CNCCCN
4,1TN1,GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUCUGGAGG...,1987-01-15,CRYSTALLOGRAPHIC AND BIOCHEMICAL INVESTIGATION...,A:1,>1TN1_1|Chain A[auth A]|TRNAPHE|\nGCGGAUUUAGCU...,MG;PB;SPM,[Mg+2];[Pb+2];C(CCNCCCN)CNCCCN


In [19]:
# Extract target_ids from FASTA headers for RNA sequences only
rna_headers = [h for h in headers if 'RNA' in h and 'DNA' not in h]
fasta_rna_targets = set()

for header in rna_headers:
    # Extract PDB_ID and chain from header like ">1SCL_A mol:na..."
    target_part = header.split()[0][1:]  # Remove '>' and take first part
    fasta_rna_targets.add(target_part)

print(f"RNA targets in FASTA: {len(fasta_rna_targets)}")

# Check overlap with combined sequences
combined_targets = set(combined_sequences['target_id'])
overlap = combined_targets.intersection(fasta_rna_targets)
missing_in_fasta = combined_targets - fasta_rna_targets

print(f"Combined sequence targets: {len(combined_targets)}")
print(f"Overlap (targets in both): {len(overlap)}")
print(f"Missing in FASTA: {len(missing_in_fasta)}")
print(f"Coverage: {len(overlap)/len(combined_targets)*100:.1f}%")

RNA targets in FASTA: 13
Combined sequence targets: 5744
Overlap (targets in both): 0
Missing in FASTA: 5744
Coverage: 0.0%


In [20]:
# Check unique target_id counts in combined sequences
target_counts = combined_sequences['target_id'].value_counts()
print(f"Total unique target_ids: {len(target_counts)}")
print(f"Target_ids appearing more than once: {(target_counts > 1).sum()}")
print(f"\nFirst 10 target_ids and their counts:")
print(target_counts.head(10))

print(f"\nSample target_ids from combined_sequences:")
print(combined_sequences['target_id'].head(10).tolist())

print(f"\nSample target_ids from FASTA RNA headers:")
sample_fasta_targets = list(fasta_rna_targets)[:10]
print(sample_fasta_targets)

Total unique target_ids: 5744
Target_ids appearing more than once: 0

First 10 target_ids and their counts:
target_id
4TNA    1
7NBU    1
7ECL    1
7ECK    1
7ECJ    1
7QWS    1
7QWR    1
7QV3    1
7QV2    1
7QV1    1
Name: count, dtype: int64

Sample target_ids from combined_sequences:
['4TNA', '6TNA', '1TRA', '1TN2', '1TN1', '2TRA', '3TRA', '4TRA', '1RNA', '1ELH']

Sample target_ids from FASTA RNA headers:
['9BDN_MRNA', '1RNA_B', '7TOR_MRNA', '7OSA_mRNA', '7AZO_MRNA', '7OSM_mRNA', '7AZS_TRNA', '9BDP_MRNA', '7TOQ_MRNA', '7AZO_TRNA']


In [21]:
# Compare the naming patterns more clearly
print("Combined sequences target_id pattern:")
print("Format appears to be: [4-char PDB]_[1-2 char chain]")
for target in combined_sequences['target_id'].head(5):
    print(f"  {target}")

print(f"\nFASTA RNA target_id pattern:")
print("Format appears to be: [4-char PDB]_[1-2 char chain]")
for target in list(fasta_rna_targets)[:5]:
    print(f"  {target}")

# Check if there's any case sensitivity or format difference
combined_lower = set(t.lower() for t in combined_sequences['target_id'])
fasta_lower = set(t.lower() for t in fasta_rna_targets)
overlap_lower = combined_lower.intersection(fasta_lower)

print(f"\nCase-insensitive check:")
print(f"Overlap when ignoring case: {len(overlap_lower)}")

# Check PDB codes only (without chain)
combined_pdbs = set(t.split('_')[0] for t in combined_sequences['target_id'])
fasta_pdbs = set(t.split('_')[0] for t in fasta_rna_targets)
pdb_overlap = combined_pdbs.intersection(fasta_pdbs)

print(f"\nPDB code overlap (ignoring chains):")
print(f"Combined PDB codes: {len(combined_pdbs)}")
print(f"FASTA PDB codes: {len(fasta_pdbs)}")
print(f"PDB overlap: {len(pdb_overlap)}")

Combined sequences target_id pattern:
Format appears to be: [4-char PDB]_[1-2 char chain]
  4TNA
  6TNA
  1TRA
  1TN2
  1TN1

FASTA RNA target_id pattern:
Format appears to be: [4-char PDB]_[1-2 char chain]
  9BDN_MRNA
  1RNA_B
  7TOR_MRNA
  7OSA_mRNA
  7AZO_MRNA

Case-insensitive check:
Overlap when ignoring case: 0

PDB code overlap (ignoring chains):
Combined PDB codes: 5744
FASTA PDB codes: 10
PDB overlap: 5


In [22]:
# The case-insensitive overlap suggests the issue is case sensitivity
# Let's examine this more closely

print("Case comparison examples:")
combined_sample = list(combined_sequences['target_id'])[:5]
fasta_sample = list(fasta_rna_targets)[:5]

for target in combined_sample:
    print(f"Combined: {target} -> lowercase: {target.lower()}")

for target in fasta_sample:
    print(f"FASTA: {target} -> lowercase: {target.lower()}")

# Check if FASTA uses lowercase PDB codes
print(f"\nPDB code case analysis:")
combined_pdb_sample = [t.split('_')[0] for t in combined_sample]
fasta_pdb_sample = [t.split('_')[0] for t in fasta_sample]

print("Combined PDB codes:", combined_pdb_sample)
print("FASTA PDB codes:", fasta_pdb_sample)

# Check PDB overlap with case-insensitive comparison
combined_pdbs_lower = set(t.split('_')[0].lower() for t in combined_sequences['target_id'])
fasta_pdbs_lower = set(t.split('_')[0].lower() for t in fasta_rna_targets)
pdb_overlap_lower = combined_pdbs_lower.intersection(fasta_pdbs_lower)

print(f"\nCase-insensitive PDB overlap: {len(pdb_overlap_lower)}")
print(f"This explains the discrepancy - FASTA uses lowercase PDB codes!")

Case comparison examples:
Combined: 4TNA -> lowercase: 4tna
Combined: 6TNA -> lowercase: 6tna
Combined: 1TRA -> lowercase: 1tra
Combined: 1TN2 -> lowercase: 1tn2
Combined: 1TN1 -> lowercase: 1tn1
FASTA: 9BDN_MRNA -> lowercase: 9bdn_mrna
FASTA: 1RNA_B -> lowercase: 1rna_b
FASTA: 7TOR_MRNA -> lowercase: 7tor_mrna
FASTA: 7OSA_mRNA -> lowercase: 7osa_mrna
FASTA: 7AZO_MRNA -> lowercase: 7azo_mrna

PDB code case analysis:
Combined PDB codes: ['4TNA', '6TNA', '1TRA', '1TN2', '1TN1']
FASTA PDB codes: ['9BDN', '1RNA', '7TOR', '7OSA', '7AZO']

Case-insensitive PDB overlap: 5
This explains the discrepancy - FASTA uses lowercase PDB codes!


## `cif` files  
---

In [23]:
# !ls /kaggle/input/stanford-rna-3d-folding/PDB_RNA

In [24]:
# Extract RNA sequences and coordinates with deduplication
from Bio.PDB import MMCIFParser
import pandas as pd
from pathlib import Path

def extract_rna_data_from_cif(cif_file_path):
    """Extract unique RNA sequences and C1' coordinates from a CIF file"""
    parser = MMCIFParser(QUIET=True)
    
    try:
        structure = parser.get_structure('structure', cif_file_path)
        pdb_id = Path(cif_file_path).stem.upper()
        
        sequences_data = []
        coordinates_data = []
        seen_sequences = set()  # Track unique sequences
        
        for model in structure:
            for chain in model:
                chain_id = chain.id
                target_id = f"{pdb_id}_{chain_id}"
                
                # Check if chain contains RNA residues
                rna_residues = []
                for residue in chain:
                    if residue.get_resname() in ['A', 'U', 'G', 'C']:  # RNA nucleotides
                        rna_residues.append(residue)
                
                if rna_residues:  # Only process if RNA residues found
                    # Build sequence
                    sequence = ''.join([res.get_resname() for res in rna_residues])
                    
                    # Only add if sequence is unique
                    if sequence not in seen_sequences:
                        seen_sequences.add(sequence)
                        sequences_data.append({
                            'target_id': target_id,
                            'sequence': sequence
                        })
                        
                        # Extract C1' coordinates for this unique sequence
                        for i, residue in enumerate(rna_residues, 1):
                            if "C1'" in residue:
                                atom = residue["C1'"]
                                coordinates_data.append({
                                    'ID': f"{target_id}_{i}",
                                    'resname': residue.get_resname(),
                                    'resid': i,
                                    'x_1': atom.coord[0],
                                    'y_1': atom.coord[1], 
                                    'z_1': atom.coord[2]
                                })
        
        return sequences_data, coordinates_data
        
    except Exception as e:
        print(f"Error processing {cif_file_path}: {e}")
        return [], []

In [25]:
# Process all CIF files and save to CSV
import os
from tqdm import tqdm

def process_all_cif_files():
    """Process all CIF files in the directory and extract RNA data"""
    cif_dir = "/kaggle/input/stanford-rna-3d-folding-2/PDB_RNA"
    cif_files = [f for f in os.listdir(cif_dir) if f.endswith('.cif')]
    
    all_sequences = []
    all_coordinates = []
    
    print(f"Processing {len(cif_files)} CIF files...")
    
    for cif_file in tqdm(cif_files):
        cif_path = os.path.join(cif_dir, cif_file)
        sequences, coordinates = extract_rna_data_from_cif(cif_path)
        
        all_sequences.extend(sequences)
        all_coordinates.extend(coordinates)
    
    return all_sequences, all_coordinates

# Process all files
print("Starting full extraction...")
all_sequences, all_coordinates = process_all_cif_files()

print(f"\nFull extraction summary:")
print(f"Total unique RNA sequences: {len(all_sequences)}")
print(f"Total coordinate entries: {len(all_coordinates)}")

# Create DataFrames
sequences_df = pd.DataFrame(all_sequences)
coordinates_df = pd.DataFrame(all_coordinates)

# Save to CSV files
sequences_df.to_csv('rna_sequences.csv', index=False)
coordinates_df.to_csv('rna_coordinates.csv', index=False)

print(f"\nSaved files:")
print(f"rna_sequences.csv: {sequences_df.shape}")
print(f"rna_coordinates.csv: {coordinates_df.shape}")

print(f"\nFirst few entries:")
print(sequences_df.head())

Starting full extraction...
Processing 9564 CIF files...


100%|██████████| 9564/9564 [9:07:54<00:00,  3.44s/it]



Full extraction summary:
Total unique RNA sequences: 20601
Total coordinate entries: 10990376

Saved files:
rna_sequences.csv: (20601, 2)
rna_coordinates.csv: (10990376, 6)

First few entries:
  target_id                                           sequence
0    2D19_A                                  GCUGAAGUGCACACGGC
1   6OXI_QA  GUUGGAGAGUUUGAUCCUGGCUCAGGGUGAACGCUGGCGGCGUGCC...
2   6OXI_QV  CGCGGGGUGGAGCAGCCUGGUAGCUCGUCGGGCUCAUAACCCGAAG...
3   6OXI_QX                                  CAAGGAGGUAAAAAUGU
4   6OXI_RA  AGAUGGUAAGGGCCCACGGUGGAUGCCUCGGCACCCGAGCCGAUGA...


In [26]:
# MSA     - 856
# MSA_v2  - 2534
# PDB_RNA - 8672