In [1]:
import time

import pandas as pd
import numpy as np

import random

from tqdm import tqdm

from scipy.spatial.transform import Rotation as R
from sklearn.preprocessing import normalize
from scipy.spatial import distance_matrix
import warnings
warnings.filterwarnings('ignore')

print("\nLoading data files...")

# Read the five competition CSVs in a single unpacking; the paths are listed
# in the same order as the names on the left-hand side.
train_seqs, valid_seqs, test_seqs, train_labels, valid_labels = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/stanford-rna-3d-folding-2/train_sequences.csv',
        '/kaggle/input/stanford-rna-3d-folding-2/validation_sequences.csv',
        '/kaggle/input/stanford-rna-3d-folding-2/test_sequences.csv',
        '/kaggle/input/stanford-rna-3d-folding-2/train_labels.csv',
        '/kaggle/input/stanford-rna-3d-folding-2/validation_labels.csv',
    )
)

# Report how many sequence rows each split contains.
n_train, n_valid, n_test = len(train_seqs), len(valid_seqs), len(test_seqs)
print(f"Loaded {n_train} training sequences, {n_valid} validation sequences, and {n_test} test sequences")


Loading data files...
Loaded 5716 training sequences, 28 validation sequences, and 28 test sequences


In [2]:
# (rows, columns) of the competition training sequence table.
train_seqs.shape

(5716, 8)

In [3]:
# Preview the first rows of the competition training sequences.
train_seqs.head()

Unnamed: 0,target_id,sequence,temporal_cutoff,description,stoichiometry,all_sequences,ligand_ids,ligand_SMILES
0,4TNA,GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUCUGGAGG...,1978-04-12,FURTHER REFINEMENT OF THE STRUCTURE OF YEAST T...,A:1,>4TNA_1|Chain A[auth A]|TRNAPHE|\nGCGGAUUUAGCU...,MG,[Mg+2]
1,6TNA,GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUCUGGAGG...,1979-01-16,CRYSTAL STRUCTURE OF YEAST PHENYLALANINE T-RNA...,A:1,>6TNA_1|Chain A[auth A]|TRNAPHE|\nGCGGAUUUAGCU...,MG,[Mg+2]
2,1TRA,GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUCUGGAGG...,1986-07-14,RESTRAINED REFINEMENT OF THE MONOCLINIC FORM O...,A:1,>1TRA_1|Chain A[auth A]|TRNAPHE|\nGCGGAUUUAGCU...,MG,[Mg+2]
3,1TN2,GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUCUGGAGG...,1986-10-24,CRYSTALLOGRAPHIC AND BIOCHEMICAL INVESTIGATION...,A:1,>1TN2_1|Chain A[auth A]|TRNAPHE|\nGCGGAUUUAGCU...,MG;PB;SPM,[Mg+2];[Pb+2];C(CCNCCCN)CNCCCN
4,1TN1,GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUCUGGAGG...,1987-01-15,CRYSTALLOGRAPHIC AND BIOCHEMICAL INVESTIGATION...,A:1,>1TN1_1|Chain A[auth A]|TRNAPHE|\nGCGGAUUUAGCU...,MG;PB;SPM,[Mg+2];[Pb+2];C(CCNCCCN)CNCCCN


In [4]:
# Load the sequence and label tables from the "extended-rna" input dataset.
# NOTE: the files on disk carry a "_v2" suffix while the notebook variables
# are suffixed "_v1"; the variable names are what later cells refer to.
train_seqs_v1, train_labels_v1 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/extended-rna/train_sequences_v2.csv',
        '/kaggle/input/extended-rna/train_labels_v2.csv',
    )
)

In [5]:
# (rows, columns) of the extended-rna sequence table.
train_seqs_v1.shape

(5135, 5)

In [6]:
# Preview the first rows of the extended-rna sequence table.
train_seqs_v1.head()

Unnamed: 0,target_id,sequence,temporal_cutoff,description,all_sequences
0,7TAX_M,CUAAGAAAUUCACGGCGGGCUUGAUGUCCGCGUCUACCUGAUUCAC...,2022-09-21,Cryo-EM structure of the Csy-AcrIF24-promoter ...,>7TAX_1|Chain A|CRISPR-associated protein Csy1...
1,4WF1_CA,AAUUGAAGAGUUUGAUCAUGGCUCAGAUUGAACGCUGGCGGCAGGC...,2014-11-05,Crystal structure of the E. coli ribosome boun...,">4WF1_1|Chains A[auth AA], BB[auth CA]|16S rRN..."
2,8PVA_b,UGCCUGGCGGCCGUAGCGCGGUGGUCCCACCUGACCCCAUGCCGAA...,2023-11-29,Structure of bacterial ribosome determined by ...,>8PVA_1|Chain A|16S rRNA|Escherichia coli (562...
3,8OVE_BB,CAACUGCAGACCGUACUCAUCACCGCAUCAGGUCCCCAAGCAUCGA...,2023-11-29,CRYO-EM STRUCTURE OF TRYPANOSOMA BRUCEI PROCYC...,>8OVE_1|Chain A[auth AA]|SSU rRNA|Trypanosoma ...
4,8JDL_w,UACCUGGUUGAUCCUGCCAGUAGCAUUGCUUGCCAAAGAUUAAGCC...,2023-12-06,Structure of the Human cytoplasmic Ribosome wi...,>8JDL_1|Chain A|mRNA|Homo sapiens (9606)\nUUAU...


In [7]:
# Load the sequence and coordinate tables from the "rna-cif-to-csv" input
# dataset; the coordinates file plays the role of the label table here.
train_seqs_v2, train_labels_v2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/rna-cif-to-csv/rna_sequences.csv',
        '/kaggle/input/rna-cif-to-csv/rna_coordinates.csv',
    )
)

In [8]:
# (rows, columns) of the rna-cif-to-csv sequence table.
train_seqs_v2.shape

(18881, 2)

In [9]:
# Preview the first rows of the rna-cif-to-csv sequence table.
train_seqs_v2.head()

Unnamed: 0,target_id,sequence
0,2D19_A,GCUGAAGUGCACACGGC
1,6OXI_QA,GUUGGAGAGUUUGAUCCUGGCUCAGGGUGAACGCUGGCGGCGUGCC...
2,6OXI_QV,CGCGGGGUGGAGCAGCCUGGUAGCUCGUCGGGCUCAUAACCCGAAG...
3,6OXI_QX,CAAGGAGGUAAAAAUGU
4,6OXI_RA,AGAUGGUAAGGGCCCACGGUGGAUGCCUCGGCACCCGAGCCGAUGA...


In [10]:
# Check target_id relationships between the three datasets.
# pandas is already imported as `pd` in the first cell, so it is not
# re-imported here; keeping every import in one place makes the notebook's
# dependencies visible at a glance.

# Extract the distinct target_ids of each sequence table as a set so that the
# following cells can compare the sources with set algebra.
train_seqs_ids = set(train_seqs['target_id'].unique())
train_seqs_v1_ids = set(train_seqs_v1['target_id'].unique())
train_seqs_v2_ids = set(train_seqs_v2['target_id'].unique())

print("Dataset sizes:")
print(f"train_seqs unique target_ids: {len(train_seqs_ids)}")
print(f"train_seqs_v1 unique target_ids: {len(train_seqs_v1_ids)}")
print(f"train_seqs_v2 unique target_ids: {len(train_seqs_v2_ids)}")

Dataset sizes:
train_seqs unique target_ids: 5716
train_seqs_v1 unique target_ids: 5135
train_seqs_v2 unique target_ids: 18881


In [11]:
# For each source, keep the target_ids that appear in neither of the other two.
unique_to_train_seqs = train_seqs_ids.difference(train_seqs_v1_ids, train_seqs_v2_ids)
unique_to_train_seqs_v1 = train_seqs_v1_ids.difference(train_seqs_ids, train_seqs_v2_ids)
unique_to_train_seqs_v2 = train_seqs_v2_ids.difference(train_seqs_ids, train_seqs_v1_ids)

print("Target IDs unique to each dataset:")
print(f"Unique to train_seqs only: {len(unique_to_train_seqs)}")
print(f"Unique to train_seqs_v1 only: {len(unique_to_train_seqs_v1)}")
print(f"Unique to train_seqs_v2 only: {len(unique_to_train_seqs_v2)}")

# Size of the union of all three ID sets, i.e. the number of distinct targets
# available once the sources are pooled.
pooled_target_ids = train_seqs_ids.union(train_seqs_v1_ids, train_seqs_v2_ids)
print(f"\nTotal unique across all datasets: {len(pooled_target_ids)}")

Target IDs unique to each dataset:
Unique to train_seqs only: 5716
Unique to train_seqs_v1 only: 463
Unique to train_seqs_v2 only: 14209

Total unique across all datasets: 25060


In [12]:
# Pool the three sequence tables into one frame. The list order sets the
# priority: when a target_id occurs in several sources, the row from the
# earliest source in the list is the one that survives.
all_datasets = pd.concat(
    [train_seqs, train_seqs_v1, train_seqs_v2],
    ignore_index=True,
)

# Boolean mask that is True on the first occurrence of every target_id.
is_first_occurrence = ~all_datasets.duplicated(subset='target_id', keep='first')
merged_train_seqs = all_datasets[is_first_occurrence]

print(f"Original combined rows: {len(all_datasets)}")
print(f"After removing duplicates: {len(merged_train_seqs)}")
print(f"Unique target_ids in merged dataset: {merged_train_seqs['target_id'].nunique()}")

Original combined rows: 29732
After removing duplicates: 25060
Unique target_ids in merged dataset: 25060


In [13]:
# (rows, columns) of the pooled, de-duplicated sequence table.
merged_train_seqs.shape

(25060, 8)

In [14]:
# Preview the first rows of the pooled, de-duplicated sequence table.
merged_train_seqs.head()

Unnamed: 0,target_id,sequence,temporal_cutoff,description,stoichiometry,all_sequences,ligand_ids,ligand_SMILES
0,4TNA,GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUCUGGAGG...,1978-04-12,FURTHER REFINEMENT OF THE STRUCTURE OF YEAST T...,A:1,>4TNA_1|Chain A[auth A]|TRNAPHE|\nGCGGAUUUAGCU...,MG,[Mg+2]
1,6TNA,GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUCUGGAGG...,1979-01-16,CRYSTAL STRUCTURE OF YEAST PHENYLALANINE T-RNA...,A:1,>6TNA_1|Chain A[auth A]|TRNAPHE|\nGCGGAUUUAGCU...,MG,[Mg+2]
2,1TRA,GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUCUGGAGG...,1986-07-14,RESTRAINED REFINEMENT OF THE MONOCLINIC FORM O...,A:1,>1TRA_1|Chain A[auth A]|TRNAPHE|\nGCGGAUUUAGCU...,MG,[Mg+2]
3,1TN2,GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUCUGGAGG...,1986-10-24,CRYSTALLOGRAPHIC AND BIOCHEMICAL INVESTIGATION...,A:1,>1TN2_1|Chain A[auth A]|TRNAPHE|\nGCGGAUUUAGCU...,MG;PB;SPM,[Mg+2];[Pb+2];C(CCNCCCN)CNCCCN
4,1TN1,GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUCUGGAGG...,1987-01-15,CRYSTALLOGRAPHIC AND BIOCHEMICAL INVESTIGATION...,A:1,>1TN1_1|Chain A[auth A]|TRNAPHE|\nGCGGAUUUAGCU...,MG;PB;SPM,[Mg+2];[Pb+2];C(CCNCCCN)CNCCCN


In [15]:
# Reduce the pooled table to the two columns every source provides
# (target_id and sequence), mirroring the layout of train_seqs_v2.
# The explicit copy detaches the result from merged_train_seqs.
shared_columns = ['target_id', 'sequence']
merged_seqs_final = merged_train_seqs.loc[:, shared_columns].copy()

print("Final merged sequences dataset:")
print(f"Shape: {merged_seqs_final.shape}")
print(f"Columns: {list(merged_seqs_final.columns)}")
print("\nFirst few rows:")
print(merged_seqs_final.head())

Final merged sequences dataset:
Shape: (25060, 2)
Columns: ['target_id', 'sequence']

First few rows:
  target_id                                           sequence
0      4TNA  GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUCUGGAGG...
1      6TNA  GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUCUGGAGG...
2      1TRA  GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUCUGGAGG...
3      1TN2  GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUCUGGAGG...
4      1TN1  GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUCUGGAGG...


In [16]:
# Sanity check: every target_id must appear exactly once in the merged table.
# The mask marks second and later occurrences of a target_id.
repeated_id_mask = merged_seqs_final['target_id'].duplicated()

print("Duplicate check on final merged dataset:")
print(f"Total rows: {len(merged_seqs_final)}")
print(f"Unique target_ids: {merged_seqs_final['target_id'].nunique()}")
print(f"Duplicate target_ids: {repeated_id_mask.sum()}")

if not repeated_id_mask.any():
    print("\n✓ No duplicate target_ids found - dataset is clean!")
else:
    # Show every row involved in a collision (keep=False flags all members
    # of a duplicated group), sorted so that colliding rows sit together.
    print("\nDuplicate target_ids found:")
    duplicates = merged_seqs_final[merged_seqs_final['target_id'].duplicated(keep=False)]
    print(duplicates.sort_values('target_id'))

Duplicate check on final merged dataset:
Total rows: 25060
Unique target_ids: 25060
Duplicate target_ids: 0

✓ No duplicate target_ids found - dataset is clean!


In [17]:
# Preview the first rows of the competition validation sequences.
valid_seqs.head()

Unnamed: 0,target_id,sequence,temporal_cutoff,description,stoichiometry,all_sequences,ligand_ids,ligand_SMILES
0,8ZNQ,ACCGUGACGGGCCUUUUGGCUAUACGCGGU,2025-06-04,Solution structure of the complex of naphthyri...,A:1,>8ZNQ_1|Chain A[auth A]|RNA (30-MER)|\nACCGUGA...,NAZ,Cc1ccc2ccc(nc2n1)NC(=O)CCNCCC(=O)NCc3ccc4c(n3)...
1,9IWF,GGUGUAUAAGCUCAUUAAUACGGUUUGAGCGUUUCGACCAGGCAAC...,2025-06-04,crystal structure of P. beijingensis xanthine-...,A:1,>9IWF_1|Chain A[auth A]|P. beijingensis xanthi...,GTP;MG;XAN,c1nc2c(n1[C@H]3[C@@H]([C@@H]([C@H](O3)CO[P@](=...
2,9JGM,GGAAGGGGAGUAACUUCAUUGCCGGUCGAUCGUCAUUACGAUGUGU...,2025-06-04,The Escherichia coli yybp riboswitch as a tand...,C:2,">9JGM_1|Chains A[auth C], C[auth D]|yybP ribos...",MG;MN,[Mg+2];[Mn+2]
3,9MME,UAUUUGAAUCAUACCUGCGAUCAACUCGAUGAAUAAAGUACGCCAG...,2025-06-04,ROOLfirm-octamer-wild type,U:8,">9MME_1|Chains A[auth U], B[auth Y], C[auth c]...",K;MG,[K+];[Mg+2]
4,9J09,CUUUUUGACGAAAAACUCGCCUCAGAAGAUAGGGAGAGUCUAAACG...,2025-06-04,Cryo-EM structure of the RdCas12n-sgRNA-DNA co...,R:1,>9J09_4|Chain D[auth R]|sgRNA|\nCUUUUUGACGAAAA...,,


In [18]:
# Compare the validation target_ids with those already in the merged table.
valid_seqs_ids = set(valid_seqs['target_id'].unique())
merged_seqs_ids = set(merged_seqs_final['target_id'].unique())

print("Target ID overlap analysis:")
print(f"valid_seqs unique target_ids: {len(valid_seqs_ids)}")
print(f"merged_seqs_final unique target_ids: {len(merged_seqs_ids)}")

# IDs present in both tables.
overlap_ids = valid_seqs_ids.intersection(merged_seqs_ids)
print(f"Overlapping target_ids: {len(overlap_ids)}")

# IDs present in exactly one of the two tables.
unique_to_valid = valid_seqs_ids.difference(merged_seqs_ids)
unique_to_merged = merged_seqs_ids.difference(valid_seqs_ids)

print(f"Unique to valid_seqs only: {len(unique_to_valid)}")
print(f"Unique to merged_seqs_final only: {len(unique_to_merged)}")

# Only list examples when there is something to show.
if overlap_ids:
    print(f"\nFirst few overlapping target_ids: {list(overlap_ids)[:5]}")

Target ID overlap analysis:
valid_seqs unique target_ids: 28
merged_seqs_final unique target_ids: 25060
Overlapping target_ids: 0
Unique to valid_seqs only: 28
Unique to merged_seqs_final only: 25060


In [19]:
# Merge valid_seqs with merged_seqs_final.
# First, select only target_id and sequence columns from valid_seqs so its
# layout matches merged_seqs_final.
valid_seqs_subset = valid_seqs[['target_id', 'sequence']].copy()

# Concatenate with merged_seqs_final and de-duplicate on target_id.
# The overlap analysis only *reports* shared IDs; without the explicit
# drop_duplicates any validation target that also exists in the training pool
# would be exported twice. keep='first' gives the training-pool row priority,
# which matches the source order used when the label tables are merged
# (validation labels are listed last there as well).
final_complete_seqs = (
    pd.concat([merged_seqs_final, valid_seqs_subset], ignore_index=True)
    .drop_duplicates(subset='target_id', keep='first')
    .reset_index(drop=True)
)

print("Final complete dataset after adding valid_seqs:")
print(f"Total rows: {len(final_complete_seqs)}")
print(f"Unique target_ids: {final_complete_seqs['target_id'].nunique()}")
print(f"Shape: {final_complete_seqs.shape}")

# Verify no duplicates exist (guaranteed by the drop_duplicates above; kept
# as a visible confirmation in the notebook output).
duplicates = final_complete_seqs['target_id'].duplicated().sum()
print(f"Duplicate target_ids: {duplicates}")

if duplicates == 0:
    print("✓ All datasets successfully merged without duplicates!")

Final complete dataset after adding valid_seqs:
Total rows: 25088
Unique target_ids: 25088
Shape: (25088, 2)
Duplicate target_ids: 0
✓ All datasets successfully merged without duplicates!


### Labels
---

In [20]:
# (rows, columns) of the competition training label table.
train_labels.shape

(7794971, 8)

In [21]:
# Preview the first rows of the competition training labels.
train_labels.head()

Unnamed: 0,ID,resname,resid,x_1,y_1,z_1,chain,copy
0,157D_1,C,1,4.843,-5.64,13.265,A,1
1,157D_2,G,2,3.385,-7.613,8.267,A,1
2,157D_3,C,3,2.158,-6.751,2.949,A,1
3,157D_4,G,4,2.669,-4.843,-1.773,A,1
4,157D_5,A,5,3.509,0.239,-4.045,A,1


In [22]:
# (rows, columns) of the extended-rna label table.
train_labels_v1.shape

(3677095, 6)

In [23]:
# Preview the first rows of the extended-rna label table.
train_labels_v1.head()

Unnamed: 0,ID,resname,resid,x_1,y_1,z_1
0,7TAX_M_1,C,1,187.126007,148.246002,210.417999
1,7TAX_M_2,U,2,185.255997,152.968002,204.617996
2,7TAX_M_3,A,3,189.360992,161.802002,205.214996
3,7TAX_M_4,A,4,186.0,156.595993,209.951996
4,7TAX_M_5,G,5,181.947998,158.186996,213.610992


In [24]:
# (rows, columns) of the rna-cif-to-csv coordinate table.
train_labels_v2.shape

(10135546, 6)

In [25]:
# Preview the first rows of the rna-cif-to-csv coordinate table.
train_labels_v2.head()

Unnamed: 0,ID,resname,resid,x_1,y_1,z_1
0,2D19_A_1,G,1,21.51,1.496,-7.581
1,2D19_A_2,C,2,18.701,-3.119,-7.327
2,2D19_A_3,U,3,15.436,-5.959,-4.605
3,2D19_A_4,G,4,11.689,-7.605,-0.303
4,2D19_A_5,A,5,5.803,-4.665,-0.031


In [26]:
# (rows, columns) of the competition validation label table.
valid_labels.shape

(9762, 126)

In [27]:
# Preview the first rows of the competition validation labels.
valid_labels.head()

Unnamed: 0,ID,resname,resid,x_1,y_1,z_1,x_2,y_2,z_2,x_3,...,z_38,x_39,y_39,z_39,x_40,y_40,z_40,chain,copy,Usage
0,8ZNQ_1,A,1,-2.054,-15.062,20.736,-1e+18,-1e+18,-1e+18,-1e+18,...,-1e+18,-1e+18,-1e+18,-1e+18,-1e+18,-1e+18,-1e+18,A,1,Public
1,8ZNQ_2,C,2,-1.971,-15.076,15.338,-1e+18,-1e+18,-1e+18,-1e+18,...,-1e+18,-1e+18,-1e+18,-1e+18,-1e+18,-1e+18,-1e+18,A,1,Public
2,8ZNQ_3,C,3,-3.35,-13.497,10.444,-1e+18,-1e+18,-1e+18,-1e+18,...,-1e+18,-1e+18,-1e+18,-1e+18,-1e+18,-1e+18,-1e+18,A,1,Public
3,8ZNQ_4,G,4,-5.443,-11.044,6.605,-1e+18,-1e+18,-1e+18,-1e+18,...,-1e+18,-1e+18,-1e+18,-1e+18,-1e+18,-1e+18,-1e+18,A,1,Public
4,8ZNQ_5,U,5,-6.35,-6.269,4.55,-1e+18,-1e+18,-1e+18,-1e+18,...,-1e+18,-1e+18,-1e+18,-1e+18,-1e+18,-1e+18,-1e+18,A,1,Public


In [28]:
# Bring valid_labels into the six-column layout shared by the other label
# tables: the residue identifier columns plus the first coordinate triple.
# NOTE: valid_labels also has `chain` and `copy` columns (as train_labels
# does); they are not selected here, so those two fields will be empty for
# validation rows once the label tables are concatenated.
# NOTE(review): the preview of valid_labels shows -1e+18 as a placeholder in
# the x_2.. columns; confirm x_1/y_1/z_1 never carry that placeholder.
core_label_columns = ['ID', 'resname', 'resid', 'x_1', 'y_1', 'z_1']
valid_labels_subset = valid_labels.loc[:, core_label_columns].copy()

print("Standardized valid_labels:")
print(f"Shape: {valid_labels_subset.shape}")
print(f"Columns: {list(valid_labels_subset.columns)}")
print("\nFirst few rows:")
print(valid_labels_subset.head())

Standardized valid_labels:
Shape: (9762, 6)
Columns: ['ID', 'resname', 'resid', 'x_1', 'y_1', 'z_1']

First few rows:
       ID resname  resid    x_1     y_1     z_1
0  8ZNQ_1       A      1 -2.054 -15.062  20.736
1  8ZNQ_2       C      2 -1.971 -15.076  15.338
2  8ZNQ_3       C      3 -3.350 -13.497  10.444
3  8ZNQ_4       G      4 -5.443 -11.044   6.605
4  8ZNQ_5       U      5 -6.350  -6.269   4.550


In [29]:
# Merge all label datasets into one without duplicates.
#
# De-duplication is done per *target*, not per row. The sequence tables are
# merged by keeping, for each target_id, the row from the first source that
# contains it. Dropping duplicate label rows by ID alone does not mirror that:
# if two sources hold the same target with a different number of residues, the
# row-level result would take the shared residues from the first source and
# the extra residues from the later one, producing a coordinate set that
# matches neither source nor the retained sequence.


def _rows_from_first_source(label_frames):
    """Mark the label rows that belong to the first source holding their target.

    Parameters
    ----------
    label_frames : list of pandas.DataFrame
        Label tables in priority order (highest priority first). Each must
        have an ``ID`` column.

    Returns
    -------
    pandas.Series
        Boolean Series with a fresh 0..n-1 index, one entry per row of the
        frames concatenated in the given order. True means the row's target
        was not present in any earlier frame.

    Notes
    -----
    The target key is the ID with its last ``_``-separated field removed.
    This assumes IDs look like ``<target_id>_<residue index>`` as in the
    previews of the label tables (e.g. ``157D_1``, ``7TAX_M_1``) --
    TODO confirm for every source.
    """
    claimed_targets = set()
    masks = []
    for frame in label_frames:
        # Strip the trailing "_<field>" to recover the target key of each row.
        target_of_row = frame['ID'].str.replace(r'_[^_]*$', '', regex=True)
        is_new_target = ~target_of_row.isin(claimed_targets)
        masks.append(is_new_target)
        # Targets accepted from this source shadow the same targets in all
        # later (lower-priority) sources.
        claimed_targets.update(target_of_row[is_new_target].unique())
    return pd.concat(masks, ignore_index=True)


# Priority order: competition labels, then the two extra sources, then the
# validation labels.
label_sources = [train_labels, train_labels_v1, train_labels_v2, valid_labels_subset]

# Concatenate all label datasets (columns missing from a source are left
# empty for that source's rows).
all_labels = pd.concat(label_sources, ignore_index=True)

print(f"Original combined label rows: {len(all_labels)}")

# Keep each target's rows only from its highest-priority source, then remove
# any ID that is still repeated (keeping the first occurrence).
keep_rows = _rows_from_first_source(label_sources)
merged_labels_final = all_labels[keep_rows.to_numpy()].drop_duplicates(subset='ID', keep='first')

print(f"After removing duplicates: {len(merged_labels_final)}")
print(f"Unique IDs in merged dataset: {merged_labels_final['ID'].nunique()}")
print(f"Final shape: {merged_labels_final.shape}")

# Quick duplicate check
duplicates = merged_labels_final['ID'].duplicated().sum()
print(f"Duplicate IDs: {duplicates}")

if duplicates == 0:
    print("✓ All label datasets successfully merged without duplicates!")

Original combined label rows: 21617374
After removing duplicates: 18078695
Unique IDs in merged dataset: 18078695
Final shape: (18078695, 8)
Duplicate IDs: 0
✓ All label datasets successfully merged without duplicates!


In [30]:
# Write both merged tables as CSV files in the current working directory.
# index=False leaves the DataFrame index out of the files.
final_complete_seqs.to_csv('merged_sequences_final.csv', index=False)
print(f"✓ Exported merged sequences: {final_complete_seqs.shape}")

merged_labels_final.to_csv('merged_labels_final.csv', index=False)
print(f"✓ Exported merged labels: {merged_labels_final.shape}")

# Human-readable recap of what was written.
seq_rows, seq_cols = final_complete_seqs.shape
label_rows, label_cols = merged_labels_final.shape

print("\nExport summary:")
print(f"- merged_sequences_final.csv: {seq_rows:,} rows, {seq_cols} columns")
print(f"- merged_labels_final.csv: {label_rows:,} rows, {label_cols} columns")

✓ Exported merged sequences: (25088, 2)
✓ Exported merged labels: (18078695, 8)

Export summary:
- merged_sequences_final.csv: 25,088 rows, 2 columns
- merged_labels_final.csv: 18,078,695 rows, 8 columns
