<a href="https://colab.research.google.com/github/Rohanrathod7/Kaggle_Notebooks/blob/main/RNA_2/Experiment_Notebook/Raw_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Task
Develop a machine learning model to predict the 3D structures (C1' atom coordinates) of RNA molecules for the Stanford RNA 3D Folding Part 2 competition, ensuring the output is a `submission.csv` file with five predictions per sequence.

## Download Competition Data

### Subtask:
Download all the necessary data for the Stanford RNA 3D Folding Part 2 competition from Kaggle. This step should be completed before disabling internet access, as the submission environment will not allow external downloads.


**Reasoning**:
The first step is to install the Kaggle API client using pip.



In [None]:
pip install kaggle



In [None]:
import os
import subprocess
import polars as pl
from pathlib import Path

# --- CONFIGURATION ---
# Kaggle competition slug and the local folders that receive the downloads.
COMPETITION_NAME = "stanford-rna-3d-folding-2"
DATA_DIR = Path("/content/data")
PDB_DIR = DATA_DIR / "pdb_structures"

# Make sure both folders exist; folders that are already there are left as-is.
for _folder in (DATA_DIR, PDB_DIR):
    _folder.mkdir(parents=True, exist_ok=True)

def setup_kaggle_api(key_source="/content/kaggle.json", config_dir="/root/.kaggle"):
    """Copy the Kaggle API token into the Kaggle config directory.

    Copies ``key_source`` to ``<config_dir>/kaggle.json`` and restricts the
    copy to owner read/write (mode 600). Nothing is copied when the
    destination file already exists.

    Args:
        key_source: Path of the uploaded ``kaggle.json`` token.
        config_dir: Directory that receives the token.

    Raises:
        FileNotFoundError: If the token is not installed yet, ``key_source``
            does not exist, and the KAGGLE_USERNAME / KAGGLE_KEY environment
            variables are not both set.
    """
    import shutil

    source = Path(key_source)
    target_dir = Path(config_dir)
    target = target_dir / "kaggle.json"

    if target.exists():
        print("‚úÖ Kaggle API already configured.")
        return

    print("‚öôÔ∏è Setting up Kaggle API...")

    if not source.is_file():
        # The Kaggle client also accepts credentials from the environment, so
        # a missing token file is only fatal when those are absent as well.
        if os.environ.get("KAGGLE_USERNAME") and os.environ.get("KAGGLE_KEY"):
            print("Using KAGGLE_USERNAME / KAGGLE_KEY from the environment.")
            return
        # Fail here with an actionable message instead of letting the first
        # `kaggle` command fail later with an opaque non-zero exit status.
        raise FileNotFoundError(
            f"Kaggle API token not found at {source}. Upload kaggle.json there "
            "or set the KAGGLE_USERNAME and KAGGLE_KEY environment variables."
        )

    target_dir.mkdir(parents=True, exist_ok=True)
    shutil.copyfile(source, target)
    # Owner-only permissions, same as the former `chmod 600`.
    target.chmod(0o600)

def download_core_csvs():
    """Download the competition's core CSV files into ``DATA_DIR``.

    Files already present in ``DATA_DIR`` are skipped. Each missing file is
    fetched with the Kaggle CLI. When the CLI leaves a ``<file>.zip`` archive
    behind, it is extracted into ``DATA_DIR`` and the archive is deleted only
    after extraction has succeeded.

    Raises:
        subprocess.CalledProcessError: If the Kaggle CLI exits non-zero.
        zipfile.BadZipFile: If a downloaded archive cannot be read.
    """
    import zipfile

    print("‚¨áÔ∏è Downloading Core CSVs...")

    # List of files we actually need to start
    files = ["train_sequences.csv", "train_labels.csv", "test_sequences.csv", "sample_submission.csv"]

    for file in files:
        if (DATA_DIR / file).exists():
            print(f"‚úÖ {file} already exists.")
            continue

        # Argument list instead of a shell string: no quoting problems and no
        # shell interpretation of the interpolated values.
        subprocess.run(
            [
                "kaggle", "competitions", "download",
                "-c", COMPETITION_NAME,
                "-f", file,
                "-p", str(DATA_DIR),
            ],
            check=True,
        )

        # The CLI may deliver the file either zipped or as-is.
        zip_path = DATA_DIR / (file + ".zip")
        if zip_path.exists():
            print(f"üì¶ Unzipping {file}...")
            with zipfile.ZipFile(zip_path) as archive:
                archive.extractall(DATA_DIR)
            # Reached only when extraction succeeded, so a corrupt archive is
            # kept for inspection instead of being silently removed.
            zip_path.unlink()

def load_data_efficiently():
    """Read the training sequences and labels from ``DATA_DIR`` with Polars.

    Returns:
        ``(train_seq, train_labels)`` as returned by ``pl.read_csv``, or
        ``(None, None)`` when anything fails while loading. In that case the
        error is printed rather than raised, so callers must check for None.
    """
    print("üöÄ Loading data into RAM with Polars...")

    try:
        # Sequences are read first, then labels (generator is consumed in order).
        sequences, labels = (
            pl.read_csv(DATA_DIR / csv_name)
            for csv_name in ("train_sequences.csv", "train_labels.csv")
        )

        print(f"üìä Training Sequences: {sequences.shape}")
        print(f"üìä Training Labels: {labels.shape}")
        return sequences, labels
    except Exception as e:
        print(f"‚ùå Error loading data: {e}")
        return None, None

def fetch_pdb_structure(sequence_id):
    """Return the local path where the PDB file for ``sequence_id`` is expected.

    Placeholder for a lazy loader: when the file is missing, this only prints
    a notice. No download is performed yet, so the returned path may not
    exist. Meant to be called from a Dataset ``__getitem__`` once the download
    logic has been added.

    Args:
        sequence_id: Identifier used as the stem of the ``.pdb`` file name.

    Returns:
        ``PDB_DIR / f"{sequence_id}.pdb"``.
    """
    target_path = PDB_DIR / f"{sequence_id}.pdb"

    if not target_path.exists():
        # TODO: add the real download logic here. The structures may have to
        # be fetched as a 'shard' zip or from a separate dataset rather than
        # as individual files.
        print(f"‚¨áÔ∏è Fetching structure for {sequence_id}...")

    return target_path

# --- EXECUTION ---
setup_kaggle_api()
download_core_csvs()
df_seq, df_labels = load_data_efficiently()

# Example: Inspect the first few rows.
# load_data_efficiently() returns (None, None) when loading fails, so guard
# the call to avoid an AttributeError on None.
if df_seq is not None:
    print(df_seq.head())

‚öôÔ∏è Setting up Kaggle API...
‚¨áÔ∏è Downloading Core CSVs...


CalledProcessError: Command 'kaggle competitions download -c stanford-rna-3d-folding-2 -f train_sequences.csv -p /content/data' returned non-zero exit status 1.

In [None]:
# 1. Connect to your Drive (Uses tiny data)
from google.colab import drive
drive.mount('/content/drive')

# 2. Copy Key from Drive (Internal transfer, instant & free)
!mkdir -p ~/.kaggle
!cp /content/drive/MyDrive/MyColabKeys/kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# 3. Download Data (Cloud-to-Cloud transfer. uses 0MB of your data)
!kaggle competitions download -c stanford-rna-3d-folding-2 -f train_sequences.csv
!unzip -q train_sequences.csv.zip

Mounted at /content/drive
Downloading train_sequences.csv to /content
  0% 0.00/36.3M [00:00<?, ?B/s]
100% 36.3M/36.3M [00:00<00:00, 1.07GB/s]
unzip:  cannot find or open train_sequences.csv.zip, train_sequences.csv.zip.zip or train_sequences.csv.zip.ZIP.


In [None]:
# ==========================================
# 🚀 FIX: Correct Competition Name + Part 2 Files
# ==========================================

import os
from google.colab import drive

# --- CONFIGURATION (UPDATED) ---
# Drive location of the API token and the competition slug for Part 2.
KEY_PATH = "/content/drive/MyDrive/MyColabKeys/kaggle.json"
COMPETITION_NAME = "stanford-rna-3d-folding-2"

# 1. Mount Drive (KEY_PATH lives under this mount point)
drive.mount("/content/drive")

def setup_kaggle():
    """Copy the Kaggle API token from KEY_PATH into ~/.kaggle and set it to mode 600.

    All three steps run as shell commands whose exit status is not checked
    here, so the final confirmation is printed regardless of their outcome.
    """
    print("üîë Setting up Kaggle API...")
    !mkdir -p ~/.kaggle
    !cp "{KEY_PATH}" ~/.kaggle/
    !chmod 600 ~/.kaggle/kaggle.json
    print("‚úÖ API Key configured.")

def download_essentials():
    """Download the listed competition files into the current working directory.

    A file that already exists in the working directory is skipped. After each
    download, a ``<file>.zip`` archive (if one was produced) is extracted and
    then removed.
    """
    print(f"‚¨áÔ∏è Downloading files for: {COMPETITION_NAME}...")

    # Files for Part 2 (Note: filenames might differ slightly, this downloads what exists)
    files_to_download = [
        "train_sequences.csv",
        "train_labels.csv",
        "validation_sequences.csv", # New in Part 2
        "validation_labels.csv",    # New in Part 2
        "test_sequences.csv",
        "sample_submission.csv"
    ]

    for file in files_to_download:
        if not os.path.exists(file):
            print(f"   Downloading {file} ...")
            # The trailing `|| echo ...` prints a warning when the kaggle command
            # exits non-zero (e.g. the file name does not exist for this competition).
            !kaggle competitions download -c {COMPETITION_NAME} -f {file} || echo "‚ö†Ô∏è Could not find {file}"

            # Only unzip when the download actually produced an archive.
            if os.path.exists(file + ".zip"):
                !unzip -q {file}.zip
                !rm {file}.zip
        else:
            print(f"   ‚úÖ {file} already exists.")

# --- RUN ---
# Configure the API key, download the files, then list the working directory.
# NOTE(review): the helpers do their work through `!` shell commands, and the
# recorded outputs in this notebook show a failing `!` command does not stop
# the cell. The success message below is therefore presumably printed even
# when a download failed. Verify the listing.
try:
    setup_kaggle()
    download_essentials()
    print("\nüéâ DONE! Check the 'Files' tab on the left.")
    !ls -lh
except Exception as e:
    print(f"\n‚ùå Error: {e}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
üîë Setting up Kaggle API...
‚úÖ API Key configured.
‚¨áÔ∏è Downloading files for: stanford-rna-3d-folding-2...
   ‚úÖ train_sequences.csv already exists.
   Downloading train_labels.csv ...
Downloading train_labels.csv to /content
 96% 305M/317M [00:06<00:00, 37.3MB/s]
100% 317M/317M [00:06<00:00, 54.3MB/s]
   Downloading validation_sequences.csv ...
Downloading validation_sequences.csv to /content
  0% 0.00/21.7k [00:00<?, ?B/s]
100% 21.7k/21.7k [00:00<00:00, 59.7MB/s]
   Downloading validation_labels.csv ...
Downloading validation_labels.csv to /content
  0% 0.00/8.09M [00:00<?, ?B/s]
100% 8.09M/8.09M [00:00<00:00, 798MB/s]
   Downloading test_sequences.csv ...
Downloading test_sequences.csv to /content
  0% 0.00/21.7k [00:00<?, ?B/s]
100% 21.7k/21.7k [00:00<00:00, 51.8MB/s]
   Downloading sample_submission.csv ...
Downloading sample_submission.csv to /co

In [None]:
# Force delete the potentially wrong file and re-download just that one
!rm train_sequences.csv
!kaggle competitions download -c stanford-rna-3d-folding-2 -f train_sequences.csv
!unzip -q train_sequences.csv.zip
!rm train_sequences.csv.zip
print("‚úÖ Now you are 100% sure you have the correct Part 2 sequences!")

Downloading train_sequences.csv to /content
  0% 0.00/36.3M [00:00<?, ?B/s]
100% 36.3M/36.3M [00:00<00:00, 802MB/s]
unzip:  cannot find or open train_sequences.csv.zip, train_sequences.csv.zip.zip or train_sequences.csv.zip.ZIP.
rm: cannot remove 'train_sequences.csv.zip': No such file or directory
‚úÖ Now you are 100% sure you have the correct Part 2 sequences!


In [None]:
import pandas as pd

# 1. Load the inputs (Sequences) and targets (Labels/Coordinates)
print("‚è≥ Loading data... (This might take 30 seconds)")
df_train = pd.read_csv('train_sequences.csv')
# NOTE(review): the recorded run emitted a warning on this read, presumably a
# mixed-type DtypeWarning from chunked parsing (confirm). low_memory=False
# makes pandas infer each column's dtype from the whole file at once; an
# explicit dtype= mapping would be the lower-memory alternative.
df_labels = pd.read_csv('train_labels.csv', low_memory=False)

# 2. Inspect the data
print(f"Training Sequences: {df_train.shape}")
print(f"Training Labels:    {df_labels.shape}")

print("\n--- First 2 Rows of Sequences ---")
display(df_train.head(2))

print("\n--- First 2 Rows of Labels (The 3D Coordinates you need to predict) ---")
display(df_labels.head(2))

‚è≥ Loading data... (This might take 30 seconds)


  df_labels = pd.read_csv('train_labels.csv')


Training Sequences: (5716, 8)
Training Labels:    (7794971, 8)

--- First 2 Rows of Sequences ---


Unnamed: 0,target_id,sequence,temporal_cutoff,description,stoichiometry,all_sequences,ligand_ids,ligand_SMILES
0,4TNA,GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUCUGGAGG...,1978-04-12,FURTHER REFINEMENT OF THE STRUCTURE OF YEAST T...,A:1,>4TNA_1|Chain A[auth A]|TRNAPHE|\nGCGGAUUUAGCU...,MG,[Mg+2]
1,6TNA,GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUCUGGAGG...,1979-01-16,CRYSTAL STRUCTURE OF YEAST PHENYLALANINE T-RNA...,A:1,>6TNA_1|Chain A[auth A]|TRNAPHE|\nGCGGAUUUAGCU...,MG,[Mg+2]



--- First 2 Rows of Labels (The 3D Coordinates you need to predict) ---


Unnamed: 0,ID,resname,resid,x_1,y_1,z_1,chain,copy
0,157D_1,C,1,4.843,-5.64,13.265,A,1
1,157D_2,G,2,3.385,-7.613,8.267,A,1
