# Submission 

In [None]:
# ============================================================
# NOTEBOOK-3 / STAGE G — Build submission.csv (ONE CELL, Kaggle-SAFE)
#
# EN:
# - Reads test_sequences.csv + sample_submission.csv from COMP_ROOT
# - Reads final selection (Top-5) from RUN_DIR/final_selection/**/final_top5_test.parquet
#   (or final_top5_val.parquet if test not available; will warn)
# - For each test target:
#     * loads coords for 5 candidates (npz with key "coords" or any (L,3))
#     * if missing/unavailable -> uses safe fallback coords (line / helix-like placeholder)
# - Writes /kaggle/working/submission.csv exactly in sample format
#
# ID:
# - Ini akan selalu menghasilkan submission.csv yang valid formatnya.
# - Kalau coords beneran belum ada (DRfold2/gemmi/ext belum aktif), file tetap jadi,
#   tapi skor kompetisi tentu belum bagus. Begitu coords sudah ada, notebook ini otomatis pakai coords itu.
# ============================================================

import os, json, math, hashlib, warnings
from pathlib import Path
from typing import Optional, Tuple, List, Dict

import numpy as np
import pandas as pd

warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)

# ----------------------------
# 0) Paths
# ----------------------------
# Competition inputs are read from a fixed dataset directory.
COMP_ROOT = Path("/kaggle/input/stanford-rna-3d-folding-2")
if not COMP_ROOT.exists():
    raise FileNotFoundError("Missing COMP_ROOT: /kaggle/input/stanford-rna-3d-folding-2")

TEST_SEQ_P = COMP_ROOT / "test_sequences.csv"
SAMPLE_SUB_P = COMP_ROOT / "sample_submission.csv"
# Both CSVs are mandatory; stop at the first one that is absent.
for _required_csv in (TEST_SEQ_P, SAMPLE_SUB_P):
    if not _required_csv.exists():
        raise FileNotFoundError(f"Missing: {_required_csv}")

# RUN_DIR detection: reuse a RUN_DIR already defined in the notebook session,
# otherwise pick the most recently modified cfg_* directory under candidates/.
if "RUN_DIR" not in globals():
    base = Path("/kaggle/working/rna3d_run/candidates")
    if not base.exists():
        raise FileNotFoundError("No /kaggle/working/rna3d_run/candidates. Set RUN_DIR or run earlier stages.")
    cfg_dirs = [d for d in base.iterdir() if d.is_dir() and d.name.startswith("cfg_")]
    if not cfg_dirs:
        raise FileNotFoundError("No cfg_* found. Set RUN_DIR or run earlier stages.")
    # Newest first (by modification time).
    cfg_dirs.sort(key=lambda d: d.stat().st_mtime, reverse=True)
    RUN_DIR = cfg_dirs[0]
else:
    RUN_DIR = Path(RUN_DIR)

OUT_SUB = Path("/kaggle/working/submission.csv")

print("=== SUBMISSION INPUTS ===")
for _label, _value in (
    ("COMP_ROOT :", COMP_ROOT),
    ("RUN_DIR   :", RUN_DIR),
    ("TEST_SEQ  :", TEST_SEQ_P),
    ("SAMPLE_SUB:", SAMPLE_SUB_P),
    ("OUT_SUB   :", OUT_SUB),
):
    print(_label, _value)

# ----------------------------
# 1) Utilities
# ----------------------------
def stable_seed(s: str) -> int:
    """Map a string to a reproducible 32-bit unsigned integer.

    The value is the first four bytes of the SHA-1 digest of the UTF-8
    encoded string, read as a big-endian integer, so it does not depend on
    Python's per-process hash randomisation.
    """
    digest = hashlib.sha1(s.encode("utf-8")).digest()
    return int.from_bytes(digest[:4], "big")

def load_npz_coords(npz_path: Path) -> Optional[np.ndarray]:
    """Load an (L, 3) coordinate array from an ``.npz`` archive.

    The ``"coords"`` entry is tried first; if it is absent or unusable, the
    remaining entries are scanned in archive order and the first usable one
    is returned. An array is usable when it is 2-D, has exactly three
    columns and contains only finite values.

    Args:
        npz_path: Path of the ``.npz`` file to read.

    Returns:
        The coordinate array, or ``None`` when the file cannot be opened or
        holds no usable array. This function never raises; callers fall back
        to placeholder geometry on ``None``.
    """
    try:
        # The context manager closes the underlying zip file handle; arrays are
        # fully read into memory on access, so they stay valid afterwards.
        with np.load(npz_path, allow_pickle=False) as z:
            keys = ["coords"] if "coords" in z else []
            keys += [k for k in z.files if k != "coords"]
            for key in keys:
                try:
                    arr = np.asarray(z[key])
                    usable = (
                        arr.ndim == 2
                        and arr.shape[1] == 3
                        and bool(np.isfinite(arr).all())
                    )
                except (TypeError, ValueError):
                    # Non-numeric or pickled entries (e.g. string metadata)
                    # must not hide a valid array stored under another key.
                    continue
                if usable:
                    return arr
        return None
    except Exception:
        # Deliberate best-effort: a missing, corrupt or non-npz file simply
        # means "no coordinates available".
        return None

def clip_coords(c: np.ndarray, lo=-999.999, hi=9999.999) -> np.ndarray:
    """Return *c* as float64 with every value limited to ``[lo, hi]``."""
    return np.asarray(c, dtype=np.float64).clip(lo, hi)

def fallback_coords(L: int, seed: int = 0) -> np.ndarray:
    """Build deterministic placeholder coordinates of shape ``(L, 3)``.

    The points follow one full turn of a radius-10 circle in x/y (with small
    seeded Gaussian jitter, sigma 0.2) while z rises by 3.4 per residue.
    The result is only meant to be a valid stand-in when real coordinates
    are unavailable, not a meaningful structure.

    Args:
        L: Number of residues (rows) to generate.
        seed: Seed for the jitter; equal seeds give equal output.
    """
    rng = np.random.default_rng(seed)
    # Draw x jitter before y jitter: the order fixes the random stream.
    jitter_x = rng.standard_normal(L)
    jitter_y = rng.standard_normal(L)

    angle = 2 * np.pi * np.linspace(0, 1, L, dtype=np.float64)
    xs = 10.0 * np.cos(angle) + 0.2 * jitter_x
    ys = 10.0 * np.sin(angle) + 0.2 * jitter_y
    zs = 3.4 * np.arange(L, dtype=np.float64)  # ~base step scale
    return np.column_stack([xs, ys, zs])

def get_latest_final_dir(run_dir: Path) -> Path:
    """Return the newest ``final*`` sub-directory of ``run_dir/final_selection``.

    "Newest" is decided by directory modification time. When
    ``final_selection`` does not exist or has no matching sub-directory the
    sentinel ``Path("")`` is returned. Note that this sentinel is still a
    truthy object, so callers must compare against ``Path("")`` rather than
    test it with ``not``.
    """
    selection_root = run_dir / "final_selection"
    if not selection_root.exists():
        return Path("")
    candidates = [
        entry
        for entry in selection_root.iterdir()
        if entry.is_dir() and entry.name.startswith("final")
    ]
    if not candidates:
        return Path("")
    # max() keeps the first entry among equal mtimes, like a stable
    # descending sort followed by [0].
    return max(candidates, key=lambda entry: entry.stat().st_mtime)

# ----------------------------
# 2) Load test sequences + sample submission
# ----------------------------
df_test_seq = pd.read_csv(TEST_SEQ_P)
df_samp = pd.read_csv(SAMPLE_SUB_P)

# The test table must provide an identifier and a sequence for each target.
if not {"target_id", "sequence"}.issubset(df_test_seq.columns):
    raise ValueError("test_sequences.csv must contain columns: target_id, sequence")

# sample submission schema checks: report the first required column that is absent
req_cols = ["ID", "resname", "resid"]
_absent_col = next((name for name in req_cols if name not in df_samp.columns), None)
if _absent_col is not None:
    raise ValueError(f"sample_submission missing column: {_absent_col}")

# coordinate columns must be 15 (x_1..z_5)
coord_cols = [name for name in df_samp.columns if name.startswith(("x_", "y_", "z_"))]
if len(coord_cols) != 15:
    raise ValueError(f"Expected 15 coord cols in sample_submission, got {len(coord_cols)}")

# canonical order x_1,y_1,z_1,...,x_5,y_5,z_5; every name must exist in the template
coord_cols_sorted = [f"{axis}_{model}" for model in range(1, 6) for axis in "xyz"]
_absent_coord = next((name for name in coord_cols_sorted if name not in df_samp.columns), None)
if _absent_coord is not None:
    raise ValueError(f"sample_submission missing coord col: {_absent_coord}")

# ----------------------------
# 3) Load final selection (Top-5 for test)
# ----------------------------
FINAL_DIR = get_latest_final_dir(RUN_DIR)
# get_latest_final_dir() reports "nothing found" by returning Path("").
# pathlib paths are always truthy, so `not FINAL_DIR` would never trigger and
# the search below would silently continue in the current working directory.
# Compare against the sentinel explicitly instead.
if FINAL_DIR == Path(""):
    raise FileNotFoundError(f"Could not find RUN_DIR/final_selection/*. Run Stage F first. RUN_DIR={RUN_DIR}")

p_test = FINAL_DIR / "final_top5_test.parquet"
p_val  = FINAL_DIR / "final_top5_val.parquet"

# Prefer the test-split selection; fall back to the validation selection
# (with a warning) when only that one exists.
if p_test.exists():
    df_top5 = pd.read_parquet(p_test)
    split_used = "test"
elif p_val.exists():
    df_top5 = pd.read_parquet(p_val)
    split_used = "val"
    print("[WARN] final_top5_test.parquet not found; using val selection as fallback for test submission.")
else:
    raise FileNotFoundError(f"Missing final_top5_test.parquet (and no val fallback) in {FINAL_DIR}")

# required columns
# The selection table must expose these three columns; name the first absent one.
need_cols = ["target_id", "final_rank", "coords_path_used"]
_absent_top5 = next((name for name in need_cols if name not in df_top5.columns), None)
if _absent_top5 is not None:
    raise ValueError(f"final_top5 file missing column: {_absent_top5}")

# Normalise dtypes: ids/paths as strings, ranks as int32 (unparseable -> 999).
df_top5["target_id"] = df_top5["target_id"].astype("string")
_rank_numeric = pd.to_numeric(df_top5["final_rank"], errors="coerce")
df_top5["final_rank"] = _rank_numeric.fillna(999).astype("int32")
df_top5["coords_path_used"] = df_top5["coords_path_used"].fillna("").astype("string")

# keep only ranks 1..5 (both bounds inclusive)
df_top5 = df_top5[df_top5["final_rank"].between(1, 5)].copy()

# map target -> rank -> path; for a repeated (target, rank) the last row wins
top5_map: Dict[str, Dict[int, str]] = {}
for _tid, _rank, _path in zip(
    df_top5["target_id"], df_top5["final_rank"], df_top5["coords_path_used"]
):
    top5_map.setdefault(str(_tid), {})[int(_rank)] = str(_path)

print("=== FINAL SELECTION LOADED ===")
for _label, _value in (
    ("FINAL_DIR :", FINAL_DIR),
    ("split_used:", split_used),
    ("n_rows    :", len(df_top5)),
    ("n_targets :", len(top5_map)),
):
    print(_label, _value)

# ----------------------------
# 4) Build submission rows following sample_submission IDs
# ----------------------------
# sample_submission has ID like "R1107_1" where prefix is target_id and suffix is resid.
# We'll parse it and fill coords for 5 models.

def parse_sample_id(id_str: str) -> Tuple[str, int]:
    """Split a submission row ID into ``(target_id, resid)``.

    The ID is cut at its last underscore so target ids that themselves
    contain underscores stay intact. When there is no underscore, or the
    part after it is not an integer, the residue number is reported as 0.
    """
    text = str(id_str)
    prefix, sep, suffix = text.rpartition("_")
    if not sep:
        # No underscore at all: the whole string is the target id.
        return text, 0
    try:
        return prefix, int(suffix)
    except ValueError:
        return prefix, 0

# Lookup target_id -> sequence; the sequence supplies both the residue count
# and the per-residue name. For a repeated target_id the last row wins.
seq_map = {
    target: sequence
    for target, sequence in zip(
        df_test_seq["target_id"].astype("string"),
        df_test_seq["sequence"].astype(str),
    )
}

# Memo of finished coordinate arrays, keyed by (target_id, rank)
coords_cache: Dict[Tuple[str, int], np.ndarray] = {}

def get_coords_for_target_rank(tid: str, rank: int) -> np.ndarray:
    """Return the clipped ``(L, 3)`` coordinates for one target and rank.

    ``L`` is the length of the target's sequence in ``seq_map`` (0 for an
    unknown target). Coordinates come from the file registered in
    ``top5_map`` when it exists and loads; otherwise seeded placeholder
    geometry is used. Arrays longer than ``L`` are truncated and shorter ones
    are extended with placeholder rows. Results are memoised in
    ``coords_cache``.
    """
    cache_key = (tid, rank)
    cached = coords_cache.get(cache_key)
    if cached is not None:
        return cached

    n_res = len(seq_map.get(tid, ""))
    src = top5_map.get(tid, {}).get(rank, "")

    loaded = load_npz_coords(Path(src)) if src and Path(src).exists() else None
    if loaded is None:
        # No usable file for this slot: deterministic placeholder instead.
        loaded = fallback_coords(n_res, seed=stable_seed(f"{tid}|{rank}"))

    xyz = np.asarray(loaded, dtype=np.float64)
    n_have = xyz.shape[0]
    if n_have > n_res:
        xyz = xyz[:n_res]
    elif n_have < n_res:
        # Extend with placeholder rows so every residue gets a coordinate.
        filler = fallback_coords(n_res - n_have, seed=stable_seed(f"{tid}|{rank}|pad"))
        xyz = np.concatenate([xyz, filler], axis=0)

    xyz = clip_coords(xyz)
    coords_cache[cache_key] = xyz
    return xyz

# Build output dataframe by copying sample template to ensure exact columns/order
# Build output dataframe by copying sample template to ensure exact columns/order
df_out = df_samp.copy()

# Rows are filled positionally into plain buffers and written back once.
# This avoids label-based per-cell writes (which assume a 0..n-1 index and a
# float-compatible column dtype) and replaces ~17 DataFrame cell writes per
# row with a single bulk assignment.
_ids = df_out["ID"].tolist()
_resnames = df_out["resname"].tolist()
_resids = df_out["resid"].tolist()
# Template coordinate values are kept for rows that end up being skipped;
# anything non-numeric becomes NaN here.
_coord_block = (
    df_out[coord_cols_sorted]
    .apply(pd.to_numeric, errors="coerce")
    .to_numpy(dtype=np.float64, copy=True)
)

targets_seen = set()
# Distinct target ids present in the template but absent from test_sequences.
# Counting distinct ids (not rows) is what the "missing targets" figure means.
_missing_targets = set()

for _row, _raw_id in enumerate(_ids):
    tid, resid = parse_sample_id(_raw_id)
    targets_seen.add(tid)

    seq = seq_map.get(tid)
    if seq is None:
        # unknown target id in sample (shouldn't happen): leave the row as-is
        _missing_targets.add(tid)
        continue

    # 1-based resid -> index 0-based
    idx = resid - 1
    if not 0 <= idx < len(seq):
        # out of range: keep the template values for this row
        continue

    # resname/resid are re-derived from the sequence and the parsed ID
    _resnames[_row] = seq[idx]
    _resids[_row] = resid

    # Columns are laid out x_1,y_1,z_1,...,x_5,y_5,z_5, i.e. three per model.
    for _model in range(5):
        _coord_block[_row, 3 * _model:3 * _model + 3] = get_coords_for_target_rank(tid, _model + 1)[idx]

df_out["resname"] = _resnames
df_out["resid"] = _resids
df_out[coord_cols_sorted] = _coord_block

n_missing_targets = len(_missing_targets)

# final schema check
# Enforce the exact output column order.
expected_cols = ["ID", "resname", "resid", *coord_cols_sorted]
df_out = df_out[expected_cols]

# ensure numeric coords: unparseable or missing values become 0.0
for _coord_col in coord_cols_sorted:
    _as_number = pd.to_numeric(df_out[_coord_col], errors="coerce")
    df_out[_coord_col] = _as_number.fillna(0.0).astype(np.float64)

# write
df_out.to_csv(OUT_SUB, index=False)

print("\n[OK] submission.csv written:", OUT_SUB)
print(
    "rows:", len(df_out),
    "| targets_in_sample:", len(targets_seen),
    "| missing_targets_in_test_seq:", n_missing_targets,
)
print("preview:")
print(df_out.head(3).to_string(index=False))
