# 05 CAFA E2E — Run LogReg for MF (wrapper)

Self-contained runner: executes `notebooks/05_cafa_e2e.ipynb` with `TARGET_ASPECT=MF`, stopping before the DNN cell.

Key behaviour:
- No subprocess calls
- Logging-first diagnostics (search paths, stop-marker, failures)
- Produces per-aspect artefacts: `oof_pred_logreg_MF.npy`, `test_pred_logreg_MF.npy`

In [None]:
# CELL 01 - Setup (NO REPO)
# Prepares the process environment for the rest of the notebook:
#   1. best-effort preload of the venv's nvjitlink shared library,
#   2. switch to /content when it exists,
#   3. create the cafa6_data directory under the working directory.
import os
import sys
import ctypes
from pathlib import Path

# CUDA loader fix (PyTorch/RAPIDS coexistence): preload venv nvjitlink so we don't pick /usr/local/cuda/lib64
try:
    _venv_root = Path(sys.executable).resolve().parent.parent
    _nvjit_dir = (
        _venv_root
        / "lib"
        / f"python{sys.version_info.major}.{sys.version_info.minor}"
        / "site-packages"
        / "nvidia"
        / "nvjitlink"
        / "lib"
    )
    _nvjit_so = _nvjit_dir / "libnvJitLink.so.12"
    if _nvjit_so.exists():
        ctypes.CDLL(str(_nvjit_so), mode=ctypes.RTLD_GLOBAL)
        # Prepend the nvjitlink directory. Only append the previous value when it is
        # non-empty: a trailing ':' would add an empty search-path entry, which the
        # dynamic loader interprets as the current working directory.
        _ld_entries = [str(_nvjit_dir)]
        _ld_previous = os.environ.get("LD_LIBRARY_PATH", "")
        if _ld_previous:
            _ld_entries.append(_ld_previous)
        os.environ["LD_LIBRARY_PATH"] = ":".join(_ld_entries)
        print(f"[ENV] Preloaded nvjitlink: {_nvjit_so}")
except Exception as _e:
    # Deliberately best-effort: a failed preload must not stop the notebook.
    print(f"[ENV] nvjitlink preload skipped: {_e}")

# Always run from a simple writable location; never cd into a repo.
if os.path.exists('/content'):
    os.chdir('/content')
RUNTIME_ROOT = Path.cwd()
DATA_ROOT = (RUNTIME_ROOT / 'cafa6_data')
DATA_ROOT.mkdir(parents=True, exist_ok=True)
# Gate read by the later cells: they only do work when this is True.
TRAIN_LEVEL1 = True
print(f'CWD: {Path.cwd()}')
print(f'DATA_ROOT: {DATA_ROOT.resolve()}')

In [None]:
# CELL 13a - Setup & Data Loading (Phase 2 canonical)
# =============================================
# 4. PHASE 2: LEVEL-1 MODELS (DIVERSE ENSEMBLE)
# =============================================
# Target selection source-of-truth: Colab_04b_first_submission_no_ankh.ipynb (aspect-split Top-K)


if TRAIN_LEVEL1:
    import gc
    import json
    import os
    from pathlib import Path

    import numpy as np
    import pandas as pd
    import psutil

    # AUDITOR: Hardware Check
    # Best-effort report of the first CUDA device. Any failure (torch missing,
    # driver error, ...) is reported and then ignored so the cell keeps running.
    try:
        import torch

        if torch.cuda.is_available():
            print(f"[AUDITOR] GPU Detected: {torch.cuda.get_device_name(0)}")
            print(
                f"[AUDITOR] VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB"
            )
        else:
            print("[AUDITOR] WARNING: No GPU detected.")
    except Exception as _hw_err:
        # Previously swallowed silently; say why the report is missing.
        print(f"[AUDITOR] Hardware check skipped: {_hw_err}")

    def log_mem(tag: str = "") -> None:
        """Print a one-line snapshot of system RAM usage, labelled with ``tag``.

        Purely diagnostic and best-effort: any error raised while reading or
        printing the memory figures is ignored.
        """
        try:
            vm = psutil.virtual_memory()
            used_gb = vm.used / 1e9
            total_gb = vm.total / 1e9
            print(
                f"[MEM] {tag:<30} | Used: {used_gb:.2f}GB / {total_gb:.2f}GB ({vm.percent}%)"
            )
        except Exception:
            pass

    # WORK_ROOT recovery (safety)
    # Only runs when no earlier cell defined WORK_ROOT. Resolution order:
    #   1. first candidate that already contains parsed/train_terms.parquet,
    #   2. otherwise the first candidate directory that exists,
    #   3. otherwise <cwd>/cafa6_data.
    _work_root_known = "WORK_ROOT" in locals() or "WORK_ROOT" in globals()
    if not _work_root_known:
        candidates = [
            Path("/content/cafa6_data"),
            Path("/content/work"),
            Path("/kaggle/working/work"),
            Path.cwd() / "cafa6_data",
            Path.cwd() / "artefacts_local" / "work",
        ]

        WORK_ROOT = next(
            (root for root in candidates if (root / "parsed" / "train_terms.parquet").exists()),
            None,
        )
        if WORK_ROOT is None:
            WORK_ROOT = next((root for root in candidates if root.exists()), None)
        if WORK_ROOT is None:
            WORK_ROOT = Path.cwd() / "cafa6_data"

        print(f"WORK_ROOT recovered: {WORK_ROOT}")

    # -----------------------------
    # Load targets + ids
    # -----------------------------
    # Reads the term annotations plus the train/test sequence id columns from
    # WORK_ROOT/parsed. Ids are kept as strings.
    print("Loading targets...")
    _parsed_dir = WORK_ROOT / "parsed"
    train_terms = pd.read_parquet(_parsed_dir / "train_terms.parquet")
    train_ids = pd.read_feather(_parsed_dir / "train_seq.feather")["id"].astype(str)
    test_ids = pd.read_feather(_parsed_dir / "test_seq.feather")["id"].astype(str)

    # FIX: Clean IDs in train_ids to match EntryID format
    # Take the text between the first pair of '|' characters; ids without that
    # pattern are kept unchanged.
    print("Applying ID cleaning fix...")
    train_ids_clean = train_ids.str.extract(r"\|(.*?)\|")[0].fillna(train_ids)

    # -----------------------------
    # Target Matrix Construction (Champion Strategy: 13,500 Terms)
    # 10,000 BP + 2,000 MF + 1,500 CC
    # -----------------------------
    print("Selecting Top-K terms per aspect (Champion Strategy)...")

    try:
        import obonet

        # Candidate locations for go-basic.obo, checked in this order.
        possible_paths = [
            WORK_ROOT / "go-basic.obo",
            WORK_ROOT / "Train" / "go-basic.obo",
            WORK_ROOT.parent / "go-basic.obo",
            Path("go-basic.obo"),
            Path("Train/go-basic.obo"),
            Path("../Train/go-basic.obo"),
            Path("/content/cafa6_data/Train/go-basic.obo"),
        ]

        obo_path = next((cand for cand in possible_paths if cand.exists()), None)
        if obo_path is None:
            _searched = [str(cand) for cand in possible_paths]
            raise FileNotFoundError(f"CRITICAL: go-basic.obo not found. Searched: {_searched}")

        # Publish the resolved path under a global name.
        global PATH_GO_OBO
        PATH_GO_OBO = obo_path
        print(f"Global PATH_GO_OBO set to: {PATH_GO_OBO}")

        print(f"Loading OBO from {obo_path}...")
        graph = obonet.read_obo(obo_path)

        # GO term id -> namespace string ("unknown" when the node has none).
        term_to_ns = {}
        for node, data in graph.nodes(data=True):
            term_to_ns[node] = data.get("namespace", "unknown")

        # Keep compatibility with downstream code that expects go_namespaces
        go_namespaces = term_to_ns

        ns_map = {
            "biological_process": "BP",
            "molecular_function": "MF",
            "cellular_component": "CC",
        }

        # Normalise any existing aspect column (some artefacts store full namespace strings)
        aspect_aliases = {**ns_map, "BP": "BP", "MF": "MF", "CC": "CC"}

        def _aspect_from_alias(raw_aspect):
            # Anything that is not a known alias becomes "UNK".
            return aspect_aliases.get(str(raw_aspect), "UNK")

        def _aspect_from_term(term_id):
            # Terms missing from the ontology (or with an unmapped namespace) become "UNK".
            return ns_map.get(term_to_ns.get(term_id), "UNK")

        if "aspect" in train_terms.columns:
            train_terms["aspect"] = train_terms["aspect"].map(_aspect_from_alias)
        else:
            train_terms["aspect"] = train_terms["term"].map(_aspect_from_term)

    except ImportError as e:
        raise RuntimeError("obonet not installed. Please install it.") from e

    # Canonical aspect split (04b)
    # One row per (aspect, term) with the number of annotation rows in "count".
    term_counts = train_terms.groupby(["aspect", "term"]).size().reset_index(name="count")

    def _most_frequent_terms(aspect_code, k):
        # The k terms with the largest "count" inside one aspect.
        in_aspect = term_counts[term_counts["aspect"] == aspect_code]
        return in_aspect.nlargest(k, "count")["term"].tolist()

    targets_bp = _most_frequent_terms("BP", 10000)
    targets_mf = _most_frequent_terms("MF", 2000)
    targets_cc = _most_frequent_terms("CC", 1500)

    # Guardrail: avoid silently switching target strategy due to aspect encoding mismatch
    ALLOW_GLOBAL_FALLBACK = False
    if targets_bp or targets_mf or targets_cc:
        # Stable, deterministic ordering: BP then MF then CC with de-dup preserving order
        top_terms = list(dict.fromkeys(targets_bp + targets_mf + targets_cc))
        print(f"  Selected: {len(targets_bp)} BP + {len(targets_mf)} MF + {len(targets_cc)} CC")
    else:
        # No aspect produced any term: either fail loudly or (opt-in) use a global Top-K.
        aspect_vc = train_terms["aspect"].value_counts().to_dict() if "aspect" in train_terms.columns else {}
        msg = (
            "No BP/MF/CC aspect split found after normalisation. "
            f"aspect_vc={aspect_vc}. This would fall back to global Top-13,500; "
            "set ALLOW_GLOBAL_FALLBACK=True to override."
        )
        if not ALLOW_GLOBAL_FALLBACK:
            raise RuntimeError(msg)
        print("  [WARNING] " + msg)
        top_terms = train_terms["term"].value_counts().head(13500).index.tolist()

    # Persist label contract for downstream stages
    # An existing, non-empty list on disk takes precedence over the list computed
    # in this run; otherwise the computed list is written out.
    top_terms_path = WORK_ROOT / "features" / "top_terms_13500.json"
    top_terms_path.parent.mkdir(parents=True, exist_ok=True)
    if not top_terms_path.exists():
        top_terms_path.write_text(json.dumps(list(top_terms)), encoding="utf-8")
        print("Saved: top_terms_13500.json")
    else:
        try:
            top_terms_disk = json.loads(top_terms_path.read_text(encoding="utf-8"))
            if isinstance(top_terms_disk, list) and top_terms_disk:
                top_terms = [str(x) for x in top_terms_disk]
                print(f"Loaded existing top_terms_13500.json (n={len(top_terms)})")
        except Exception as e:
            # Unreadable file: keep the freshly computed list.
            print(f"[WARNING] Failed to load existing top_terms_13500.json: {e}")

    # -----------------------------
    # Stable target contract (audited: 1,585 terms)
    # Definition: GO terms with >= 50 positives AND valid namespace (BP/MF/CC)
    # Stored separately from top_terms_13500.json (do not mix contracts).
    # -----------------------------
    # Flow: load the list from disk if present, otherwise compute it from
    # term_counts; validate it; only then persist a newly computed list.
    stable_terms_path = WORK_ROOT / "features" / "stable_terms_1585.json"
    stable_meta_path = WORK_ROOT / "features" / "stable_terms_1585_meta.json"
    noise_floor = 50
    _stable_from_disk = stable_terms_path.exists()

    if _stable_from_disk:
        try:
            stable_terms = json.loads(stable_terms_path.read_text(encoding="utf-8"))
            stable_terms = [str(t) for t in stable_terms]
        except Exception as e:
            # Chain the cause so the original parse/IO error stays visible.
            raise RuntimeError(f"Failed to load {stable_terms_path}: {e}") from e
        print(f"Loaded existing stable_terms_1585.json (n={len(stable_terms)})")
    else:
        # Compute from Phase-1 truth (train_terms.parquet) and OBO namespace mapping already loaded above.
        def _stable_terms_for(aspect_code):
            # Terms of one aspect at or above the noise floor, most frequent first
            # (ties broken by term id so the order is deterministic).
            keep = (term_counts["aspect"] == aspect_code) & (term_counts["count"] >= noise_floor)
            ranked = term_counts[keep].sort_values(["count", "term"], ascending=[False, True])
            return ranked["term"].astype(str).tolist()

        stable_bp = _stable_terms_for("BP")
        stable_mf = _stable_terms_for("MF")
        stable_cc = _stable_terms_for("CC")
        stable_terms = stable_bp + stable_mf + stable_cc

    # Validate BEFORE writing anything: an invalid list that is persisted would be
    # picked up by the "load from disk" branch on every later run.
    if len(stable_terms) != 1585:
        raise RuntimeError(f"Stable term contract mismatch: expected 1585, got {len(stable_terms)}")

    top_term_to_idx = {t: i for i, t in enumerate(top_terms)}
    missing_stable = [t for t in stable_terms if t not in top_term_to_idx]
    if missing_stable:
        raise RuntimeError(
            "Stable terms contain items not present in top_terms_13500.json. "
            f"Missing={len(missing_stable)} (example: {missing_stable[:10]})"
        )

    if not _stable_from_disk:
        stable_terms_path.parent.mkdir(parents=True, exist_ok=True)
        stable_terms_path.write_text(json.dumps(stable_terms), encoding="utf-8")
        stable_meta_path.write_text(
            json.dumps(
                {
                    "noise_floor": noise_floor,
                    "counts": {"BP": len(stable_bp), "MF": len(stable_mf), "CC": len(stable_cc)},
                    "total": len(stable_terms),
                },
                indent=2,
            ),
            encoding="utf-8",
        )
        print(f"Saved: stable_terms_1585.json (n={len(stable_terms)})")

    # Positions of the stable terms inside top_terms.
    stable_idx = np.asarray([top_term_to_idx[t] for t in stable_terms], dtype=np.int64)
    print(f"Stable targets ready: n={int(stable_idx.shape[0])} (expected 1585)")

    # Multi-label target matrix: one row per training protein (train_ids_clean
    # order), one column per entry of top_terms (top_terms order).
    train_terms_top = train_terms[train_terms["term"].isin(top_terms)]
    Y_df = train_terms_top.pivot_table(index="EntryID", columns="term", aggfunc="size", fill_value=0)
    # pivot_table returns its columns sorted by term id and only for terms that
    # occur in the data. Reindex BOTH axes so that column j is top_terms[j];
    # positional indices such as stable_idx are computed against top_terms.
    Y_df = Y_df.reindex(index=train_ids_clean, columns=top_terms, fill_value=0)
    # aggfunc="size" yields counts; collapse repeated annotations to a 0/1 indicator.
    Y = (Y_df.to_numpy() > 0).astype(np.float32)
    print(f"Targets: Y={Y.shape}")

    # -----------------------------
    # Feature loading helper (Memory Optimised)
    # -----------------------------
    FEAT_DIR = WORK_ROOT / "features"

    def load_features_dict(split: str = "both"):
        """Load every feature modality for the requested split(s).

        Embedding matrices are opened with ``mmap_mode="r"``; the taxa block is a
        dense float32 one-hot encoding of ``taxon_id`` fitted on train and test
        together and aligned to ``train_ids`` / ``test_ids``.

        Args:
            split: ``"train"``, ``"test"`` or ``"both"``.

        Returns:
            The train dict for ``"train"``, the test dict for ``"test"``,
            otherwise the tuple ``(train_dict, test_dict)``.

        Raises:
            FileNotFoundError: if any embedding or taxa file is missing (files for
                both splits are required regardless of ``split``).
        """
        log_mem(f"Start load_features_dict({split})")
        print(f"Loading multimodal features (mode={split})...")

        want_train = split in ("both", "train")
        want_test = split in ("both", "test")
        ft_train, ft_test = {}, {}

        # (file stem, dictionary key). All modalities are mandatory.
        modalities = (
            ("t5", "t5"),
            ("esm2", "esm2_650m"),
            ("esm2_3b", "esm2_3b"),
            ("ankh", "ankh"),
            ("text", "text"),
        )

        for stem, key in modalities:
            tr_path = FEAT_DIR / f"train_embeds_{stem}.npy"
            te_path = FEAT_DIR / f"test_embeds_{stem}.npy"
            if not tr_path.exists() or not te_path.exists():
                raise FileNotFoundError(f"Missing mandatory embeddings for {stem}: {tr_path} or {te_path}")

            if want_train:
                ft_train[key] = np.load(tr_path, mmap_mode="r")
            if want_test:
                ft_test[key] = np.load(te_path, mmap_mode="r")

        taxa_train_path = WORK_ROOT / "parsed" / "train_taxa.feather"
        taxa_test_path = WORK_ROOT / "parsed" / "test_taxa.feather"

        if not taxa_train_path.exists() or not taxa_test_path.exists():
            raise FileNotFoundError(f"Missing mandatory taxa features: {taxa_train_path} or {taxa_test_path}")

        from sklearn.preprocessing import OneHotEncoder

        tax_tr = pd.read_feather(taxa_train_path).astype({"id": str})
        tax_te = pd.read_feather(taxa_test_path).astype({"id": str})
        enc = OneHotEncoder(handle_unknown="ignore", sparse_output=False, dtype=np.float32)
        # Fit on the union so train and test share the same one-hot columns.
        taxon_frames = [tax_tr[["taxon_id"]], tax_te[["taxon_id"]]]
        enc.fit(pd.concat(taxon_frames, axis=0))

        def _encode_taxa(frame, ids):
            # Align rows to the id order; ids without a taxa row get taxon_id 0.
            aligned = frame.set_index("id").reindex(ids, fill_value=0).reset_index()
            return enc.transform(aligned[["taxon_id"]]).astype(np.float32)

        if want_train:
            ft_train["taxa"] = _encode_taxa(tax_tr, train_ids)
        if want_test:
            ft_test["taxa"] = _encode_taxa(tax_te, test_ids)

        log_mem(f"End load_features_dict({split})")
        if split == "train":
            return ft_train
        return ft_test if split == "test" else (ft_train, ft_test)

    # Materialise feature dicts (mmap arrays where possible)
    features_train, features_test = load_features_dict(split="both")

    # Flat concatenation order for classical models (LR/GBDT)
    # Only keys actually present in features_train are kept, in this fixed order.
    _flat_order = ("t5", "esm2_650m", "esm2_3b", "ankh", "text", "taxa")
    FLAT_KEYS = list(filter(features_train.__contains__, _flat_order))
    if "ankh" not in FLAT_KEYS:
        raise RuntimeError("Ankh is mandatory but was not loaded into features_train.")
    print(f"Flat X keys={FLAT_KEYS}")

    # -----------------------------
    # Disk-backed X / X_test (for RAM-safe downstream cells)
    # -----------------------------
    X_train_path = FEAT_DIR / "X_train_mmap.npy"
    X_test_path = FEAT_DIR / "X_test_mmap.npy"

    def _expected_X_shapes():
        """Return ((n_train, total_dim), (n_test, total_dim)) for the current inputs."""
        total_dim = int(sum(int(features_train[k].shape[1]) for k in FLAT_KEYS))
        return (int(len(train_ids)), total_dim), (int(len(test_ids)), total_dim)

    def _build_X_memmaps(chunk_size: int = 10000) -> None:
        """Concatenate all FLAT_KEYS feature blocks column-wise into two .npy files.

        Rows are streamed ``chunk_size`` at a time so no full matrix is held in RAM.
        The data is written to temporary files and renamed into place only after a
        successful flush, so an interrupted build never leaves a partially filled
        matrix at the final path.

        Args:
            chunk_size: number of rows copied per step.
        """
        dims = {k: int(features_train[k].shape[1]) for k in FLAT_KEYS}
        total_dim = int(sum(dims.values()))
        n_tr = int(len(train_ids))
        n_te = int(len(test_ids))

        tmp_train_path = X_train_path.with_name(X_train_path.name + ".tmp")
        tmp_test_path = X_test_path.with_name(X_test_path.name + ".tmp")

        print(f"Building X memmaps: train=({n_tr}, {total_dim}) test=({n_te}, {total_dim})")
        X_mm = np.lib.format.open_memmap(
            str(tmp_train_path), mode="w+", dtype=np.float32, shape=(n_tr, total_dim)
        )
        Xte_mm = np.lib.format.open_memmap(
            str(tmp_test_path), mode="w+", dtype=np.float32, shape=(n_te, total_dim)
        )

        col = 0
        for k in FLAT_KEYS:
            d = dims[k]
            print(f"  Streaming {k} into cols {col}:{col + d}")
            for i in range(0, n_tr, chunk_size):
                j = min(i + chunk_size, n_tr)
                X_mm[i:j, col : col + d] = np.asarray(features_train[k][i:j], dtype=np.float32)
            for i in range(0, n_te, chunk_size):
                j = min(i + chunk_size, n_te)
                Xte_mm[i:j, col : col + d] = np.asarray(features_test[k][i:j], dtype=np.float32)
            col += d

        X_mm.flush()
        Xte_mm.flush()
        # Drop the mappings before renaming the finished files into place.
        del X_mm, Xte_mm
        tmp_train_path.replace(X_train_path)
        tmp_test_path.replace(X_test_path)

    def _X_memmaps_reusable() -> bool:
        """Return True when both files exist, open cleanly and have the expected shapes."""
        if not (X_train_path.exists() and X_test_path.exists()):
            return False
        expected = _expected_X_shapes()
        try:
            found = (
                tuple(np.load(X_train_path, mmap_mode="r").shape),
                tuple(np.load(X_test_path, mmap_mode="r").shape),
            )
        except (OSError, ValueError) as e:
            print(f"[WARNING] Existing X memmaps are unreadable ({e}); rebuilding.")
            return False
        if found != expected:
            print(f"[WARNING] Existing X memmaps have shapes {found}, expected {expected}; rebuilding.")
            return False
        return True

    if _X_memmaps_reusable():
        print("X memmaps already exist; skipping build.")
    else:
        _build_X_memmaps(chunk_size=5000)

    # Read-only views used by the following cells.
    X = np.load(X_train_path, mmap_mode="r")
    X_test = np.load(X_test_path, mmap_mode="r")

    log_mem("Phase 2 setup done")


In [None]:
# CELL 13c - Level 1: Logistic Regression — Long tail 13,500 (Aspect Split BP/MF/CC)
# ==============================================================================
# Rank 1 Optimization: Asynchronous GPU Pipelining + Automated Artifact Sync
# ==============================================================================
if not TRAIN_LEVEL1:
    print('Skipping LogReg (TRAIN_LEVEL1=False).')
else:
    import os
    import sys
    import time
    import threading
    import gc
    import warnings
    import psutil
    import json
    import numpy as np
    import pandas as pd
    import joblib
    from pathlib import Path
    from tqdm.auto import tqdm
    from sklearn.model_selection import KFold
    from sklearn.preprocessing import StandardScaler
    from cuml.linear_model import LogisticRegression as cuLogReg
    from cuml.multiclass import OneVsRestClassifier as cuOVR
    import cupy as cp
    import obonet

    def _stage(msg):
        """Print a progress message and flush stdout immediately."""
        print(msg, flush=True)

    # --- WORK_ROOT Recovery (Standalone Support) ---
    # When this cell runs without an earlier definition of WORK_ROOT, default to
    # <cwd>/cafa6_data.
    _work_root_defined = 'WORK_ROOT' in locals() or 'WORK_ROOT' in globals()
    if not _work_root_defined:
        WORK_ROOT = Path.cwd() / 'cafa6_data'
        print(f"WORK_ROOT recovered: {WORK_ROOT}")

    # --- OBO / Aspect Logic (Standalone Support) ---
    # We need to know which aspect (BP/MF/CC) each term belongs to.
    # The lookup table stays empty when the ontology cannot be found or loaded,
    # in which case every term resolves to 'UNK'.
    _ns_alias = {'biological_process': 'BP', 'molecular_function': 'MF', 'cellular_component': 'CC'}
    _term_to_ns = {}
    try:
        candidates = [
            Path(WORK_ROOT) / "go-basic.obo",
            Path(WORK_ROOT) / "Train" / "go-basic.obo",
            Path(WORK_ROOT).parent / "go-basic.obo",
            Path("go-basic.obo"),
            Path("Train/go-basic.obo"),
            Path("../Train/go-basic.obo"),
            Path("cafa6_data/go-basic.obo"),
            Path("../cafa6_data/go-basic.obo"),
            Path("/content/cafa6_data/Train/go-basic.obo"),
        ]
        obo_path = next(filter(Path.exists, candidates), None)
        if not obo_path:
            print(f"WARNING: go-basic.obo not found in {[str(p) for p in candidates]}. Aspect splitting will fail (all UNK).")
        else:
            print(f"Loading OBO from {obo_path}...")
            graph = obonet.read_obo(obo_path)
            _term_to_ns = {id_: data.get('namespace', 'unknown') for id_, data in graph.nodes(data=True)}
    except Exception as e:
        print(f"Failed to load OBO: {e}")
        _term_to_ns = {}

    def _aspect_of_term(t):
        """Map a GO term id to 'BP', 'MF' or 'CC'; anything unresolved is 'UNK'."""
        return _ns_alias.get(_term_to_ns.get(t), 'UNK')
    # --- HELPER: Fast-Path Predict Proba (Manual GEMM) [Diagnostic Fix] ---
    def safe_predict_proba_gpu(clf, x_gpu):
        """Return sigmoid probabilities for ``x_gpu`` using ``clf``'s linear weights.

        Weight lookup order:
          1. ``clf.multiclass_estimator.coef_`` / ``.intercept_``
          2. stacked ``coef_`` / ``intercept_`` of ``clf.multiclass_estimator.estimators_``
          3. ``clf.coef_`` / ``clf.intercept_``
        When no complete (weights, bias) pair is found, the result of
        ``clf.predict_proba(x_gpu)`` is returned instead.
        """
        def _to_f32(arr):
            return cp.asarray(arr, dtype=cp.float32)

        def _on_gpu(arr):
            # Per-estimator parameters may live on host or device.
            return arr if isinstance(arr, cp.ndarray) else cp.asarray(arr)

        weights = bias = None

        if hasattr(clf, 'multiclass_estimator'):
            wrapped = clf.multiclass_estimator
            if hasattr(wrapped, 'coef_'):
                weights, bias = _to_f32(wrapped.coef_), _to_f32(wrapped.intercept_)
            elif hasattr(wrapped, 'estimators_'):
                try:
                    per_term_w, per_term_b = [], []
                    for est in wrapped.estimators_:
                        per_term_w.append(_on_gpu(est.coef_))
                        per_term_b.append(_on_gpu(est.intercept_))
                    weights = cp.vstack(per_term_w)
                    bias = cp.hstack(per_term_b)
                except Exception:
                    # Best-effort: an incomplete pair falls through to predict_proba below.
                    pass

        if weights is None and hasattr(clf, 'coef_'):
            weights, bias = _to_f32(clf.coef_), _to_f32(clf.intercept_)

        # Fallback
        if weights is None or bias is None:
            return clf.predict_proba(x_gpu)

        # Fast path: one matrix product followed by a sigmoid; scores are clipped
        # to [-50, 50] before exponentiation.
        projection = weights.T if weights.ndim > 1 else weights
        scores = cp.dot(x_gpu, projection) + bias
        return 1.0 / (1.0 + cp.exp(-cp.clip(scores, -50.0, 50.0)))

    # --- DATA LOADING ---
    FEAT_DIR = Path(WORK_ROOT) / 'features'
    PRED_DIR = FEAT_DIR / 'level1_preds'
    PRED_DIR.mkdir(parents=True, exist_ok=True)

    # Master term list written by Cell 13a; its order defines the target columns.
    top_terms_path = FEAT_DIR / 'top_terms_13500.json'
    if not top_terms_path.exists():
        raise FileNotFoundError(f"Missing {top_terms_path}. Run Cell 13a first.")
    top_terms = json.loads(top_terms_path.read_text())

    # --- [FROM SOURCE 53-54] LOAD IA WEIGHTS ---
    # One weight per entry of top_terms. Terms absent from IA.tsv get 1.0; if the
    # file is missing or cannot be read, every weight is 1.0.
    weights_full = None
    try:
        _ia_candidates = [Path(WORK_ROOT) / 'IA.tsv', FEAT_DIR / 'IA.tsv']
        ia_path = next(filter(Path.exists, _ia_candidates), None)
        if not ia_path:
            _stage(f"[WARN] IA.tsv not found. Using default weights (1.0).")
            weights_full = np.ones(len(top_terms), dtype=np.float32)
        else:
            ia_df = pd.read_csv(ia_path, sep='\t', header=None, names=['term', 'ia'])
            ia_map = dict(zip(ia_df['term'].astype(str), ia_df['ia'].astype(np.float32)))
            # Map weights to our master top_terms list
            weights_full = np.asarray([ia_map.get(t, 1.0) for t in top_terms], dtype=np.float32)
            _stage(f"[RANK 1] Loaded IA weights from {ia_path}")
    except Exception as e:
        _stage(f"[CRITICAL] IA weights required for Rank 1 F1 boost: {e}")
        weights_full = np.ones(len(top_terms), dtype=np.float32)
    
    # X Loading (Memmap)
    x_path = FEAT_DIR / 'X_train_mmap.npy'
    xt_path = FEAT_DIR / 'X_test_mmap.npy'
    # Both matrices are produced by Cell 13a; check both so a missing test matrix
    # gives the same actionable message as a missing train matrix.
    for _required_mmap in (x_path, xt_path):
        if not _required_mmap.exists():
            raise FileNotFoundError(f"Missing {_required_mmap}. Run Cell 13a first to build memmaps.")

    # Train features stay memory-mapped; test features are copied into RAM as a
    # contiguous float32 array.
    X = np.load(x_path, mmap_mode='r')
    X_test_ram = np.ascontiguousarray(np.load(xt_path, mmap_mode='r'), dtype=np.float32)
    
    # Y Loading (Target Matrix)
    # Y_full[i, j] is the 0/1 label of training row i for top_terms[j]. The
    # training loop selects columns by position in top_terms, so the column order
    # must match that list exactly.
    # NOTE(review): a cache written before the column-alignment fix can have the
    # right shape but the wrong column order; delete Y_target_13500.npy once to
    # force a rebuild.
    y_path = FEAT_DIR / 'Y_target_13500.npy'
    _y_shape_expected = (X.shape[0], len(top_terms))
    Y_full = None
    if y_path.exists():
        print(f"Loading targets from {y_path}...")
        Y_full = np.load(y_path, mmap_mode='r')
        if tuple(Y_full.shape) != _y_shape_expected:
            print(f"[WARN] Cached targets have shape {tuple(Y_full.shape)}, expected {_y_shape_expected}; rebuilding.")
            Y_full = None

    if Y_full is None:
        print(f"Building targets {y_path} (first run)...")
        # Reconstruct Y if missing (Standalone robustness)
        train_terms = pd.read_parquet(Path(WORK_ROOT) / 'parsed' / 'train_terms.parquet')
        train_ids = pd.read_feather(Path(WORK_ROOT) / 'parsed' / 'train_seq.feather')['id'].astype(str)
        # Clean IDs
        train_ids_clean = train_ids.str.extract(r"\|(.*?)\|")[0].fillna(train_ids)

        train_terms_top = train_terms[train_terms["term"].isin(top_terms)]
        Y_df = train_terms_top.pivot_table(index="EntryID", columns="term", aggfunc="size", fill_value=0)
        # pivot_table sorts its columns by term id and omits terms with no rows;
        # reindex both axes so rows follow train_ids_clean and column j is top_terms[j].
        Y_df = Y_df.reindex(index=train_ids_clean, columns=top_terms, fill_value=0)
        # aggfunc="size" yields counts; collapse repeated annotations to 0/1.
        Y_full_ram = (Y_df.to_numpy() > 0).astype(np.float32)
        if Y_full_ram.shape[0] != X.shape[0]:
            raise RuntimeError(
                f"Target rows ({Y_full_ram.shape[0]}) do not match feature rows ({X.shape[0]})."
            )

        np.save(y_path, Y_full_ram)
        Y_full = np.load(y_path, mmap_mode='r')
        print(f"Saved {y_path}")

    # Runtime Knobs [Source 62]
    TARGET_CHUNK = 125  # Reduced from 250 to prevent cuBLAS handle exhaustion
    VAL_BS = 4096       # rows per validation prediction batch
    TEST_BS = 8192      # rows per test prediction batch
    n_splits = 5        # number of CV folds

    # A single aspect per run, taken from TARGET_ASPECT (default 'MF'), upper-cased.
    _requested_aspect = os.environ.get('TARGET_ASPECT') or 'MF'
    aspects = [_requested_aspect.upper()]
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Filled by the training loop, keyed by aspect code.
    oof_pred_logreg_by_aspect, aspect_indices_map = {}, {}

    # Per-aspect training: for every fold, fit one-vs-rest logistic regression on
    # chunks of TARGET_CHUNK terms, write out-of-fold predictions for the
    # validation rows and add 1/n_splits of the test predictions to the test file.
    for aspect in aspects:
        _stage(f"\n=== LogReg Aspect: {aspect} ===")
        # Positions (within top_terms) of the terms that belong to this aspect.
        aspect_indices = np.array([i for i, t in enumerate(top_terms) if _aspect_of_term(t) == aspect])
        if len(aspect_indices) == 0:
            print(f"No terms found for aspect {aspect}. Check OBO loading.")
            continue

        aspect_indices_map[aspect] = aspect_indices
        n_targets = len(aspect_indices)
        print(f"Training {n_targets} targets for {aspect}...")

        # Column contract of the prediction files: column j is this list's j-th
        # term. The push step below lists this file as a required artefact.
        aspect_terms_path = PRED_DIR / f'top_terms_{aspect}.json'
        aspect_terms_path.write_text(
            json.dumps([top_terms[i] for i in aspect_indices]), encoding='utf-8'
        )

        lr_oof_path = PRED_DIR / f'oof_pred_logreg_{aspect}.npy'
        lr_test_path = PRED_DIR / f'test_pred_logreg_{aspect}.npy'

        # Always start from freshly created, zero-filled files. Test predictions
        # are accumulated with '+=' across folds, so reusing a file from an earlier
        # run would add the new fold averages on top of the old ones. Creating the
        # memmap directly also avoids allocating a full zero matrix in RAM.
        n_train_rows, n_test_rows = X.shape[0], X_test_ram.shape[0]
        oof_pred = np.lib.format.open_memmap(
            str(lr_oof_path), mode='w+', dtype=np.float32, shape=(n_train_rows, n_targets)
        )
        test_pred = np.lib.format.open_memmap(
            str(lr_test_path), mode='w+', dtype=np.float32, shape=(n_test_rows, n_targets)
        )

        for fold, (idx_tr, idx_val) in enumerate(kf.split(np.arange(X.shape[0]))):
            _stage(f"{aspect} Fold {fold+1}/{n_splits}")
            # Scaler statistics come from the training rows of this fold only.
            scaler = StandardScaler()
            X_tr_gpu = cp.asarray(scaler.fit_transform(X[idx_tr]), dtype=cp.float32)
            X_val_gpu = cp.asarray(scaler.transform(X[idx_val]), dtype=cp.float32)
            X_test_gpu = cp.asarray(scaler.transform(X_test_ram), dtype=cp.float32)

            for start in tqdm(range(0, n_targets, TARGET_CHUNK), desc="Chunks"):
                end = min(start + TARGET_CHUNK, n_targets)
                cols = aspect_indices[start:end]
                chunk_width = end - start

                # [HEARTBEAT START]
                print(f"    [Start] {aspect} Fold {fold+1} | Terms {start}-{end} | {time.strftime('%H:%M:%S')}")
                sys.stdout.flush()

                # --- [RANK 1 REQUIREMENT] IA-WEIGHTED CLASS WEIGHTS [Source 139, 475, 488] ---
                # Slicing IA weights for the current chunk of GO terms
                weights_chunk = weights_full[cols]

                # NOTE(review): the keys below are positions inside the chunk, not
                # class labels, and the same dict is handed to the single base
                # estimator wrapped by the one-vs-rest classifier. This presumably
                # does not give each term its own positive-class IA weight --
                # confirm against cuML's class_weight semantics before relying on it.
                cw_dict = {i: float(w) for i, w in enumerate(weights_chunk)}

                # Initialize cuLogReg with IA-weighting for Rank 1 precision [Source 136, 488]
                clf_chunk = cuOVR(cuLogReg(
                    solver='qn',
                    max_iter=1000,
                    tol=1e-2,
                    class_weight=cw_dict
                ))

                Y_tr_chunk = cp.asarray(Y_full[np.ix_(idx_tr, cols)])

                # --- FIT TIMING ---
                t_fit_start = time.time()
                clf_chunk.fit(X_tr_gpu, Y_tr_chunk)
                t_fit_end = time.time()

                # --- [PHASE 3 READINESS] GENERATING OOF FEATURES [Source 137, 491] ---
                # Validation rows are predicted in batches of VAL_BS.
                val_gpu_buffer = cp.zeros((len(idx_val), chunk_width), dtype=cp.float32)
                for b0 in range(0, len(idx_val), VAL_BS):
                    b1 = min(b0 + VAL_BS, len(idx_val))
                    val_gpu_buffer[b0:b1, :] = safe_predict_proba_gpu(clf_chunk, X_val_gpu[b0:b1])

                # Each training row is a validation row in exactly one fold, so a
                # plain assignment fills the OOF matrix.
                oof_pred[idx_val, start:end] = val_gpu_buffer.get()

                # Test rows are predicted in batches of TEST_BS.
                test_gpu_buffer = cp.zeros((X_test_ram.shape[0], chunk_width), dtype=cp.float32)
                for b0 in range(0, X_test_ram.shape[0], TEST_BS):
                    b1 = min(b0 + TEST_BS, X_test_ram.shape[0])
                    test_gpu_buffer[b0:b1, :] = safe_predict_proba_gpu(clf_chunk, X_test_gpu[b0:b1])

                # Running mean over folds (the file starts at zero for this run).
                test_pred[:, start:end] += (test_gpu_buffer.get() / n_splits)
                t_inf_end = time.time()

                # --- FIX: Aggressive GPU cleanup to prevent cuBLAS handle exhaustion ---
                del clf_chunk, Y_tr_chunk, val_gpu_buffer, test_gpu_buffer
                cp.get_default_memory_pool().free_all_blocks()
                cp.get_default_pinned_memory_pool().free_all_blocks()

                # [HEARTBEAT & METRICS]
                dur_fit = t_fit_end - t_fit_start
                dur_inf = t_inf_end - t_fit_end
                mem_used_gb = cp.get_default_memory_pool().used_bytes() / 1e9
                print(f"    [Step] {aspect} Fold {fold+1} | Terms {start:>5}-{end:<5} | "
                      f"Fit: {dur_fit:>5.1f}s | Inf: {dur_inf:>5.1f}s | "
                      f"Mem: {mem_used_gb:.1f}GB | {time.strftime('%H:%M:%S')}")
                sys.stdout.flush()

        oof_pred.flush()
        test_pred.flush()
        # Expose the finished OOF matrix as a read-only memmap.
        oof_pred_logreg_by_aspect[aspect] = np.load(lr_oof_path, mmap_mode='r')

    # --- RESTORED: STAGE MAYBE PUSH [Source 103-105] ---
    # Optional artefact sync: only runs when a non-None STORE object exists in
    # the notebook globals. A failed push is reported and otherwise ignored.
    _store = globals().get('STORE')
    if _store is not None:
        _stage("[STORE] Executing Stage Maybe push for Level 1 LogReg...")
        required = [top_terms_path.as_posix()]
        for asp in oof_pred_logreg_by_aspect:
            _artefact_names = (
                f'oof_pred_logreg_{asp}.npy',
                f'test_pred_logreg_{asp}.npy',
                f'top_terms_{asp}.json',
            )
            required.extend((PRED_DIR / name).as_posix() for name in _artefact_names)
        try:
            _store.maybe_push(
                stage='stage_07a_level1_logreg_aspect_split',
                required_paths=required,
                note='Rank 1 Optimized LogReg Predictions (A100 Fast Path).'
            )
        except Exception as e:
            _stage(f"[WARN] Store push failed: {e}")