In [None]:
!pip -q install optuna
!pip install optuna-integration

In [None]:
from pathlib import Path
import re
import pandas as pd
import numpy as np
import optuna
import gc

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers
from optuna.integration import TFKerasPruningCallback
from sklearn.metrics import f1_score
from tensorflow.keras import backend as K


In [None]:
from pathlib import Path

# Input files of the CAFA-6 competition dataset (absolute Kaggle mount paths).
FASTA_PATH = Path("/kaggle/input/cafa-6-protein-function-prediction/Train/train_sequences.fasta")
TAX_PATH   = Path("/kaggle/input/cafa-6-protein-function-prediction/Train/train_taxonomy.tsv")   # not used here, kept for reference
TRAIN_PATH = Path("/kaggle/input/cafa-6-protein-function-prediction/Train/train_terms.tsv")

# Output directory & files
# The directory is created eagerly (relative to the working directory) so that
# later cells can write into it without checking.
OUT_DIR = Path("outputs")
OUT_DIR.mkdir(parents=True, exist_ok=True)
OUT_TSV = OUT_DIR / "features_propy.tsv"      # tab-separated feature table
OUT_PQ  = OUT_DIR / "features_propy.parquet"  # same table, Parquet format



In [None]:
# One-letter codes of the 20 standard amino acids; the FASTA parser keeps only these.
AA_STD = set("ACDEFGHIKLMNPQRSTVWY")

# Named residue groups. Each entry maps a group name to the set of one-letter
# codes that belong to it; every group becomes a "<name>_frac" and/or
# "<name>_count" feature column (name lower-cased) in build_group_features.
#
# Several entries are deliberately or accidentally identical sets and therefore
# yield identical feature columns:
#   POSITIVE == BASIC == CATIONIC            ("KRH")
#   NEGATIVE == ACIDIC == ANIONIC            ("DE")
#   BULKY == BULKY_SIZE                      ("WFRYK" / "WFYRK")
#   RIGID == BETA_BRIDGING                   ("CWYF" / "CFYW")
#   HYDROXYL == PHOSPHORYLATION_SITES        ("STY")
#   SHEET_BREAKERS == CHARGED_SET            ("DEKR")
GROUPS = {
    # === Basic (classic physicochemical) ===
    "AROMATIC":       set("FWY"),
    "POSITIVE":       set("KRH"),
    "NEGATIVE":       set("DE"),
    "POLAR":          set("STNQYC"),
    "NON_POLAR":      set("AVLIMFWP"),
    "HYDROPHOBIC":    set("AILMFWV"),
    "ALIPHATIC":      set("AILV"),
    "SMALL":          set("AGSV"),
    "PROLINE":        set("P"),

    # === Physicochemical / functional ===
    "SULFUR":         set("CM"),
    "BASIC":          set("KRH"),          # alias of POSITIVE
    "ACIDIC":         set("DE"),           # alias of NEGATIVE
    "SMALL_FLEX":     set("AGST"),
    "BULKY":          set("WFRYK"),
    "TURN_PRONE":     set("GPND"),
    "FLEXIBLE":       set("GSDN"),
    "RIGID":          set("CWYF"),

    # === Structural propensities ===
    "HELIX_FAVORING": set("ALMQEKR"),      # alpha-helix promoters
    "SHEET_FAVORING": set("VIFYTW"),       # beta-sheet promoters
    "HELIX_BREAKERS": set("PG"),           # alpha-helix breakers
    "SHEET_BREAKERS": set("DEKR"),         # beta-sheet breakers
    "TURN_FAVORING":  set("GPNDS"),        # beta-turn prone
    "HELIX_CAPPERS":  set("NDST"),         # often found at helix termini
    "BETA_BRIDGING":  set("CFYW"),         # can bridge beta-strands

    # === Size / volume ===
    "TINY":           set("AGSC"),
    "SMALL_SIZE":     set("AGSVTP"),
    "MEDIUM_SIZE":    set("NDQEC"),
    "LARGE_SIZE":     set("WFYRKH"),
    "BULKY_SIZE":     set("WFYRK"),        # same residues as BULKY

    # === Hydrophobicity ===
    "HYDROPHOBIC_STRONG":     set("ILVFWCM"),
    "HYDROPHILIC_STRONG":     set("DEKRNHQ"),
    "MODERATELY_HYDROPHOBIC": set("ATY"),
    "MODERATELY_HYDROPHILIC": set("SGP"),

    # === Accessibility and structural position ===
    "SURFACE_PRONE":  set("DEKNRQHSTY"),   # typically surface-exposed residues
    "CORE_PRONE":     set("AILMVFWY"),     # typically buried residues

    # === Specific chemical groups ===
    "HYDROXYL":       set("STY"),
    "AMIDE":          set("NQ"),
    "CATIONIC":       set("KRH"),          # same residues as POSITIVE / BASIC
    "ANIONIC":        set("DE"),           # same residues as NEGATIVE / ACIDIC
    "NEUTRAL":        set("ACFGILMNPQSTVWY"),  # AA_STD minus D, E, H, K, R

    # === Functional / biological groups ===
    "METAL_BINDING":         set("CHDE"),
    "PHOSPHORYLATION_SITES": set("STY"),   # same residues as HYDROXYL
    "GLYCOSYLATION_SITES":   set("NST"),
    "DISULFIDE_FORMING":     set("C"),
    "ZINC_FINGER_CORE":      set("CH"),

    # === Evolutionary / conservative properties ===
    "HYDROPHOBIC_SET":      set("AVLIMFWY"),
    "POLAR_UNCHARGED_SET":  set("STNQC"),
    "CHARGED_SET":          set("DEKR"),   # same residues as SHEET_BREAKERS
    "SMALL_FLEXIBLE_SET":   set("AGP"),
}


In [None]:
def parse_fasta_idseq(path: Path) -> dict:
    """
    Parse a FASTA file and return {EntryID: cleaned_sequence}.

    - Handles UniProt headers like 'sp|ACC|NAME ...' / 'tr|ACC|NAME ...': the
      accession between the first two pipes becomes the EntryID. Any other
      header falls back to its first whitespace-delimited token, cut at the
      first '|'.
    - Keeps only the 20 standard amino acids (characters in AA_STD); every
      other character is dropped from the sequence.
    - Sequence text that appears before the first '>' header, or after a
      header with no identifier at all (a bare '>'), is ignored instead of
      crashing the parse.
    - If the same EntryID occurs more than once, the last record wins.

    Args:
        path: location of the FASTA file (read as UTF-8).

    Returns:
        dict mapping EntryID -> sequence string, in first-seen header order.

    Raises:
        FileNotFoundError: if `path` does not exist.
    """
    if not path.exists():
        raise FileNotFoundError(f"FASTA not found: {path}")

    # Collect per-record chunks and join once at the end; repeated `str +=`
    # on long multi-line sequences re-copies the growing string every line.
    chunks = {}
    pid = None  # None => no usable record is currently open
    with open(path, encoding="utf-8") as f:
        for raw in f:
            line = raw.strip()
            if not line:
                continue
            if line.startswith(">"):
                header = line[1:].strip()
                if not header:
                    # A bare '>' carries no identifier: skip the whole record.
                    pid = None
                    continue
                m = re.match(r"(?:sp|tr)\|([^|]+)\|", header)
                pid = m.group(1) if m else header.split()[0].split("|")[0]
                chunks[pid] = []  # (re)start the record; a duplicate ID overwrites
            elif pid is not None:
                chunks[pid].append("".join(c for c in line if c in AA_STD))
            # else: sequence text with no open record -> ignored
    return {entry_id: "".join(parts) for entry_id, parts in chunks.items()}

def frac_and_count(seq: str, aa_set: set) -> tuple[float, int]:
    """
    Return (fraction, count) of the characters of `seq` that are members of `aa_set`.

    An empty sequence yields (0.0, 0) rather than dividing by zero.
    """
    total = len(seq)
    if not total:
        return 0.0, 0
    hits = sum(residue in aa_set for residue in seq)
    return hits / total, hits

def build_group_features(
    fasta_path: Path,
    groups: dict[str, set],
    include_fractions: bool = True,
    include_counts: bool = True,
) -> pd.DataFrame:
    """
    Compute per-sequence length + group fractions and/or counts.

    Args:
        fasta_path: FASTA file handed to parse_fasta_idseq.
        groups: mapping of group name -> set of one-letter residue codes.
        include_fractions: emit a "<group>_frac" column per group.
        include_counts: emit a "<group>_count" column per group.

    Returns:
        DataFrame indexed by "EntryID" (sorted), with columns
        len, <group>_frac, <group>_count (group names lower-cased, in the
        iteration order of `groups`). A FASTA without any record yields an
        empty frame that still carries the full column schema.
    """
    # Fix the output schema up front. Passing it to the DataFrame constructor
    # keeps "EntryID" present even when there are no rows, so set_index below
    # no longer raises KeyError on an empty FASTA.
    columns = ["EntryID", "len"]
    for gname in groups:
        key = gname.lower()
        if include_fractions:
            columns.append(f"{key}_frac")
        if include_counts:
            columns.append(f"{key}_count")
    # Group names that differ only by case map to the same column; keep one.
    columns = list(dict.fromkeys(columns))

    rows = []
    for pid, seq in parse_fasta_idseq(fasta_path).items():
        row = {"EntryID": pid, "len": len(seq)}
        for gname, gset in groups.items():
            frac, count = frac_and_count(seq, gset)
            key = gname.lower()
            if include_fractions:
                row[f"{key}_frac"] = frac
            if include_counts:
                row[f"{key}_count"] = count
        rows.append(row)

    return pd.DataFrame(rows, columns=columns).set_index("EntryID").sort_index()

def safe_read_terms(train_path: Path) -> pd.DataFrame:
    """
    Read term annotations and normalize column names to: EntryID, term.

    The file is read as tab-separated text. For each canonical name, the first
    matching alias (compared case-insensitively) is renamed to it:

        EntryID <- "entryid", then "uniprot"
        term    <- "term",    then "goterm"

    A canonical column that is already present is left alone, and no alias is
    renamed onto it (which would create duplicate column names). If no alias
    matches, the frame is returned without that canonical column; callers are
    expected to check for it.

    Adjust the alias table here if your file uses different names.
    """
    df = pd.read_csv(train_path, sep="\t")
    low = {c.lower(): c for c in df.columns}  # lower-cased name -> actual name

    alias_table = (
        ("EntryID", ("entryid", "uniprot")),
        ("term", ("term", "goterm")),
    )
    renames = {}
    for canonical, aliases in alias_table:
        if canonical in df.columns:
            continue  # already normalized; never rename a second column onto it
        for alias in aliases:
            if alias in low:
                renames[low[alias]] = canonical
                break

    return df.rename(columns=renames)


In [None]:
# Build features
# Parses FASTA_PATH and computes, per sequence, its length plus both the
# fraction and the raw count of residues for every entry of GROUPS.
df_groups = build_group_features(
    FASTA_PATH,
    GROUPS,
    include_fractions=True,
    include_counts=True,
)

# Save outputs
df_groups.to_csv(OUT_TSV, sep="\t")
print(f"Saved TSV: {OUT_TSV.resolve()} | shape={df_groups.shape}")

# Parquet export is best-effort: any failure is printed and the notebook
# continues with the TSV only.
# NOTE(review): presumably this guards against a missing Parquet engine — confirm.
try:
    df_groups.to_parquet(OUT_PQ, index=True)
    print(f"Saved Parquet: {OUT_PQ.resolve()}")
except Exception as e:
    print(f"Parquet not written: {e}")

df_groups.head()


In [None]:
# Merge with GO terms (optional but useful for analysis).
# df_merged stays None whenever the annotation file or its EntryID column is
# unavailable; downstream analysis cells check for that. The left join keeps
# every feature row and repeats it once per annotated term.
df_merged = None
if not TRAIN_PATH.exists():
    print("TRAIN_PATH not found — skipping merge.")
else:
    df_terms = safe_read_terms(TRAIN_PATH)
    if "EntryID" not in df_terms.columns:
        print("Column 'EntryID' not found in train_terms.tsv — skipping merge.")
    else:
        features_flat = df_groups.reset_index()
        joined = features_flat.merge(df_terms, on="EntryID", how="left")
        df_merged = joined.set_index("EntryID")
        print("Merge completed:", df_merged.shape)

None if df_merged is None else df_merged.head()



In [None]:
# Compute per-term statistics (mean, std, count) for all feature columns
if df_merged is not None and "term" in df_merged.columns:
    feature_cols = [c for c in df_merged.columns if c.endswith("_frac") or c.endswith("_count") or c == "len"]
    print(f"Total feature columns: {len(feature_cols)}")

    term_stats = (
        df_merged.groupby("term")[feature_cols]
                 .agg(["mean", "std", "count"])
                 .reset_index()
    )

    # Flatten MultiIndex columns: ("len", "mean") -> "len_mean"; the ("term", "")
    # key column produced by reset_index keeps its plain name.
    term_stats.columns = [f"{a}_{b}" if b else a for a, b in term_stats.columns]
    print("Computed term-level stats:", term_stats.shape)

    # Jupyter only auto-renders the last *top-level* expression of a cell; a bare
    # `term_stats.head()` inside this `if` body is silently discarded, so render
    # it explicitly. `display` is provided by the IPython kernel.
    display(term_stats.head())
else:
    print("Skipping: df_merged or 'term' not available.")


In [None]:
# Heatmap of Z-scores (means vs global means) for top N frequent GO terms
if df_merged is not None and "term" in df_merged.columns:
    import matplotlib.pyplot as plt
    import seaborn as sns

    feature_cols = [c for c in df_merged.columns if c.endswith("_frac") or c.endswith("_count") or c == "len"]

    # Global stats
    global_means = df_merged[feature_cols].mean()
    global_stds  = df_merged[feature_cols].std().replace(0, np.nan)

    # Build term-level means matrix
    means = (
        df_merged.groupby("term")[feature_cols]
                 .mean()
    )

    # Z = (mean_term - mean_global) / std_global
    Z = (means - global_means) / global_stds

    # Pick top N terms by frequency
    top_terms = df_merged["term"].value_counts().head(15).index
    Z_sub = Z.loc[Z.index.intersection(top_terms)]

    plt.figure(figsize=(14, 8))
    sns.heatmap(Z_sub, cmap="coolwarm", center=0)
    plt.title("Feature Z-scores by GO term (Top 15 by frequency)")
    plt.xlabel("Feature")
    plt.ylabel("GO Term")
    plt.tight_layout()
    plt.show()
else:
    print("Skipping heatmap: df_merged or 'term' not available.")


In [None]:
# FASTA with the test sequences to predict (absolute Kaggle mount path).
TEST_FASTA = Path("/kaggle/input/cafa-6-protein-function-prediction/Test/testsuperset.fasta")

# OUT_DIR repeats the definition from the configuration cell (same value).
OUT_DIR = Path("outputs")
OUT_DIR.mkdir(exist_ok=True, parents=True)
# NOTE(review): the streaming inference cell later rebinds SUB_PATH to
# Path("submission.tsv"), so this value is not the file that gets written.
SUB_PATH = OUT_DIR / "submission_cafa6.tsv"

In [None]:
# Read training annotations; expect columns: EntryID, term
df_terms = pd.read_csv(TRAIN_PATH, sep="\t")
assert {"EntryID", "term"}.issubset(df_terms.columns)

# 1) Build vocabulary of GO terms from training (sorted -> stable column order of Y)
go_terms = sorted(df_terms["term"].unique())
term_to_idx = {t: i for i, t in enumerate(go_terms)}

# 2) Build Y matrix (multi-hot), rows aligned with df_groups.index.
#    Every (EntryID, term) annotation is translated to a (row, column) pair and
#    written in one vectorized assignment instead of a per-protein groupby.apply
#    followed by a nested Python loop. Proteins without any annotation simply
#    keep an all-zero row; annotations for IDs absent from df_groups are skipped.
assert df_groups.index.is_unique, "df_groups must have one row per EntryID"
row_idx = df_groups.index.get_indexer(df_terms["EntryID"])   # -1 => EntryID not in df_groups
col_idx = pd.Index(go_terms).get_indexer(df_terms["term"])   # always >= 0: vocabulary comes from df_terms
in_features = row_idx >= 0

# NOTE: Y is a dense float32 array of shape (len(df_groups), len(go_terms)).
Y = np.zeros((len(df_groups), len(go_terms)), dtype=np.float32)
Y[row_idx[in_features], col_idx[in_features]] = 1.0

# 3) Feature matrix X (same row order as Y)
feature_cols = [c for c in df_groups.columns if c.endswith("_frac") or c.endswith("_count") or c == "len"]
X = df_groups[feature_cols].astype(np.float32).values

X.shape, Y.shape, len(go_terms)


In [None]:
# Sanity checks on the matrices built in the previous cell.
# This cell used to be a verbatim copy of that cell: it re-read train_terms.tsv
# and rebuilt go_terms / Y / X a second time with identical results. It now only
# verifies the invariants the rest of the notebook relies on.
assert X.shape[0] == Y.shape[0] == len(df_groups), "X and Y rows must align with df_groups"
assert X.shape[1] == len(feature_cols), "X columns must match feature_cols"
assert Y.shape[1] == len(go_terms) == len(term_to_idx), "Y columns must match the GO vocabulary"
assert np.isfinite(X).all(), "X contains NaN or infinite feature values"

X.shape, Y.shape, len(go_terms)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split FIRST, then fit the scaler on the training rows only. Fitting on the
# full matrix before splitting leaks the validation rows' mean/std into the
# training features and makes validation scores optimistic.
has_label = Y.sum(axis=1) > 0
# Stratify on "has at least one GO term" when possible; a stratified split needs
# at least two rows in every class, otherwise fall back to a plain split.
_, class_counts = np.unique(has_label, return_counts=True)
stratify_on = has_label if class_counts.min() >= 2 else None

X_tr_raw, X_va_raw, Y_tr, Y_va = train_test_split(
    X, Y, test_size=0.15, random_state=42, stratify=stratify_on
)

# Standardize features (helps NN training)
scaler = StandardScaler(with_mean=True, with_std=True)
X_tr = scaler.fit_transform(X_tr_raw)
X_va = scaler.transform(X_va_raw)

# Kept for backward compatibility with code that expects the fully scaled matrix.
X_scaled = scaler.transform(X)

X_tr.shape, X_va.shape, Y_tr.shape, Y_va.shape


In [None]:
# Disabled baseline: everything between the triple quotes below is a single
# string literal, so none of it executes. It holds an earlier fixed-architecture
# model (two hidden layers, threshold 0.2), apparently superseded by the
# Optuna-tuned build_model/objective cells — TODO confirm.
# NOTE(review): if this string is the last expression of its cell, Jupyter
# echoes the whole text as cell output — consider deleting the cell.
'''
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

n_in  = X_tr.shape[1]
n_out = Y_tr.shape[1]

def make_model(n_in, n_out, hidden=256, dropout=0.25):
    inp = keras.Input(shape=(n_in,))
    x = layers.BatchNormalization()(inp)
    x = layers.Dense(hidden, activation="relu")(x)
    x = layers.Dropout(dropout)(x)
    x = layers.Dense(hidden//2, activation="relu")(x)
    x = layers.Dropout(dropout)(x)
    out = layers.Dense(n_out, activation="sigmoid")(x)  # one prob per GO term
    model = keras.Model(inp, out)
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=1e-3),
        loss="binary_crossentropy",
        metrics=[keras.metrics.AUC(curve="PR", multi_label=True, num_labels=n_out, name="PR-AUC")]
    )
    return model

model = make_model(n_in, n_out)
cb = [
    keras.callbacks.EarlyStopping(monitor="val_PR-AUC", mode="max", patience=5, restore_best_weights=True),
    keras.callbacks.ReduceLROnPlateau(monitor="val_PR-AUC", mode="max", factor=0.5, patience=2, min_lr=1e-5),
]
hist = model.fit(
    X_tr, Y_tr,
    validation_data=(X_va, Y_va),
    epochs=50,
    batch_size=512,
    callbacks=cb,
    verbose=2
)

from sklearn.metrics import f1_score

Y_va_pred = (model.predict(X_va, batch_size=1024) > 0.2).astype(int)  # threshold can be tuned
micro_f1 = f1_score(Y_va.ravel(), Y_va_pred.ravel(), average="binary")
print("Validation micro-F1 (th=0.2):", round(micro_f1, 4))
'''

# safe evaluation helpers

In [None]:
def predict_in_batches(model, X, batch_size=1024):
    """
    Run `model.predict` over consecutive row slices of `X` and stack the results.

    Each slice holds at most `batch_size` rows, so only one slice is handed to
    the model at a time. The per-slice outputs are stacked vertically in input
    order with np.vstack.
    """
    total_rows = X.shape[0]
    pieces = [
        model.predict(
            X[lo:min(lo + batch_size, total_rows)],
            batch_size=batch_size,
            verbose=0,
        )
        for lo in range(0, total_rows, batch_size)
    ]
    return np.vstack(pieces)

def best_threshold_by_f1(y_true, y_prob, thresholds=None, batch_size=1024):
    """
    Find the probability threshold in [0,1] that maximizes micro-F1.

    Micro-F1 over a multi-label matrix equals the binary F1 of the flattened
    matrix, F1 = 2*TP / (2*TP + FP + FN). It is computed here directly from
    boolean counts, which avoids materializing an integer copy of the
    prediction matrix and a full scikit-learn metric call per threshold.

    Args:
        y_true: (N, C) binary labels (0/1; any dtype castable to bool).
        y_prob: (N, C) float scores in [0,1].
        thresholds: list or np.array of thresholds to scan. Defaults to a grid
            that is denser below 0.5 and close to 1.0.
        batch_size: unused; kept only so existing call sites keep working.

    Returns:
        (best_t, best_f1): the first threshold (in scan order) reaching the
        highest F1, and that F1. F1 is 0.0 when there are neither positive
        labels nor positive predictions. With an empty `thresholds` the
        initial values (0.5, -1.0) are returned.
    """
    if thresholds is None:
        thresholds = np.concatenate([
            np.linspace(0.02, 0.5, 25),
            np.linspace(0.5, 0.9, 9),
            np.linspace(0.9, 0.99, 10)
        ])

    truth = np.asarray(y_true).ravel().astype(bool, copy=False)
    probs = np.asarray(y_prob).ravel()
    n_pos = int(np.count_nonzero(truth))  # TP + FN, independent of the threshold

    best_f1, best_t = -1.0, 0.5
    for t in thresholds:
        pred = probs >= t
        tp = int(np.count_nonzero(pred & truth))
        # (TP + FP) + (TP + FN) == 2*TP + FP + FN
        denom = int(np.count_nonzero(pred)) + n_pos
        f1 = (2.0 * tp / denom) if denom else 0.0
        if f1 > best_f1:
            best_f1, best_t = f1, float(t)
    return best_t, best_f1


# Objective for Optuna

In [None]:
def build_model(n_in, n_out, params):
    """
    Assemble and compile a Keras multi-label classifier from a hyperparameter dict.

    Keys read from `params`:
      - bn_first:   if truthy, a BatchNormalization layer is applied to the input
      - n_layers:   number of hidden Dense blocks
      - hidden:     units of the first hidden layer; every later hidden layer
                    uses max(hidden // 2, 64)
      - l2:         L2 kernel-regularization factor (no regularizer when <= 0)
      - activation: 'gelu' selects tf.nn.gelu, anything else selects 'relu'
      - dropout:    dropout rate after each hidden block (no layer when <= 0)
      - lr:         Adam learning rate

    The output layer has `n_out` sigmoid units (one probability per label).
    The model is compiled with binary cross-entropy and a multi-label PR-AUC
    metric named "PR-AUC".
    """
    inputs = keras.Input(shape=(n_in,))
    hidden_state = layers.BatchNormalization()(inputs) if params["bn_first"] else inputs

    for depth in range(params["n_layers"]):
        units = params["hidden"] if depth == 0 else max(params["hidden"] // 2, 64)
        l2_reg = regularizers.l2(params["l2"]) if params["l2"] > 0 else None
        # Linear Dense followed by a separate Activation layer
        hidden_state = layers.Dense(units, activation=None, kernel_regularizer=l2_reg)(hidden_state)
        nonlinearity = tf.nn.gelu if params["activation"] == "gelu" else "relu"
        hidden_state = layers.Activation(nonlinearity)(hidden_state)
        if params["dropout"] > 0:
            hidden_state = layers.Dropout(params["dropout"])(hidden_state)

    outputs = layers.Dense(n_out, activation="sigmoid")(hidden_state)
    model = keras.Model(inputs, outputs)

    # num_labels is passed explicitly alongside multi_label=True
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=params["lr"]),
        loss="binary_crossentropy",
        metrics=[keras.metrics.AUC(curve="PR", multi_label=True, num_labels=n_out, name="PR-AUC")],
    )
    return model


def objective(trial: optuna.Trial):
    """
    Optuna objective: train one candidate model and return its validation PR-AUC.

    Reads the notebook globals X_tr, Y_tr, X_va, Y_va prepared by the
    split/scale cell. Hyperparameters are sampled from `trial`, a model is
    built with build_model, trained with early stopping, LR reduction and the
    Optuna pruning callback, then evaluated on the validation split.

    The model and the Keras session are released in a `finally` block, so the
    cleanup also runs when the pruning callback aborts `fit` by raising or when
    training fails — otherwise every pruned trial would leave its graph and
    weights allocated for the rest of the study.

    Returns:
        float: validation PR-AUC (higher is better).
    """
    n_in = X_tr.shape[1]
    n_out = Y_tr.shape[1]

    params = {
        "n_layers":  trial.suggest_int("n_layers", 1, 3),
        "hidden": trial.suggest_categorical("hidden", [128, 256, 384]),
        "dropout":   trial.suggest_float("dropout", 0.0, 0.6),
        # log-uniform sampling needs a strictly positive lower bound
        "l2":        trial.suggest_float("l2", 1e-8, 1e-3, log=True),
        "lr":        trial.suggest_float("lr", 1e-4, 5e-3, log=True),
        "bn_first":  trial.suggest_categorical("bn_first", [True, False]),
        "activation":trial.suggest_categorical("activation", ["relu", "gelu"]),
        "batch_size": trial.suggest_categorical("batch_size", [128, 256]),
        "epochs":    trial.suggest_int("epochs", 10, 60),
        "patience":  trial.suggest_int("patience", 3, 8),
    }

    callbacks = [
        keras.callbacks.EarlyStopping(
            monitor="val_PR-AUC",
            mode="max",
            patience=params["patience"],
            restore_best_weights=True
        ),
        keras.callbacks.ReduceLROnPlateau(
            monitor="val_PR-AUC",
            mode="max",
            factor=0.5,
            patience=max(1, params["patience"] // 2),
            min_lr=1e-5
        ),
        TFKerasPruningCallback(trial, monitor="val_PR-AUC"),
    ]

    model = None
    try:
        model = build_model(n_in, n_out, params)
        model.fit(
            X_tr, Y_tr,
            validation_data=(X_va, Y_va),
            epochs=params["epochs"],
            batch_size=params["batch_size"],
            verbose=0,
            callbacks=callbacks
        )

        # Evaluate PR-AUC on validation set (higher is better)
        # model.evaluate returns [loss, PR-AUC]
        _, val_pr_auc = model.evaluate(X_va, Y_va, batch_size=1024, verbose=0)
    finally:
        # cleanup to release GPU memory — also on pruned / failed trials
        del model
        K.clear_session()
        gc.collect()

    # Report to Optuna
    return float(val_pr_auc)


# Run the study

In [None]:
# Optional: make TF use memory growth to reduce OOM risk
# Best-effort: any failure is printed and the study still runs.
try:
    gpus = tf.config.list_physical_devices('GPU')
    for g in gpus:
        tf.config.experimental.set_memory_growth(g, True)
except Exception as e:
    print(f"GPU memory growth not set: {e}")

study_name = "keras_cafa6_pr_auc"
storage = None  # e.g., "sqlite:///optuna.db" if you want persistence

# Median pruning with 3 warm-up steps; seeded multivariate TPE sampler.
pruner = optuna.pruners.MedianPruner(n_warmup_steps=3)
sampler = optuna.samplers.TPESampler(seed=42, multivariate=True)

# direction="maximize" because objective() returns validation PR-AUC.
study = optuna.create_study(
    study_name=study_name,
    direction="maximize",
    sampler=sampler,
    pruner=pruner,
    storage=storage,
    load_if_exists=False
)

N_TRIALS = 8  # number of trials passed to study.optimize
study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=True)




In [None]:
import json

# Summarize the finished study: best objective value and the sampled
# hyperparameters of the trial that achieved it, pretty-printed as JSON.
print("Best value (val PR-AUC):", study.best_value)
print("Best trial params:")
print(json.dumps(study.best_trial.params, indent=2))

# Retrain best model, find best threshold on validation

In [None]:
# Rebuild and retrain a model from the best trial's hyperparameters, using the
# same train/validation split and the same monitoring metric as the study
# (but without the Optuna pruning callback).
# NOTE(review): no decision threshold is tuned in this cell;
# best_threshold_by_f1 is defined earlier in the notebook but not called here.
best_params = study.best_trial.params
n_in, n_out = X_tr.shape[1], Y_tr.shape[1]

best_model = build_model(n_in, n_out, best_params)

patience = best_params.get("patience", 5)
early_stopping = keras.callbacks.EarlyStopping(
    monitor="val_PR-AUC",
    mode="max",
    patience=patience,
    restore_best_weights=True,
)
lr_on_plateau = keras.callbacks.ReduceLROnPlateau(
    monitor="val_PR-AUC",
    mode="max",
    factor=0.5,
    patience=max(1, patience // 2),
    min_lr=1e-5,
)
callbacks = [early_stopping, lr_on_plateau]

history = best_model.fit(
    X_tr,
    Y_tr,
    validation_data=(X_va, Y_va),
    epochs=best_params.get("epochs", 30),
    batch_size=best_params.get("batch_size", 512),
    verbose=2,
    callbacks=callbacks,
)

# Later cells (saving, inference) refer to the trained network as `model`.
model = best_model

In [None]:
# Build group features for the test set; reuse your helper
df_test = build_group_features(TEST_FASTA, GROUPS, include_fractions=True, include_counts=True)

# Transform with the fitted scaler and select the same feature columns
X_test = df_test[feature_cols].astype(np.float32).values
X_test_scaled = scaler.transform(X_test)

df_test.shape



In [None]:
# Alias the retrained network as `model` for the save / inference cells.
# NOTE(review): the retraining cell already ends with this same assignment, so this cell is redundant.
model = best_model

# Save model, scaler, and metadata

In [None]:
# --- SAVE MODEL, SCALER, AND TEST DATA --------------------------------------
# Persists everything the inference cell needs into outputs/checkpoints so that
# prediction can be re-run from disk without retraining.

from pathlib import Path
import joblib
import json
import numpy as np
import pandas as pd

SAVE_DIR = Path("outputs/checkpoints")
SAVE_DIR.mkdir(parents=True, exist_ok=True)

# 1) Save Keras model
model_path = SAVE_DIR / "cafa6_model.h5"
model.save(model_path)
print(f"Saved model: {model_path.resolve()}")

# 2) Save fitted StandardScaler
scaler_path = SAVE_DIR / "scaler.joblib"
joblib.dump(scaler, scaler_path)
print(f"Saved scaler: {scaler_path.resolve()}")

# 3) Save metadata (feature names, GO terms, and amino acid groups)
# Sets are not JSON-serializable, so each group is stored as a sorted list.
meta = {
    "feature_cols": feature_cols,
    "go_terms": go_terms,
    "term_to_idx": term_to_idx,
    "idx_to_term": go_terms,  # same list as "go_terms": position i holds the term of output column i
    "groups": {k: sorted(list(v)) for k, v in GROUPS.items()},
}
meta_path = SAVE_DIR / "metadata.json"
with open(meta_path, "w") as f:
    json.dump(meta, f, indent=2)
print(f"Saved metadata: {meta_path.resolve()}")

# 4) Save test feature matrices and dataframe
# Both the raw and the scaled matrix are stored; df_test keeps the EntryID index.
np.save(SAVE_DIR / "X_test.npy", X_test)
np.save(SAVE_DIR / "X_test_scaled.npy", X_test_scaled)
df_test.to_csv(SAVE_DIR / "df_test.tsv", sep="\t")
print("Saved X_test, X_test_scaled, and df_test.")

print("\nAll training artifacts and test features have been saved successfully.")


# Load model, scaler, metadata, and test data

In [None]:
# --- LOAD MODEL, SCALER, METADATA, AND TEST DATA ----------------------------
# Reloads the artifacts written by the save cell and rebinds the notebook
# globals (model, scaler, feature_cols, go_terms, term_to_idx, GROUPS, X_test,
# X_test_scaled, df_test) from disk.

from pathlib import Path
import joblib
import json
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras

LOAD_DIR = Path("outputs/checkpoints")

# 1) Load Keras model
model = keras.models.load_model(LOAD_DIR / "cafa6_model.h5")
print("Model loaded.")

# 2) Load fitted StandardScaler
# NOTE(review): joblib.load unpickles the file; only load checkpoints you produced yourself.
scaler = joblib.load(LOAD_DIR / "scaler.joblib")
print("Scaler loaded.")

# 3) Load metadata
with open(LOAD_DIR / "metadata.json") as f:
    meta = json.load(f)

feature_cols = meta["feature_cols"]
go_terms = meta["go_terms"]
term_to_idx = meta["term_to_idx"]
# Groups were stored as sorted lists; turn them back into sets.
GROUPS = {k: set(v) for k, v in meta["groups"].items()}

print("Metadata loaded.")

# 4) Load test matrices and dataframe
# index_col=0 restores the first TSV column (the EntryID index written by to_csv) as the index.
X_test = np.load(LOAD_DIR / "X_test.npy")
X_test_scaled = np.load(LOAD_DIR / "X_test_scaled.npy")
df_test = pd.read_csv(LOAD_DIR / "df_test.tsv", sep="\t", index_col=0)

print("Test data loaded.")
print(f"X_test: {X_test.shape}, X_test_scaled: {X_test_scaled.shape}, df_test: {df_test.shape}")
print("All components successfully reloaded. Ready for inference.")


In [None]:
# --- PREDICT ON TEST IN STREAMING, POST-PROCESS, WRITE TSV ON THE FLY --------
# Predicts the test set in small batches and writes one "target<TAB>GO<TAB>score"
# line per retained (protein, term) pair straight to SUB_PATH, so the full
# (n_test, n_terms) prediction matrix is never held in memory.
from pathlib import Path
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras

# (Optional) enable GPU memory growth to avoid pre-allocating all VRAM
try:
    gpus = tf.config.list_physical_devices('GPU')
    for g in gpus:
        tf.config.experimental.set_memory_growth(g, True)
except Exception as e:
    print(f"GPU memory growth not set: {e}")

# 0) Required globals and paths
OUT_DIR = Path("outputs")
OUT_DIR.mkdir(parents=True, exist_ok=True)
# NOTE(review): the submission is written to the working directory, not into
# OUT_DIR, and this rebinds the SUB_PATH defined in an earlier cell.
SUB_PATH =  Path("submission.tsv")        # no header, tab-separated

# Fail early with a readable message if an upstream cell has not been run.
required = ["model", "df_test", "X_test", "X_test_scaled", "feature_cols", "go_terms"]
missing = [k for k in required if k not in globals()]
if missing:
    raise RuntimeError(f"Missing required variables: {missing}. Ensure training/feature cells ran.")

# 1) Sanity checks and shape guards
assert all(c in df_test.columns for c in feature_cols), "Feature columns mismatch between train and test."
X_test = np.asarray(X_test, dtype=np.float32, order="C")
X_test_scaled = np.asarray(X_test_scaled, dtype=np.float32, order="C")
assert X_test.shape == X_test_scaled.shape, "X_test and X_test_scaled must have identical shapes."
n_test, n_feat = X_test_scaled.shape
go_terms = list(go_terms)              # ensure list
idx_to_term = np.asarray(go_terms, dtype=str)   # array form allows boolean-mask selection below
n_terms = len(idx_to_term)
test_ids = df_test.index.astype(str).to_numpy()
assert len(test_ids) == n_test, "df_test index length must match X_test rows."

# 2) CAFA post-processing parameters
THRESH = 0.1                  # drop probabilities <= threshold to avoid zeros
TOP_K = None                  # set e.g. 500 to pre-cap per target; None keeps all above THRESH
MAX_PAIRS_PER_TARGET = 1500   # global cap per target across MF/BP/CC
BATCH = 128                   # reduce if you still hit OOM

# 3) Streamed inference and direct writing to disk
with SUB_PATH.open("w", encoding="utf-8") as fout:
    for start in range(0, n_test, BATCH):
        end = min(start + BATCH, n_test)
        xb = X_test_scaled[start:end]                            # view without extra copy
        # Predict a small batch to limit RAM/VRAM usage
        preds = model.predict(xb, batch_size=BATCH, verbose=0)  # shape (batch, n_terms)
        if preds.dtype != np.float32:
            preds = preds.astype(np.float32, copy=False)
        # Process each row and emit lines directly
        for i in range(end - start):
            pid = test_ids[start + i]
            probs = np.nan_to_num(preds[i], nan=0.0)          # guard against NaNs
            mask = probs > THRESH
            if not np.any(mask):
                # If a target is not listed, evaluators assume zero for all terms
                continue
            sel_terms = idx_to_term[mask]
            sel_probs = probs[mask]
            # `order` indexes into the masked arrays (sel_terms / sel_probs), not into go_terms
            order = np.argsort(-sel_probs)                    # descending by prob
            if TOP_K is not None and len(order) > TOP_K:
                order = order[:TOP_K]
            # Enforce CAFA global cap per target
            order = order[:min(len(order), MAX_PAIRS_PER_TARGET)]
            # Write lines: target \t GO \t score (3 significant figures, (0,1])
            # NOTE(review): "%.3g" switches to exponent notation for values below 1e-4;
            # that cannot occur while THRESH stays at 0.1 — re-check the format if THRESH is lowered.
            for j in order:
                p = float(sel_probs[j])
                if p <= 0.0:
                    continue
                if p > 1.0:
                    p = 1.0
                fout.write(f"{pid}\t{sel_terms[j]}\t{p:.3g}\n")

print(f"Submission written to: {SUB_PATH.resolve()}")



In [None]:
# Print the layer-by-layer architecture of the model used for the submission.
model.summary()