# Notebook 03b - Multi-target Models for CLD (Early -> Late Prediction)

## Goal
Train separate regression models that predict late-stage outcomes using **early-only features**:
- Stability: productivity_drop_pct (lower is better)
- Productivity: late_mean_titer (higher is better)
- Quality: late_mean_aggregation (lower is better)

These predicted values will be used in Notebook 04 to perform **predicted-late-based clone selection**, wihch mirrors real CLD projects where late data is not available at decision time.

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

In [2]:
# Load dataset with three target

DATA_PATH = "/Users/sang/CLD_ML_PIPELINE/data/synthetic/processed/cld_features_with_labels_3targets_v2.csv"
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,clone_id,titer_mean,titer_std,titer_min,titer_max,vcd_mean,vcd_std,vcd_min,vcd_max,viability_mean,...,qP_p10,titer_cv,vcd_cv,viability_cv,aggregation_cv,culture_mode_fed-batch,culture_mode_perfusion,productivity_drop_pct,late_mean_titer,late_mean_aggregation
0,CLONE_0001,2.665436,0.145412,2.464814,2.852368,10632900.0,1025472.0,9197038.0,12361790.0,93.637077,...,2.001998e-07,0.054555,0.096443,0.008769,0.071216,True,False,0.229719,2.053135,4.310553
1,CLONE_0002,0.834691,0.191151,0.516513,1.171273,15128100.0,597750.6,14076260.0,16051270.0,96.283457,...,5.065544e-08,0.229008,0.039513,0.013424,0.121568,True,False,0.356246,0.537335,3.259003
2,CLONE_0003,3.990484,0.175857,3.722491,4.270057,8411914.0,1150419.0,6047146.0,9506059.0,93.278459,...,4.301554e-07,0.044069,0.136761,0.016516,0.056506,True,False,0.281589,2.866808,5.945068
3,CLONE_0004,0.540821,0.154336,0.333873,0.749828,15112980.0,605067.5,14481560.0,16263420.0,96.187877,...,2.266833e-08,0.285374,0.040036,0.021262,0.036466,True,False,0.02616,0.526673,7.351199
4,CLONE_0005,2.16281,0.124723,1.928686,2.355251,11810710.0,732115.7,10921310.0,13285170.0,95.670482,...,1.899366e-07,0.057667,0.061987,0.014832,0.406579,True,False,0.382269,1.336034,1.133822


In [3]:
# Preapre X and 3 ys

targets = ["productivity_drop_pct", "late_mean_titer", "late_mean_aggregation"]

clone_ids = df["clone_id"].copy()

X = df.drop(columns=["clone_id"] + targets).copy()
X = X.fillna(X.median(numeric_only=True))

y_drop = df["productivity_drop_pct"].copy().clip(lower=0.0, upper=1.0)   # stability: clamp to [0,1]
y_titer = df["late_mean_titer"].copy()                                   # late productivity
y_agg = df["late_mean_aggregation"].copy().clip(lower=0.0, upper=100.0)  # quality proxy: clamp to [0,100]

print("X shape:", X.shape)
print("y_drop:", y_drop.shape, "y_titer:", y_titer.shape, "y_agg:", y_agg.shape)

X shape: (2000, 44)
y_drop: (2000,) y_titer: (2000,) y_agg: (2000,)


In [4]:
# Split train/test by clone_id to avoid data leakage

X_train, X_test, id_train, id_test = train_test_split(
    X, clone_ids, test_size=0.2, random_state=42
)

# Align y by index using clone_id (safer than relying on row order)
train_mask = df["clone_id"].isin(id_train)
test_mask  = df["clone_id"].isin(id_test)

y_drop_train, y_drop_test = y_drop[train_mask].values, y_drop[test_mask].values
y_titer_train, y_titer_test = y_titer[train_mask].values, y_titer[test_mask].values
y_agg_train, y_agg_test = y_agg[train_mask].values, y_agg[test_mask].values

print("Train size:", len(id_train), "Test size:", len(id_test))

Train size: 1600 Test size: 400


In [5]:
# Train 3 Random Forest models, one for each target

def fit_rf(X_train, y_train):
    model = RandomForestRegressor(
        n_estimators=600,
        random_state=42,
        min_samples_leaf=5,
        max_features="sqrt"
    )
    model.fit(X_train, y_train)
    return model

rf_drop = fit_rf(X_train, y_drop_train)
rf_titer = fit_rf(X_train, y_titer_train)
rf_agg = fit_rf(X_train, y_agg_train)

In [6]:
# Evaluate models on test set

def eval_model(name, model, X_test, y_test):
    pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, pred)
    r2 = r2_score(y_test, pred)
    print(f"{name:18s}  MAE={mae:.4f}   R2={r2:.4f}")
    return pred

pred_drop = eval_model("drop (stability)", rf_drop, X_test, y_drop_test)
pred_titer = eval_model("late_titer", rf_titer, X_test, y_titer_test)
pred_agg = eval_model("late_agg", rf_agg, X_test, y_agg_test)

drop (stability)    MAE=0.1050   R2=-0.0321
late_titer          MAE=0.8622   R2=-0.0193
late_agg            MAE=2.2551   R2=-0.0314


In [7]:
# Save predictions to CSV

pred_table = pd.DataFrame({
    "clone_id": id_test.values,
    "true_drop": y_drop_test,
    "pred_drop": pred_drop,
    "true_late_titer": y_titer_test,
    "pred_late_titer": pred_titer,
    "true_late_agg": y_agg_test,
    "pred_late_agg": pred_agg
})

OUT_PRED = "../data/synthetic/processed/predictions_testset_3targets.csv"
pred_table.to_csv(OUT_PRED, index=False)
print("Saved:", OUT_PRED)

pred_table.head()

Saved: ../data/synthetic/processed/predictions_testset_3targets.csv


Unnamed: 0,clone_id,true_drop,pred_drop,true_late_titer,pred_late_titer,true_late_agg,pred_late_agg
0,CLONE_1861,0.413428,0.278215,0.87811,1.595899,3.939495,4.929371
1,CLONE_0354,0.352329,0.294602,1.120978,1.457162,4.282821,4.279169
2,CLONE_1334,0.198136,0.276187,1.417534,1.674085,4.19076,6.532866
3,CLONE_0906,0.150225,0.273525,0.874246,1.384532,5.308029,5.185922
4,CLONE_1290,0.26308,0.279548,2.080281,1.532157,6.445204,4.49797


In [8]:
# --- Spearman + TopK evaluation ---
import numpy as np
import pandas as pd

# If scipy is available, use it. If not, fallback to pandas rank corr.
try:
    from scipy.stats import spearmanr
    def spearman(a, b):
        return spearmanr(a, b).correlation
except Exception:
    def spearman(a, b):
        return pd.Series(a).rank().corr(pd.Series(b).rank())

def topk_overlap(true_scores, pred_scores, k):
    """
    Returns overlap fraction between true top-k and predicted top-k sets.
    """
    true_top = set(pd.Series(true_scores).nlargest(k).index)
    pred_top = set(pd.Series(pred_scores).nlargest(k).index)
    return len(true_top & pred_top) / k

def topk_recall_of_true_good(pred_rank, true_good_mask, k):
    """
    Among predicted top-k, what fraction are truly good?
    """
    top_idx = pred_rank.nlargest(k).index
    return true_good_mask.loc[top_idx].mean()

df_eval = pred_table.copy()

# 1) Spearman correlations (ranking quality)
# For drop: lower is better, so we correlate (-drop) to keep "higher is better" convention
print("=== Spearman (ranking correlation) ===")
print("drop (lower better):", spearman(-df_eval["true_drop"], -df_eval["pred_drop"]))
print("late_titer (higher better):", spearman(df_eval["true_late_titer"], df_eval["pred_late_titer"]))
print("late_agg (lower better):", spearman(-df_eval["true_late_agg"], -df_eval["pred_late_agg"]))

# 2) Define a "true goodness" utility for evaluation (synthetic world has true late)
# This is *evaluation only* (offline). In real projects we won't have true late at decision time.
def z(s):
    return (s - s.mean()) / (s.std(ddof=0) + 1e-9)

# weights for evaluation utility (tune if you want)
A_TITER = 1.0
B_DROP  = 1.0
C_AGG   = 0.5

df_eval["true_utility"] = (
    A_TITER * z(df_eval["true_late_titer"])
    - B_DROP * z(df_eval["true_drop"])
    - C_AGG * z(df_eval["true_late_agg"])
)

df_eval["pred_utility"] = (
    A_TITER * z(df_eval["pred_late_titer"])
    - B_DROP * z(df_eval["pred_drop"])
    - C_AGG * z(df_eval["pred_late_agg"])
)

# 3) Top-K overlap and precision-like metrics for decisions
K_LIST = [5, 10, 20, 50]  # adjust to your CLD stage size

print("\n=== Top-K evaluation using utility ===")
for k in K_LIST:
    overlap = topk_overlap(df_eval["true_utility"], df_eval["pred_utility"], k)
    print(f"Top-{k} overlap (true vs pred utility): {overlap:.3f}")

# 4) Top-K "true-good" rate among predicted top-k
# Define "true good" clones as the top X% by true_utility (offline definition)
GOOD_FRAC = 0.10
good_threshold = df_eval["true_utility"].quantile(1 - GOOD_FRAC)
df_eval["true_good"] = df_eval["true_utility"] >= good_threshold

print("\n=== Top-K true-good rate among predicted picks ===")
pred_rank = df_eval["pred_utility"]
for k in K_LIST:
    rate = topk_recall_of_true_good(pred_rank, df_eval["true_good"], k)
    print(f"Predicted top-{k}: fraction that are truly top {int(GOOD_FRAC*100)}% good = {rate:.3f}")

=== Spearman (ranking correlation) ===
drop (lower better): -0.0822356640498116
late_titer (higher better): -0.09109125682035511
late_agg (lower better): -0.03314495715598222

=== Top-K evaluation using utility ===
Top-5 overlap (true vs pred utility): 0.000
Top-10 overlap (true vs pred utility): 0.000
Top-20 overlap (true vs pred utility): 0.050
Top-50 overlap (true vs pred utility): 0.040

=== Top-K true-good rate among predicted picks ===
Predicted top-5: fraction that are truly top 10% good = 0.200
Predicted top-10: fraction that are truly top 10% good = 0.100
Predicted top-20: fraction that are truly top 10% good = 0.050
Predicted top-50: fraction that are truly top 10% good = 0.040
