# Notebook 03b - Multi-target Models for CLD (Early -> Late Prediction)

## Goal
Train separate regression models that predict late-stage outcomes using **early-only features**:
- Stability: productivity_drop_pct (lower is better)
- Productivity: late_mean_titer (higher is better)
- Quality: late_mean_aggregation (lower is better)

These predicted values will be used in Notebook 04 to perform **predicted-late-based clone selection**, wihch mirrors real CLD projects where late data is not available at decision time.

In [11]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

In [12]:
# Load dataset with three target

DATA_PATH = "/Users/sang/CLD_ML_PIPELINE/data/synthetic/processed/cld_features_with_labels_3targets_v2_24_30.csv"
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,clone_id,titer_mean,titer_std,titer_min,titer_max,vcd_mean,vcd_std,vcd_min,vcd_max,viability_mean,...,titer_cv,vcd_cv,viability_cv,aggregation_cv,culture_mode_fed-batch,culture_mode_perfusion,ddpcr_cn,productivity_drop_pct,late_mean_titer,late_mean_aggregation
0,CLONE_0001,3.135307,0.15306,2.908198,3.361966,11158540.0,775266.9,10096740.0,12294130.0,94.340916,...,0.048818,0.069477,0.013108,0.047273,False,True,2.0,0.271237,2.289753,4.902667
1,CLONE_0002,1.089709,0.201227,0.881955,1.476979,14533580.0,569550.1,13359300.0,15132320.0,96.108006,...,0.184661,0.039189,0.019557,0.096506,True,False,3.0,0.52492,0.581398,3.686371
2,CLONE_0003,4.715356,0.202982,4.36331,4.991713,9132412.0,799242.1,7744497.0,10047470.0,93.691616,...,0.043047,0.087517,0.020809,0.061906,True,False,2.0,0.338851,3.146742,6.581963
3,CLONE_0004,0.729517,0.140272,0.541439,0.88702,15322590.0,1022267.0,13804880.0,16854200.0,97.318163,...,0.192281,0.066716,0.013987,0.02734,True,False,2.0,0.646568,0.314086,8.374103
4,CLONE_0005,2.480311,0.215895,2.122646,2.781607,11696200.0,1088390.0,9654663.0,13619380.0,95.337131,...,0.087044,0.093055,0.020933,0.146547,True,False,3.0,0.492373,1.350038,1.890133


In [13]:
# Preapre X and 3 ys

targets = ["productivity_drop_pct", "late_mean_titer", "late_mean_aggregation"]

clone_ids = df["clone_id"].copy()

X = df.drop(columns=["clone_id"] + targets).copy()
X = X.fillna(X.median(numeric_only=True))

y_drop = df["productivity_drop_pct"].copy().clip(lower=0.0, upper=1.0)   # stability: clamp to [0,1]
y_titer = df["late_mean_titer"].copy()                                   # late productivity
y_agg = df["late_mean_aggregation"].copy().clip(lower=0.0, upper=100.0)  # quality proxy: clamp to [0,100]

print("X shape:", X.shape)
print("y_drop:", y_drop.shape, "y_titer:", y_titer.shape, "y_agg:", y_agg.shape)

X shape: (2000, 45)
y_drop: (2000,) y_titer: (2000,) y_agg: (2000,)


In [14]:
# Split train/test by clone_id to avoid data leakage

X_train, X_test, id_train, id_test = train_test_split(
    X, clone_ids, test_size=0.2, random_state=42
)

# Align y by index using clone_id (safer than relying on row order)
train_mask = df["clone_id"].isin(id_train)
test_mask  = df["clone_id"].isin(id_test)

y_drop_train, y_drop_test = y_drop[train_mask].values, y_drop[test_mask].values
y_titer_train, y_titer_test = y_titer[train_mask].values, y_titer[test_mask].values
y_agg_train, y_agg_test = y_agg[train_mask].values, y_agg[test_mask].values

print("Train size:", len(id_train), "Test size:", len(id_test))

Train size: 1600 Test size: 400


In [15]:
# Train 3 Random Forest models, one for each target

def fit_rf(X_train, y_train):
    model = RandomForestRegressor(
        n_estimators=600,
        random_state=42,
        min_samples_leaf=5,
        max_features="sqrt"
    )
    model.fit(X_train, y_train)
    return model

rf_drop = fit_rf(X_train, y_drop_train)
rf_titer = fit_rf(X_train, y_titer_train)
rf_agg = fit_rf(X_train, y_agg_train)

In [16]:
# Evaluate models on test set

def eval_model(name, model, X_test, y_test):
    pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, pred)
    r2 = r2_score(y_test, pred)
    print(f"{name:18s}  MAE={mae:.4f}   R2={r2:.4f}")
    return pred

pred_drop = eval_model("drop (stability)", rf_drop, X_test, y_drop_test)
pred_titer = eval_model("late_titer", rf_titer, X_test, y_titer_test)
pred_agg = eval_model("late_agg", rf_agg, X_test, y_agg_test)

drop (stability)    MAE=0.1079   R2=-0.0166
late_titer          MAE=1.0048   R2=-0.0087
late_agg            MAE=2.2646   R2=-0.0301


In [17]:
# Save predictions to CSV

pred_table = pd.DataFrame({
    "clone_id": id_test.values,
    "true_drop": y_drop_test,
    "pred_drop": pred_drop,
    "true_late_titer": y_titer_test,
    "pred_late_titer": pred_titer,
    "true_late_agg": y_agg_test,
    "pred_late_agg": pred_agg
})

OUT_PRED = "../data/synthetic/processed/predictions_testset_3targets.csv"
pred_table.to_csv(OUT_PRED, index=False)
print("Saved:", OUT_PRED)

pred_table.head()

Saved: ../data/synthetic/processed/predictions_testset_3targets.csv


Unnamed: 0,clone_id,true_drop,pred_drop,true_late_titer,pred_late_titer,true_late_agg,pred_late_agg
0,CLONE_1861,0.392039,0.324808,1.302041,1.36541,5.089573,6.10453
1,CLONE_0354,0.397532,0.333808,1.405534,1.771376,5.125479,6.157633
2,CLONE_1334,0.279647,0.342409,1.612492,2.111663,4.912363,6.796212
3,CLONE_0906,0.063325,0.349004,1.142015,1.734463,6.560679,5.828375
4,CLONE_1290,0.476244,0.33439,1.864258,1.67481,6.970628,5.98836


In [18]:
# --- Spearman + TopK evaluation ---
import numpy as np
import pandas as pd

# If scipy is available, use it. If not, fallback to pandas rank corr.
try:
    from scipy.stats import spearmanr
    def spearman(a, b):
        return spearmanr(a, b).correlation
except Exception:
    def spearman(a, b):
        return pd.Series(a).rank().corr(pd.Series(b).rank())

def topk_overlap(true_scores, pred_scores, k):
    """
    Returns overlap fraction between true top-k and predicted top-k sets.
    """
    true_top = set(pd.Series(true_scores).nlargest(k).index)
    pred_top = set(pd.Series(pred_scores).nlargest(k).index)
    return len(true_top & pred_top) / k

def topk_recall_of_true_good(pred_rank, true_good_mask, k):
    """
    Among predicted top-k, what fraction are truly good?
    """
    top_idx = pred_rank.nlargest(k).index
    return true_good_mask.loc[top_idx].mean()

df_eval = pred_table.copy()

# 1) Spearman correlations (ranking quality)
# For drop: lower is better, so we correlate (-drop) to keep "higher is better" convention
print("=== Spearman (ranking correlation) ===")
print("drop (lower better):", spearman(-df_eval["true_drop"], -df_eval["pred_drop"]))
print("late_titer (higher better):", spearman(df_eval["true_late_titer"], df_eval["pred_late_titer"]))
print("late_agg (lower better):", spearman(-df_eval["true_late_agg"], -df_eval["pred_late_agg"]))

# 2) Define a "true goodness" utility for evaluation (synthetic world has true late)
# This is *evaluation only* (offline). In real projects we won't have true late at decision time.
def z(s):
    return (s - s.mean()) / (s.std(ddof=0) + 1e-9)

# weights for evaluation utility (tune if you want)
A_TITER = 1.0
B_DROP  = 1.0
C_AGG   = 0.5

df_eval["true_utility"] = (
    A_TITER * z(df_eval["true_late_titer"])
    - B_DROP * z(df_eval["true_drop"])
    - C_AGG * z(df_eval["true_late_agg"])
)

df_eval["pred_utility"] = (
    A_TITER * z(df_eval["pred_late_titer"])
    - B_DROP * z(df_eval["pred_drop"])
    - C_AGG * z(df_eval["pred_late_agg"])
)

# 3) Top-K overlap and precision-like metrics for decisions
K_LIST = [5, 10, 20, 50]  # adjust to your CLD stage size

print("\n=== Top-K evaluation using utility ===")
for k in K_LIST:
    overlap = topk_overlap(df_eval["true_utility"], df_eval["pred_utility"], k)
    print(f"Top-{k} overlap (true vs pred utility): {overlap:.3f}")

# 4) Top-K "true-good" rate among predicted top-k
# Define "true good" clones as the top X% by true_utility (offline definition)
GOOD_FRAC = 0.10
good_threshold = df_eval["true_utility"].quantile(1 - GOOD_FRAC)
df_eval["true_good"] = df_eval["true_utility"] >= good_threshold

print("\n=== Top-K true-good rate among predicted picks ===")
pred_rank = df_eval["pred_utility"]
for k in K_LIST:
    rate = topk_recall_of_true_good(pred_rank, df_eval["true_good"], k)
    print(f"Predicted top-{k}: fraction that are truly top {int(GOOD_FRAC*100)}% good = {rate:.3f}")

=== Spearman (ranking correlation) ===
drop (lower better): 0.03937657706138145
late_titer (higher better): 0.012674891718073237
late_agg (lower better): -0.040323439521497

=== Top-K evaluation using utility ===
Top-5 overlap (true vs pred utility): 0.000
Top-10 overlap (true vs pred utility): 0.000
Top-20 overlap (true vs pred utility): 0.150
Top-50 overlap (true vs pred utility): 0.100

=== Top-K true-good rate among predicted picks ===
Predicted top-5: fraction that are truly top 10% good = 0.000
Predicted top-10: fraction that are truly top 10% good = 0.200
Predicted top-20: fraction that are truly top 10% good = 0.200
Predicted top-50: fraction that are truly top 10% good = 0.100


In [19]:
import pandas as pd

# X는 모델 학습에 사용한 feature matrix (DataFrame)여야 함
# 예: X = df.drop(columns=["clone_id"] + targets).copy()

def show_importance(model, X_cols, title, top_n=20):
    imp = pd.Series(model.feature_importances_, index=X_cols).sort_values(ascending=False)
    print(f"\n=== Feature importance: {title} (top {top_n}) ===")
    print(imp.head(top_n).to_string())
    return imp

# 1) 전체 상위 중요도
imp_drop = show_importance(rf_drop, X.columns, "drop (stability)")
imp_titer = show_importance(rf_titer, X.columns, "late_titer")
imp_agg = show_importance(rf_agg, X.columns, "late_agg")

# 2) ddPCR feature가 존재하는지 + 순위/중요도 확인
FEATURE_CN = "ddpcr_cn"
print("\n=== ddPCR feature check ===")
print("ddpcr_cn in X.columns?", FEATURE_CN in X.columns)

if FEATURE_CN in X.columns:
    print("\nRank of ddpcr_cn:")
    print("drop model rank:", int(imp_drop.rank(ascending=False)[FEATURE_CN]))
    print("late_titer model rank:", int(imp_titer.rank(ascending=False)[FEATURE_CN]))
    print("late_agg model rank:", int(imp_agg.rank(ascending=False)[FEATURE_CN]))

    print("\nImportance value of ddpcr_cn:")
    print("drop model:", float(imp_drop[FEATURE_CN]))
    print("late_titer model:", float(imp_titer[FEATURE_CN]))
    print("late_agg model:", float(imp_agg[FEATURE_CN]))
else:
    print("ddpcr_cn feature not found. Make sure 02d merged ddPCR into X_v2 and 02c carried it into 3targets file.")


=== Feature importance: drop (stability) (top 20) ===
titer_slope_7_10         0.030916
viability_slope_3_6      0.030114
titer_curvature          0.029731
viability_slope_7_10     0.028479
vcd_slope_3_6            0.028184
aggregation_std          0.027642
vcd_curvature            0.026840
vcd_slope_7_10           0.026640
viability_slope          0.026365
aggregation_slope_3_6    0.026313
titer_slope              0.025896
aggregation_slope        0.025703
aggregation_max          0.025657
viability_p10            0.025469
viability_curvature      0.025333
vcd_slope                0.025290
aggregation_min          0.025205
viability_max            0.024836
vcd_std                  0.024769
titer_std                0.024751

=== Feature importance: late_titer (top 20) ===
aggregation_slope        0.032746
viability_slope_7_10     0.032698
aggregation_slope_3_6    0.031682
vcd_slope                0.030967
titer_curvature          0.030829
viability_curvature      0.029981
viability_sl

In [20]:
related = [c for c in X.columns if ("titer" in c) or ("qP" in c) or ("ddpcr" in c)]
print("\nColumns related to productivity:", related[:30], "...")


Columns related to productivity: ['titer_mean', 'titer_std', 'titer_min', 'titer_max', 'titer_slope', 'titer_p10', 'titer_slope_3_6', 'titer_slope_7_10', 'titer_curvature', 'qP_mean', 'qP_p10', 'titer_cv', 'ddpcr_cn'] ...
