# Notebook 03b - Multi-target Models for CLD (Early -> Late Prediction)

## Goal
Train separate regression models that predict late-stage outcomes using **early-only features**:
- Stability: productivity_drop_pct (lower is better)
- Productivity: late_mean_titer (higher is better)
- Quality: late_mean_aggregation (lower is better)

These predicted values will be used in Notebook 04 to perform **predicted-late-based clone selection**, wihch mirrors real CLD projects where late data is not available at decision time.

In [17]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

In [18]:
# Load dataset with three target

DATA_PATH = "/Users/sang/CLD_ML_PIPELINE/data/synthetic/processed/cld_features_with_labels_3targets_v2.csv"
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,clone_id,titer_mean,titer_std,titer_min,titer_max,vcd_mean,vcd_std,vcd_min,vcd_max,viability_mean,...,aggregation_curvature,qP_mean,qP_p10,titer_cv,vcd_cv,viability_cv,aggregation_cv,productivity_drop_pct,late_mean_titer,late_mean_aggregation
0,CLONE_0001,2.665436,0.145412,2.464814,2.852368,10632900.0,1025472.0,9197038.0,12361790.0,93.637077,...,0.087727,2.506782e-07,,0.054555,0.096443,0.008769,0.071216,0.229719,2.053135,4.310553
1,CLONE_0002,0.834691,0.191151,0.516513,1.171273,15128100.0,597750.6,14076260.0,16051270.0,96.283457,...,0.045579,5.517484e-08,,0.229008,0.039513,0.013424,0.121568,0.356246,0.537335,3.259003
2,CLONE_0003,3.990484,0.175857,3.722491,4.270057,8411914.0,1150419.0,6047146.0,9506059.0,93.278459,...,-0.296325,4.743848e-07,,0.044069,0.136761,0.016516,0.056506,0.281589,2.866808,5.945068
3,CLONE_0004,0.540821,0.154336,0.333873,0.749828,15112980.0,605067.5,14481560.0,16263420.0,96.187877,...,-0.173618,3.578521e-08,,0.285374,0.040036,0.021262,0.036466,0.02616,0.526673,7.351199
4,CLONE_0005,2.16281,0.124723,1.928686,2.355251,11810710.0,732115.7,10921310.0,13285170.0,95.670482,...,0.01125,1.831228e-07,,0.057667,0.061987,0.014832,0.406579,0.382269,1.336034,1.133822


In [19]:
# Preapre X and 3 ys

targets = ["productivity_drop_pct", "late_mean_titer", "late_mean_aggregation"]

clone_ids = df["clone_id"].copy()

X = df.drop(columns=["clone_id"] + targets).copy()
X = X.fillna(X.median(numeric_only=True))

y_drop = df["productivity_drop_pct"].copy().clip(lower=0.0, upper=1.0)   # stability: clamp to [0,1]
y_titer = df["late_mean_titer"].copy()                                   # late productivity
y_agg = df["late_mean_aggregation"].copy().clip(lower=0.0, upper=100.0)  # quality proxy: clamp to [0,100]

print("X shape:", X.shape)
print("y_drop:", y_drop.shape, "y_titer:", y_titer.shape, "y_agg:", y_agg.shape)

X shape: (2000, 42)
y_drop: (2000,) y_titer: (2000,) y_agg: (2000,)


In [20]:
# Split train/test by clone_id to avoid data leakage

X_train, X_test, id_train, id_test = train_test_split(
    X, clone_ids, test_size=0.2, random_state=42
)

# Align y by index using clone_id (safer than relying on row order)
train_mask = df["clone_id"].isin(id_train)
test_mask  = df["clone_id"].isin(id_test)

y_drop_train, y_drop_test = y_drop[train_mask].values, y_drop[test_mask].values
y_titer_train, y_titer_test = y_titer[train_mask].values, y_titer[test_mask].values
y_agg_train, y_agg_test = y_agg[train_mask].values, y_agg[test_mask].values

print("Train size:", len(id_train), "Test size:", len(id_test))

Train size: 1600 Test size: 400


In [21]:
# Train 3 Random Forest models, one for each target

def fit_rf(X_train, y_train):
    model = RandomForestRegressor(
        n_estimators=600,
        random_state=42,
        min_samples_leaf=5,
        max_features="sqrt"
    )
    model.fit(X_train, y_train)
    return model

rf_drop = fit_rf(X_train, y_drop_train)
rf_titer = fit_rf(X_train, y_titer_train)
rf_agg = fit_rf(X_train, y_agg_train)

In [22]:
# Evaluate models on test set

def eval_model(name, model, X_test, y_test):
    pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, pred)
    r2 = r2_score(y_test, pred)
    print(f"{name:18s}  MAE={mae:.4f}   R2={r2:.4f}")
    return pred

pred_drop = eval_model("drop (stability)", rf_drop, X_test, y_drop_test)
pred_titer = eval_model("late_titer", rf_titer, X_test, y_titer_test)
pred_agg = eval_model("late_agg", rf_agg, X_test, y_agg_test)

drop (stability)    MAE=0.1048   R2=-0.0292
late_titer          MAE=0.8596   R2=-0.0153
late_agg            MAE=2.2433   R2=-0.0235


In [23]:
# Save predictions to CSV

pred_table = pd.DataFrame({
    "clone_id": id_test.values,
    "true_drop": y_drop_test,
    "pred_drop": pred_drop,
    "true_late_titer": y_titer_test,
    "pred_late_titer": pred_titer,
    "true_late_agg": y_agg_test,
    "pred_late_agg": pred_agg
})

OUT_PRED = "../data/synthetic/processed/predictions_testset_3targets.csv"
pred_table.to_csv(OUT_PRED, index=False)
print("Saved:", OUT_PRED)

pred_table.head()

Saved: ../data/synthetic/processed/predictions_testset_3targets.csv


Unnamed: 0,clone_id,true_drop,pred_drop,true_late_titer,pred_late_titer,true_late_agg,pred_late_agg
0,CLONE_1861,0.413428,0.285054,0.87811,1.508999,3.939495,4.883162
1,CLONE_0354,0.352329,0.288104,1.120978,1.436453,4.282821,4.295063
2,CLONE_1334,0.198136,0.279236,1.417534,1.672492,4.19076,6.592623
3,CLONE_0906,0.150225,0.274753,0.874246,1.378208,5.308029,5.138363
4,CLONE_1290,0.26308,0.282855,2.080281,1.564748,6.445204,4.556844
