# Notebook 03b - Multi-target Models for CLD (Early -> Late Prediction)

## Goal
Train separate regression models that predict late-stage outcomes using **early-only features**:
- Stability: productivity_drop_pct (lower is better)
- Productivity: late_mean_titer (higher is better)
- Quality: late_mean_aggregation (lower is better)

These predicted values will be used in Notebook 04 to perform **predicted-late-based clone selection**, wihch mirrors real CLD projects where late data is not available at decision time.

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

In [4]:
# Load dataset with three target

DATA_PATH = "../data/synthetic/processed/cld_features_with_labels_3targets.csv"
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,clone_id,titer_mean,titer_std,titer_min,titer_max,vcd_mean,vcd_std,vcd_min,vcd_max,viability_mean,...,aggregation_std,aggregation_min,aggregation_max,titer_slope,vcd_slope,viability_slope,aggregation_slope,productivity_drop_pct,late_mean_titer,late_mean_aggregation
0,CLONE_0001,2.538067,0.293036,2.223711,2.964514,11076190.0,905255.6,9779103.0,12322580.0,94.851455,...,0.403848,7.723455,8.89403,-0.103703,102268.665747,0.169734,-0.019225,0.387063,1.555675,8.137307
1,CLONE_0002,0.814721,0.213007,0.537981,1.132518,14410910.0,1047019.0,13460700.0,16053410.0,97.551824,...,0.449925,6.798531,8.011004,-0.005035,279733.381794,0.198861,-0.023819,0.135156,0.704606,7.603613
2,CLONE_0003,3.912552,0.208697,3.621956,4.261524,8684126.0,583412.0,7780120.0,9384241.0,94.390688,...,0.326064,1.745532,2.861836,-0.058117,51853.805966,0.505344,0.038583,0.335258,2.600837,2.209127
3,CLONE_0004,0.488369,0.160312,0.212916,0.747609,15117250.0,781766.5,14054530.0,16064250.0,96.380534,...,0.305128,3.357574,4.286918,-0.026671,278866.456374,0.504642,0.051834,0.590633,0.199922,4.12072
4,CLONE_0005,2.238289,0.160672,2.033612,2.459557,11171950.0,1167196.0,8874346.0,12390300.0,95.085238,...,0.405157,2.620058,4.001488,-0.034059,391064.583721,0.233446,-0.129212,0.291757,1.585253,3.213549


In [5]:
# Preapre X and 3 ys

targets = ["productivity_drop_pct", "late_mean_titer", "late_mean_aggregation"]

clone_ids = df["clone_id"].copy()

X = df.drop(columns=["clone_id"] + targets).copy()
X = X.fillna(X.median(numeric_only=True))

y_drop = df["productivity_drop_pct"].copy().clip(lower=0.0, upper=1.0)   # stability: clamp to [0,1]
y_titer = df["late_mean_titer"].copy()                                   # late productivity
y_agg = df["late_mean_aggregation"].copy().clip(lower=0.0, upper=100.0)  # quality proxy: clamp to [0,100]

print("X shape:", X.shape)
print("y_drop:", y_drop.shape, "y_titer:", y_titer.shape, "y_agg:", y_agg.shape)

X shape: (500, 20)
y_drop: (500,) y_titer: (500,) y_agg: (500,)


In [6]:
# Split train/test by clone_id to avoid data leakage

X_train, X_test, id_train, id_test = train_test_split(
    X, clone_ids, test_size=0.2, random_state=42
)

# Align y by index using clone_id (safer than relying on row order)
train_mask = df["clone_id"].isin(id_train)
test_mask  = df["clone_id"].isin(id_test)

y_drop_train, y_drop_test = y_drop[train_mask].values, y_drop[test_mask].values
y_titer_train, y_titer_test = y_titer[train_mask].values, y_titer[test_mask].values
y_agg_train, y_agg_test = y_agg[train_mask].values, y_agg[test_mask].values

print("Train size:", len(id_train), "Test size:", len(id_test))

Train size: 400 Test size: 100


In [7]:
# Train 3 Random Forest models, one for each target

def fit_rf(X_train, y_train):
    model = RandomForestRegressor(
        n_estimators=600,
        random_state=42,
        min_samples_leaf=5,
        max_features="sqrt"
    )
    model.fit(X_train, y_train)
    return model

rf_drop = fit_rf(X_train, y_drop_train)
rf_titer = fit_rf(X_train, y_titer_train)
rf_agg = fit_rf(X_train, y_agg_train)

In [8]:
# Evaluate models on test set

def eval_model(name, model, X_test, y_test):
    pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, pred)
    r2 = r2_score(y_test, pred)
    print(f"{name:18s}  MAE={mae:.4f}   R2={r2:.4f}")
    return pred

pred_drop = eval_model("drop (stability)", rf_drop, X_test, y_drop_test)
pred_titer = eval_model("late_titer", rf_titer, X_test, y_titer_test)
pred_agg = eval_model("late_agg", rf_agg, X_test, y_agg_test)

drop (stability)    MAE=0.0852   R2=-0.0574
late_titer          MAE=0.7410   R2=-0.0948
late_agg            MAE=2.1501   R2=-0.0620


In [10]:
# Save predictions to CSV

pred_table = pd.DataFrame({
    "clone_id": id_test.values,
    "true_drop": y_drop_test,
    "pred_drop": pred_drop,
    "true_late_titer": y_titer_test,
    "pred_late_titer": pred_titer,
    "true_late_agg": y_agg_test,
    "pred_late_agg": pred_agg
})

OUT_PRED = "../data/synthetic/processed/predictions_testset_3targets.csv"
pred_table.to_csv(OUT_PRED, index=False)
print("Saved:", OUT_PRED)

pred_table.head()

Saved: ../data/synthetic/processed/predictions_testset_3targets.csv


Unnamed: 0,clone_id,true_drop,pred_drop,true_late_titer,pred_late_titer,true_late_agg,pred_late_agg
0,CLONE_0362,0.387063,0.267649,1.555675,1.146125,8.137307,5.344222
1,CLONE_0074,0.335258,0.282072,2.600837,1.164593,2.209127,5.707408
2,CLONE_0375,0.249261,0.256892,0.845235,1.289564,1.484692,5.345388
3,CLONE_0156,0.274382,0.273758,1.098513,1.659863,5.089938,4.716175
4,CLONE_0105,0.249853,0.255309,3.331767,1.40486,5.319159,5.084001
