# Notebook 02d — Feature Engineering v2 (Early time-series features)

## Goal
Create a stronger clone-level feature table using early passages only (default: 3–10),
with features that better capture early dynamics for predicting late outcomes.

Compared to v1, we add:
- last-value features (passage 10)
- split-window slopes (3–6 vs 7–10)
- curvature (slope change)
- qP proxies (titer / VCD)
- CV (std / mean)

Outputs:
- `data/synthetic/processed/cld_features_v2.csv`
- `data/synthetic/processed/cld_features_with_label_v2.csv`

In [14]:
import sqlite3
import pandas as pd
import numpy as np
from pathlib import Path

DB_PATH = "../data/synthetic/raw/cld_2000clones.db"  # ✅ change to 500/2000/5000 as needed
conn = sqlite3.connect(DB_PATH)

print("Connected to:", DB_PATH)

Connected to: ../data/synthetic/raw/cld_2000clones.db


In [15]:
assay = pd.read_sql_query("""
SELECT 
  ar.assay_id,
  ar.assay_type,
  ar.value,
  ar.unit,
  ar.method,
  ar.batch_id,
  p.clone_id,
  p.passage_number,
  p.phase
FROM assay_result ar
JOIN passage p
  ON p.passage_id = ar.passage_id
""", conn)

assay.head()

Unnamed: 0,assay_id,assay_type,value,unit,method,batch_id,clone_id,passage_number,phase
0,ASSAY_CLONE_0001_P01_titer,titer,3.01706,g/L,ELISA,B_P01,CLONE_0001,1,early
1,ASSAY_CLONE_0001_P01_vcd,vcd,8601664.0,cells/mL,Vi-CELL,B_P01,CLONE_0001,1,early
2,ASSAY_CLONE_0001_P01_viability,viability,91.1185,%,Vi-CELL,B_P01,CLONE_0001,1,early
3,ASSAY_CLONE_0001_P01_aggregation,aggregation,3.707532,%,SEC-HPLC,B_P01,CLONE_0001,1,early
4,ASSAY_CLONE_0001_P02_titer,titer,2.818347,g/L,ELISA,B_P02,CLONE_0001,2,early


We exclude passage 1–2 due to frequent outlier behavior in real CLD.

In [16]:
EARLY_START = 3
EARLY_END = 10

assay_early = assay[(assay["passage_number"] >= EARLY_START) & (assay["passage_number"] <= EARLY_END)].copy()
print("Assay early rows:", len(assay_early))
assay_early.head()

Assay early rows: 64000


Unnamed: 0,assay_id,assay_type,value,unit,method,batch_id,clone_id,passage_number,phase
8,ASSAY_CLONE_0001_P03_titer,titer,2.750983,g/L,ELISA,B_P03,CLONE_0001,3,early
9,ASSAY_CLONE_0001_P03_vcd,vcd,10264580.0,cells/mL,Vi-CELL,B_P03,CLONE_0001,3,early
10,ASSAY_CLONE_0001_P03_viability,viability,94.42102,%,Vi-CELL,B_P03,CLONE_0001,3,early
11,ASSAY_CLONE_0001_P03_aggregation,aggregation,3.578852,%,SEC-HPLC,B_P03,CLONE_0001,3,early
12,ASSAY_CLONE_0001_P04_titer,titer,2.626062,g/L,ELISA,B_P04,CLONE_0001,4,early


In [17]:
early_wide = assay_early.pivot_table(
    index=["clone_id", "passage_number"],
    columns="assay_type",
    values="value",
    aggfunc="mean"
).reset_index()

early_wide.head()

assay_type,clone_id,passage_number,aggregation,titer,vcd,viability
0,CLONE_0001,3,3.578852,2.750983,10264580.0,94.421025
1,CLONE_0001,4,3.671787,2.626062,10397680.0,92.554019
2,CLONE_0001,5,3.638638,2.728686,11730070.0,93.598956
3,CLONE_0001,6,3.970084,2.852368,9197038.0,93.161248
4,CLONE_0001,7,3.737996,2.615099,10720850.0,92.869977


In [18]:
metrics = ["titer", "vcd", "viability", "aggregation"]

agg_dict = {m: ["mean", "std", "min", "max"] for m in metrics}

summary = early_wide.groupby("clone_id")[metrics].agg(agg_dict)
summary.columns = [f"{c[0]}_{c[1]}" for c in summary.columns]
summary = summary.reset_index()

summary.head()

Unnamed: 0,clone_id,titer_mean,titer_std,titer_min,titer_max,vcd_mean,vcd_std,vcd_min,vcd_max,viability_mean,viability_std,viability_min,viability_max,aggregation_mean,aggregation_std,aggregation_min,aggregation_max
0,CLONE_0001,2.665436,0.145412,2.464814,2.852368,10632900.0,1025472.0,9197038.0,12361790.0,93.637077,0.821135,92.554019,95.055579,3.876291,0.276055,3.578852,4.27909
1,CLONE_0002,0.834691,0.191151,0.516513,1.171273,15128100.0,597750.6,14076260.0,16051270.0,96.283457,1.292524,94.683763,97.973846,3.326163,0.404355,2.584486,3.921443
2,CLONE_0003,3.990484,0.175857,3.722491,4.270057,8411914.0,1150419.0,6047146.0,9506059.0,93.278459,1.540622,91.327687,95.84846,5.908576,0.333873,5.353567,6.40083
3,CLONE_0004,0.540821,0.154336,0.333873,0.749828,15112980.0,605067.5,14481560.0,16263420.0,96.187877,2.045114,93.811551,100.0,7.056523,0.25732,6.697547,7.367923
4,CLONE_0005,2.16281,0.124723,1.928686,2.355251,11810710.0,732115.7,10921310.0,13285170.0,95.670482,1.419031,93.253245,97.031038,0.790245,0.321297,0.264174,1.187374


In [19]:
def slope(x, y):
    if len(x) < 2:
        return np.nan
    return np.polyfit(x, y, 1)[0]

slope_rows = []
for clone_id, df in early_wide.groupby("clone_id"):
    x = df["passage_number"].values
    row = {"clone_id": clone_id}
    for m in metrics:
        row[f"{m}_slope"] = slope(x, df[m].values) if m in df.columns else np.nan
    slope_rows.append(row)

slopes = pd.DataFrame(slope_rows)
slopes.head()

Unnamed: 0,clone_id,titer_slope,vcd_slope,viability_slope,aggregation_slope
0,CLONE_0001,-0.024274,113872.401062,0.080948,0.099104
1,CLONE_0002,0.002718,-94369.737736,0.383359,0.012096
2,CLONE_0003,-0.044016,296423.461237,0.315182,-0.014776
3,CLONE_0004,-0.026728,6003.808773,0.108264,-0.074281
4,CLONE_0005,-0.011963,-16714.746023,0.29548,-0.03582


In [20]:
# last value at EARLY_END (e.g., p10)
last_df = early_wide[early_wide["passage_number"] == EARLY_END][["clone_id"] + metrics].copy()
last_df = last_df.rename(columns={m: f"{m}_p{EARLY_END}" for m in metrics})

last_df.head()

assay_type,clone_id,titer_p10,vcd_p10,viability_p10,aggregation_p10
7,CLONE_0001,2.474828,12361790.0,93.504919,4.271647
15,CLONE_0002,0.750043,14806760.0,96.295556,3.358113
23,CLONE_0003,4.03505,9380447.0,92.604113,5.353567
31,CLONE_0004,0.333873,14728600.0,96.072153,6.771917
39,CLONE_0005,2.074356,10921310.0,97.026232,0.632086


In [21]:
def slope_in_window(df, m, start_p, end_p):
    d = df[(df["passage_number"] >= start_p) & (df["passage_number"] <= end_p)]
    if len(d) < 2 or m not in d.columns:
        return np.nan
    return slope(d["passage_number"].values, d[m].values)

split_rows = []
for clone_id, df in early_wide.groupby("clone_id"):
    row = {"clone_id": clone_id}
    for m in metrics:
        row[f"{m}_slope_{EARLY_START}_6"] = slope_in_window(df, m, EARLY_START, 6)
        row[f"{m}_slope_7_{EARLY_END}"] = slope_in_window(df, m, 7, EARLY_END)
        # curvature = change in slope (late - early)
        row[f"{m}_curvature"] = row[f"{m}_slope_7_{EARLY_END}"] - row[f"{m}_slope_{EARLY_START}_6"]
    split_rows.append(row)

splits = pd.DataFrame(split_rows)
splits.head()

Unnamed: 0,clone_id,titer_slope_3_6,titer_slope_7_10,titer_curvature,vcd_slope_3_6,vcd_slope_7_10,vcd_curvature,viability_slope_3_6,viability_slope_7_10,viability_curvature,aggregation_slope_3_6,aggregation_slope_7_10,aggregation_curvature
0,CLONE_0001,0.040678,-0.007498,-0.048176,-187022.4,389773.397588,576795.789809,-0.273439,0.302951,0.57639,0.114054,0.201781,0.087727
1,CLONE_0002,0.011971,0.109412,0.097441,-288401.3,-230868.50191,57532.810771,0.202766,-0.445791,-0.648556,-0.25997,-0.21439,0.045579
2,CLONE_0003,-0.079378,0.078764,0.158142,1015721.0,607611.053176,-408109.702447,0.82683,0.024123,-0.802707,0.11459,-0.181735,-0.296325
3,CLONE_0004,0.070029,-0.10738,-0.177409,404815.4,-337551.500997,-742366.905968,0.01565,1.23392,1.21827,-0.02401,-0.197628,-0.173618
4,CLONE_0005,0.037917,0.042757,0.00484,-134575.7,-274279.401363,-139703.662076,0.294659,0.379384,0.084726,0.039283,0.050533,0.01125


In [22]:
# qP proxy: titer / vcd
# add small epsilon to avoid division issues
eps = 1e-9

qp = summary[["clone_id"]].copy()
qp["qP_mean"] = summary["titer_mean"] / (summary["vcd_mean"] + eps)
qp["qP_p10"]  = last_df[f"titer_p{EARLY_END}"] / (last_df[f"vcd_p{EARLY_END}"] + eps)

# CV features: std/mean
cv = summary[["clone_id"]].copy()
for m in metrics:
    cv[f"{m}_cv"] = summary[f"{m}_std"] / (summary[f"{m}_mean"] + eps)

qp.head(), cv.head()

(     clone_id       qP_mean  qP_p10
 0  CLONE_0001  2.506782e-07     NaN
 1  CLONE_0002  5.517484e-08     NaN
 2  CLONE_0003  4.743848e-07     NaN
 3  CLONE_0004  3.578521e-08     NaN
 4  CLONE_0005  1.831228e-07     NaN,
      clone_id  titer_cv    vcd_cv  viability_cv  aggregation_cv
 0  CLONE_0001  0.054555  0.096443      0.008769        0.071216
 1  CLONE_0002  0.229008  0.039513      0.013424        0.121568
 2  CLONE_0003  0.044069  0.136761      0.016516        0.056506
 3  CLONE_0004  0.285374  0.040036      0.021262        0.036466
 4  CLONE_0005  0.057667  0.061987      0.014832        0.406579)

In [23]:
X_v2 = summary.merge(slopes, on="clone_id", how="left") \
              .merge(last_df, on="clone_id", how="left") \
              .merge(splits, on="clone_id", how="left") \
              .merge(qp, on="clone_id", how="left") \
              .merge(cv, on="clone_id", how="left")

print("X_v2 shape:", X_v2.shape)
X_v2.head()

X_v2 shape: (2000, 43)


Unnamed: 0,clone_id,titer_mean,titer_std,titer_min,titer_max,vcd_mean,vcd_std,vcd_min,vcd_max,viability_mean,...,viability_curvature,aggregation_slope_3_6,aggregation_slope_7_10,aggregation_curvature,qP_mean,qP_p10,titer_cv,vcd_cv,viability_cv,aggregation_cv
0,CLONE_0001,2.665436,0.145412,2.464814,2.852368,10632900.0,1025472.0,9197038.0,12361790.0,93.637077,...,0.57639,0.114054,0.201781,0.087727,2.506782e-07,,0.054555,0.096443,0.008769,0.071216
1,CLONE_0002,0.834691,0.191151,0.516513,1.171273,15128100.0,597750.6,14076260.0,16051270.0,96.283457,...,-0.648556,-0.25997,-0.21439,0.045579,5.517484e-08,,0.229008,0.039513,0.013424,0.121568
2,CLONE_0003,3.990484,0.175857,3.722491,4.270057,8411914.0,1150419.0,6047146.0,9506059.0,93.278459,...,-0.802707,0.11459,-0.181735,-0.296325,4.743848e-07,,0.044069,0.136761,0.016516,0.056506
3,CLONE_0004,0.540821,0.154336,0.333873,0.749828,15112980.0,605067.5,14481560.0,16263420.0,96.187877,...,1.21827,-0.02401,-0.197628,-0.173618,3.578521e-08,,0.285374,0.040036,0.021262,0.036466
4,CLONE_0005,2.16281,0.124723,1.928686,2.355251,11810710.0,732115.7,10921310.0,13285170.0,95.670482,...,0.084726,0.039283,0.050533,0.01125,1.831228e-07,,0.057667,0.061987,0.014832,0.406579


In [24]:
y = pd.read_sql_query("""
SELECT clone_id, productivity_drop_pct
FROM stability_test
""", conn)

dataset_v2 = X_v2.merge(y, on="clone_id", how="inner")
print("dataset_v2 shape:", dataset_v2.shape)

dataset_v2.head()

dataset_v2 shape: (2000, 44)


Unnamed: 0,clone_id,titer_mean,titer_std,titer_min,titer_max,vcd_mean,vcd_std,vcd_min,vcd_max,viability_mean,...,aggregation_slope_3_6,aggregation_slope_7_10,aggregation_curvature,qP_mean,qP_p10,titer_cv,vcd_cv,viability_cv,aggregation_cv,productivity_drop_pct
0,CLONE_0001,2.665436,0.145412,2.464814,2.852368,10632900.0,1025472.0,9197038.0,12361790.0,93.637077,...,0.114054,0.201781,0.087727,2.506782e-07,,0.054555,0.096443,0.008769,0.071216,0.229719
1,CLONE_0002,0.834691,0.191151,0.516513,1.171273,15128100.0,597750.6,14076260.0,16051270.0,96.283457,...,-0.25997,-0.21439,0.045579,5.517484e-08,,0.229008,0.039513,0.013424,0.121568,0.356246
2,CLONE_0003,3.990484,0.175857,3.722491,4.270057,8411914.0,1150419.0,6047146.0,9506059.0,93.278459,...,0.11459,-0.181735,-0.296325,4.743848e-07,,0.044069,0.136761,0.016516,0.056506,0.281589
3,CLONE_0004,0.540821,0.154336,0.333873,0.749828,15112980.0,605067.5,14481560.0,16263420.0,96.187877,...,-0.02401,-0.197628,-0.173618,3.578521e-08,,0.285374,0.040036,0.021262,0.036466,0.02616
4,CLONE_0005,2.16281,0.124723,1.928686,2.355251,11810710.0,732115.7,10921310.0,13285170.0,95.670482,...,0.039283,0.050533,0.01125,1.831228e-07,,0.057667,0.061987,0.014832,0.406579,0.382269


In [25]:
OUT_DIR = Path("../data/synthetic/processed")
OUT_DIR.mkdir(parents=True, exist_ok=True)

X_v2.to_csv(OUT_DIR / "cld_features_v2.csv", index=False)
dataset_v2.to_csv(OUT_DIR / "cld_features_with_label_v2.csv", index=False)

print("Saved:")
print("-", OUT_DIR / "cld_features_v2.csv")
print("-", OUT_DIR / "cld_features_with_label_v2.csv")

Saved:
- ../data/synthetic/processed/cld_features_v2.csv
- ../data/synthetic/processed/cld_features_with_label_v2.csv


In [26]:
conn.close()
print("DB connection closed.")

DB connection closed.
