# Notebook 02d — Feature Engineering v2 (Early time-series features)

## Goal
Create a stronger clone-level feature table using early passages only (default: 3–10),
with features that better capture early dynamics for predicting late outcomes.

Compared to v1, we add:
- last-value features (passage 10)
- split-window slopes (3–6 vs 7–10)
- curvature (slope change)
- qP proxies (titer / VCD)
- CV (std / mean)

Outputs:
- `data/synthetic/processed/cld_features_v2.csv`
- `data/synthetic/processed/cld_features_with_label_v2.csv`

In [1]:
import sqlite3
import pandas as pd
import numpy as np
from pathlib import Path

DB_PATH = "../data/synthetic/raw/cld_2000clones.db"  # change to 500/2000/5000 as needed

# sqlite3.connect() creates a brand-new empty database when the file does not
# exist, so a wrong path would only show up later as "no such table" errors.
# Fail fast with a clear message instead.
if not Path(DB_PATH).is_file():
    raise FileNotFoundError(f"SQLite database not found: {DB_PATH}")

conn = sqlite3.connect(DB_PATH)

print("Connected to:", DB_PATH)

Connected to: ../data/synthetic/raw/cld_2000clones.db


In [2]:
# Every assay result joined to its passage row, so each measurement carries
# the clone id, passage number and phase it belongs to.
ASSAY_QUERY = """
SELECT 
  ar.assay_id,
  ar.assay_type,
  ar.value,
  ar.unit,
  ar.method,
  ar.batch_id,
  p.clone_id,
  p.passage_number,
  p.phase
FROM assay_result ar
JOIN passage p
  ON p.passage_id = ar.passage_id
"""

assay = pd.read_sql_query(sql=ASSAY_QUERY, con=conn)

assay.head()

Unnamed: 0,assay_id,assay_type,value,unit,method,batch_id,clone_id,passage_number,phase
0,ASSAY_CLONE_0001_P01_titer,titer,3.496986,g/L,ELISA,B_P01,CLONE_0001,1,early
1,ASSAY_CLONE_0001_P01_vcd,vcd,10210560.0,cells/mL,Vi-CELL,B_P01,CLONE_0001,1,early
2,ASSAY_CLONE_0001_P01_viability,viability,92.82322,%,Vi-CELL,B_P01,CLONE_0001,1,early
3,ASSAY_CLONE_0001_P01_aggregation,aggregation,4.440035,%,SEC-HPLC,B_P01,CLONE_0001,1,early
4,ASSAY_CLONE_0001_P02_titer,titer,3.293743,g/L,ELISA,B_P02,CLONE_0001,2,early


We exclude passages 1–2 because the earliest passages frequently show outlier behavior in real cell line development (CLD).

In [3]:
# Inclusive passage window that defines the "early" feature period.
EARLY_START = 3
EARLY_END = 10

# Series.between is inclusive on both ends by default, matching >= / <=.
in_early_window = assay["passage_number"].between(EARLY_START, EARLY_END)
assay_early = assay.loc[in_early_window].copy()
print("Assay early rows:", len(assay_early))
assay_early.head()

Assay early rows: 66000


Unnamed: 0,assay_id,assay_type,value,unit,method,batch_id,clone_id,passage_number,phase
8,ASSAY_CLONE_0001_P03_titer,titer,3.106693,g/L,ELISA,B_P03,CLONE_0001,3,early
9,ASSAY_CLONE_0001_P03_vcd,vcd,11172110.0,cells/mL,Vi-CELL,B_P03,CLONE_0001,3,early
10,ASSAY_CLONE_0001_P03_viability,viability,93.76113,%,Vi-CELL,B_P03,CLONE_0001,3,early
11,ASSAY_CLONE_0001_P03_aggregation,aggregation,4.755091,%,SEC-HPLC,B_P03,CLONE_0001,3,early
12,ASSAY_CLONE_0001_P03_ddpcr_cn,ddpcr_cn,2.0,copies/cell,ddPCR,B_P03,CLONE_0001,3,early


In [4]:
# -----------------------------
# ddPCR copy number feature (clone-level)
# - ddpcr_cn is stored once per clone at passage = EARLY_START (e.g., 3)
# - The long-format rows are collapsed to one `ddpcr_cn` value per clone
# -----------------------------

is_ddpcr = assay_early["assay_type"] == "ddpcr_cn"

# Averaging is a safety net: if a clone unexpectedly has several ddPCR rows,
# they are reduced to their mean instead of duplicating the clone.
ddpcr = (
    assay_early.loc[is_ddpcr, ["clone_id", "value"]]
    .rename(columns={"value": "ddpcr_cn"})
    .groupby("clone_id", as_index=False)["ddpcr_cn"]
    .mean()
)

print("ddPCR rows:", len(ddpcr))
ddpcr.head()

ddPCR rows: 2000


Unnamed: 0,clone_id,ddpcr_cn
0,CLONE_0001,2.0
1,CLONE_0002,3.0
2,CLONE_0003,2.0
3,CLONE_0004,2.0
4,CLONE_0005,3.0


In [5]:
# Reshape long -> wide: one row per (clone, passage), one column per assay type.
# Repeated measurements of the same assay in a passage are averaged.
pivoted = assay_early.pivot_table(
    values="value",
    index=["clone_id", "passage_number"],
    columns="assay_type",
    aggfunc="mean",
)
early_wide = pivoted.reset_index()

early_wide.head()

assay_type,clone_id,passage_number,aggregation,ddpcr_cn,titer,vcd,viability
0,CLONE_0001,3,4.755091,2.0,3.106693,11172110.0,93.761132
1,CLONE_0001,4,4.579014,,3.243198,10502040.0,94.508927
2,CLONE_0001,5,4.47217,,3.122604,11831610.0,93.723408
3,CLONE_0001,6,4.742394,,3.361966,10096740.0,94.925282
4,CLONE_0001,7,4.780426,,2.908198,11898430.0,95.303396


In [6]:
# Assay columns that get per-clone summary statistics.
metrics = ["titer", "vcd", "viability", "aggregation"]

# Same four statistics for every metric.
stat_names = ("mean", "std", "min", "max")
agg_dict = {metric: list(stat_names) for metric in metrics}

summary = early_wide.groupby("clone_id")[metrics].agg(agg_dict)
# Flatten the (metric, statistic) column MultiIndex into "metric_statistic".
summary.columns = ["_".join(pair) for pair in summary.columns]
summary = summary.reset_index()

summary.head()

Unnamed: 0,clone_id,titer_mean,titer_std,titer_min,titer_max,vcd_mean,vcd_std,vcd_min,vcd_max,viability_mean,viability_std,viability_min,viability_max,aggregation_mean,aggregation_std,aggregation_min,aggregation_max
0,CLONE_0001,3.135307,0.15306,2.908198,3.361966,11158540.0,775266.9,10096740.0,12294130.0,94.340916,1.236634,91.789294,95.594408,4.624233,0.218603,4.159051,4.799644
1,CLONE_0002,1.089709,0.201227,0.881955,1.476979,14533580.0,569550.1,13359300.0,15132320.0,96.108006,1.879599,94.000022,98.978138,3.509097,0.338647,3.021414,3.965517
2,CLONE_0003,4.715356,0.202982,4.36331,4.991713,9132412.0,799242.1,7744497.0,10047470.0,93.691616,1.949636,91.067676,97.659881,6.662889,0.41247,5.850079,7.081793
3,CLONE_0004,0.729517,0.140272,0.541439,0.88702,15322590.0,1022267.0,13804880.0,16854200.0,97.318163,1.361175,94.667649,99.239597,8.107287,0.221657,7.846371,8.540163
4,CLONE_0005,2.480311,0.215895,2.122646,2.781607,11696200.0,1088390.0,9654663.0,13619380.0,95.337131,1.995718,91.805981,97.887004,1.969017,0.288554,1.458238,2.330961


In [7]:
def slope(x, y):
    """Return the least-squares slope of ``y`` regressed on ``x``.

    Points where either coordinate is NaN or infinite are dropped before
    fitting, so a single missing measurement does not turn the whole slope
    into NaN (or make ``np.polyfit`` fail).

    Parameters
    ----------
    x, y : array-like of numbers, same length
        Sample positions (e.g. passage numbers) and the observed values.

    Returns
    -------
    float
        Slope of the degree-1 fit, or ``np.nan`` when fewer than two usable
        points remain or all usable ``x`` values are identical (the slope is
        undefined in both cases).
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    usable = np.isfinite(x) & np.isfinite(y)
    x, y = x[usable], y[usable]

    # A line needs at least two points with distinct x positions.
    if len(x) < 2 or x.max() == x.min():
        return np.nan
    return np.polyfit(x, y, 1)[0]

# One record per clone: the slope of each metric over the whole early window.
slope_rows = []
for clone_id, clone_df in early_wide.groupby("clone_id"):
    passages = clone_df["passage_number"].values
    per_metric = {
        f"{m}_slope": slope(passages, clone_df[m].values) if m in clone_df.columns else np.nan
        for m in metrics
    }
    slope_rows.append({"clone_id": clone_id, **per_metric})

slopes = pd.DataFrame(slope_rows)
slopes.head()

Unnamed: 0,clone_id,titer_slope,vcd_slope,viability_slope,aggregation_slope
0,CLONE_0001,-0.016041,14782.194888,-0.045275,-0.012478
1,CLONE_0002,-0.009366,-57743.82655,0.36679,0.020807
2,CLONE_0003,-0.063262,210030.952788,-0.2632,0.083494
3,CLONE_0004,0.0238,91570.588994,-0.315058,0.03826
4,CLONE_0005,-0.071277,114437.482379,0.454439,0.032461


In [8]:
# Snapshot of every metric at the final early passage (EARLY_END, e.g. p10).
at_last_passage = early_wide["passage_number"] == EARLY_END
last_value_names = {m: f"{m}_p{EARLY_END}" for m in metrics}

last_df = (
    early_wide.loc[at_last_passage, ["clone_id", *metrics]]
    .rename(columns=last_value_names)
)

last_df.head()

assay_type,clone_id,titer_p10,vcd_p10,viability_p10,aggregation_p10
7,CLONE_0001,3.127747,10700600.0,91.789294,4.799644
15,CLONE_0002,1.208836,14645480.0,96.458442,3.743098
23,CLONE_0003,4.675628,10047470.0,91.067676,6.676099
31,CLONE_0004,0.865722,15099620.0,94.667649,8.14796
39,CLONE_0005,2.384941,11744740.0,97.342756,1.675812


In [9]:
def slope_in_window(df, m, start_p, end_p):
    """Slope of metric ``m`` over passages ``start_p``..``end_p`` (inclusive).

    ``df`` must have a ``passage_number`` column. Returns ``np.nan`` when the
    window holds fewer than two rows or ``m`` is not a column of ``df``;
    otherwise delegates to ``slope`` on the rows inside the window.
    """
    inside = df["passage_number"].between(start_p, end_p)
    window = df[inside]
    if m not in window.columns or len(window) < 2:
        return np.nan
    return slope(window["passage_number"].values, window[m].values)

# Split the early window at its midpoint so the two sub-windows follow
# EARLY_START / EARLY_END instead of being pinned to passages 6 and 7.
# For the default 3–10 window this gives 3–6 and 7–10, i.e. exactly the
# same column names as before (e.g. titer_slope_3_6, titer_slope_7_10).
SPLIT_END = (EARLY_START + EARLY_END) // 2
SPLIT_START = SPLIT_END + 1

split_rows = []
for clone_id, df in early_wide.groupby("clone_id"):
    row = {"clone_id": clone_id}
    for m in metrics:
        first_half = slope_in_window(df, m, EARLY_START, SPLIT_END)
        second_half = slope_in_window(df, m, SPLIT_START, EARLY_END)
        row[f"{m}_slope_{EARLY_START}_{SPLIT_END}"] = first_half
        row[f"{m}_slope_{SPLIT_START}_{EARLY_END}"] = second_half
        # curvature = change in slope (late - early)
        row[f"{m}_curvature"] = second_half - first_half
    split_rows.append(row)

splits = pd.DataFrame(split_rows)
splits.head()

Unnamed: 0,clone_id,titer_slope_3_6,titer_slope_7_10,titer_curvature,vcd_slope_3_6,vcd_slope_7_10,vcd_curvature,viability_slope_3_6,viability_slope_7_10,viability_curvature,aggregation_slope_3_6,aggregation_slope_7_10,aggregation_curvature
0,CLONE_0001,0.064522,0.035321,-0.029201,-189654.729024,-511498.127082,-321843.4,0.270693,-1.006938,-1.277631,-0.014493,-0.048937,-0.034444
1,CLONE_0002,0.095352,0.063887,-0.031465,-10269.447233,248731.605111,259001.1,-0.461976,-0.140552,0.321425,-0.060197,0.073192,0.13339
2,CLONE_0003,-0.058192,-0.012949,0.045243,516646.652324,163417.041172,-353229.6,0.761037,-0.414967,-1.176004,0.405413,0.022836,-0.382577
3,CLONE_0004,0.072696,0.09504,0.022344,49106.885757,-28541.484846,-77648.37,0.278075,-0.909356,-1.187431,0.021945,0.056316,0.034371
4,CLONE_0005,-0.003985,-0.020517,-0.016532,944200.151614,-71164.088863,-1015364.0,0.259886,1.315187,1.055302,0.075487,-0.144875,-0.220361


In [10]:
# qP proxy: titer / vcd
# eps only protects against an exactly-zero denominator; it does not
# otherwise change the ratio in any meaningful way.
eps = 1e-9

# Column names of the last-passage values, derived from EARLY_END so they
# always match the columns produced for `last_df`.
titer_last = f"titer_p{EARLY_END}"
vcd_last = f"vcd_p{EARLY_END}"

qp = summary[["clone_id"]].copy()
qp["qP_mean"] = summary["titer_mean"] / (summary["vcd_mean"] + eps)

# merge last values for the final early passage
last_for_qp = last_df[["clone_id", titer_last, vcd_last]].copy()
qp = qp.merge(last_for_qp, on="clone_id", how="left")

# The feature name follows EARLY_END (qP_p10 for the default window) so the
# label cannot go stale when the window is changed.
qp[f"qP_p{EARLY_END}"] = qp[titer_last] / (qp[vcd_last] + eps)

# drop the temporary columns; they are already provided by `last_df`
qp = qp.drop(columns=[titer_last, vcd_last])

In [11]:
# CV features (Coefficient of Variation): std / mean
# Expresses each metric's spread relative to its own level.

eps = 1e-9  # keeps the denominator non-zero when a mean is exactly 0

cv_columns = {
    f"{m}_cv": summary[f"{m}_std"] / (summary[f"{m}_mean"] + eps)
    for m in metrics
}
cv = summary[["clone_id"]].assign(**cv_columns)

cv.head()

Unnamed: 0,clone_id,titer_cv,vcd_cv,viability_cv,aggregation_cv
0,CLONE_0001,0.048818,0.069477,0.013108,0.047273
1,CLONE_0002,0.184661,0.039189,0.019557,0.096506
2,CLONE_0003,0.043047,0.087517,0.020809,0.061906
3,CLONE_0004,0.192281,0.066716,0.013987,0.02734
4,CLONE_0005,0.087044,0.093055,0.020933,0.146547


In [12]:
# Goal: exactly one culture_mode per clone.
# process_condition is keyed by passage_id, so it is joined to passage to
# recover the clone each condition row belongs to.
# NOTE(review): this query is not restricted to the EARLY_START..EARLY_END
# window — confirm that using all passages is intended for an early-only table.
MODE_QUERY = """
SELECT p.clone_id, pc.culture_mode
FROM process_condition pc
JOIN passage p ON p.passage_id = pc.passage_id
"""
mode_df = pd.read_sql_query(sql=MODE_QUERY, con=conn)

# If culture_mode varies across passages (rare in our simulator), keep the
# most frequent mode per clone: count, rank by count, keep the top row.
mode_counts = mode_df.groupby(["clone_id", "culture_mode"]).size().reset_index(name="n")
mode_ranked = mode_counts.sort_values(["clone_id", "n"], ascending=[True, False])
mode_major = mode_ranked.drop_duplicates("clone_id")[["clone_id", "culture_mode"]]

# One-hot encode (fed-batch/perfusion); the dummies share mode_major's index,
# so an index join lines them up with the clone ids.
mode_ohe = pd.get_dummies(mode_major["culture_mode"], prefix="culture_mode")
mode_features = mode_major[["clone_id"]].join(mode_ohe)

mode_features.head()

Unnamed: 0,clone_id,culture_mode_fed-batch,culture_mode_perfusion
0,CLONE_0001,False,True
1,CLONE_0002,True,False
2,CLONE_0003,True,False
3,CLONE_0004,True,False
4,CLONE_0005,True,False


In [13]:
# Assemble the clone-level feature matrix: start from the summary statistics
# and left-join every other feature block on clone_id, in this order.
feature_blocks = [slopes, last_df, splits, qp, cv, mode_features, ddpcr]

X_v2 = summary
for block in feature_blocks:
    X_v2 = X_v2.merge(block, on="clone_id", how="left")

print("X_v2 shape:", X_v2.shape)
X_v2.head()

X_v2 shape: (2000, 46)


Unnamed: 0,clone_id,titer_mean,titer_std,titer_min,titer_max,vcd_mean,vcd_std,vcd_min,vcd_max,viability_mean,...,aggregation_curvature,qP_mean,qP_p10,titer_cv,vcd_cv,viability_cv,aggregation_cv,culture_mode_fed-batch,culture_mode_perfusion,ddpcr_cn
0,CLONE_0001,3.135307,0.15306,2.908198,3.361966,11158540.0,775266.9,10096740.0,12294130.0,94.340916,...,-0.034444,2.809783e-07,2.922963e-07,0.048818,0.069477,0.013108,0.047273,False,True,2.0
1,CLONE_0002,1.089709,0.201227,0.881955,1.476979,14533580.0,569550.1,13359300.0,15132320.0,96.108006,...,0.13339,7.497872e-08,8.253985e-08,0.184661,0.039189,0.019557,0.096506,True,False,3.0
2,CLONE_0003,4.715356,0.202982,4.36331,4.991713,9132412.0,799242.1,7744497.0,10047470.0,93.691616,...,-0.382577,5.163319e-07,4.653539e-07,0.043047,0.087517,0.020809,0.061906,True,False,2.0
3,CLONE_0004,0.729517,0.140272,0.541439,0.88702,15322590.0,1022267.0,13804880.0,16854200.0,97.318163,...,0.034371,4.761053e-08,5.733401e-08,0.192281,0.066716,0.013987,0.02734,True,False,2.0
4,CLONE_0005,2.480311,0.215895,2.122646,2.781607,11696200.0,1088390.0,9654663.0,13619380.0,95.337131,...,-0.220361,2.120613e-07,2.030646e-07,0.087044,0.093055,0.020933,0.146547,True,False,3.0


In [14]:
# Fraction of rows in the feature matrix whose ddpcr_cn is missing.
ddpcr_missing_rate = X_v2[["ddpcr_cn"]].isna().mean()
ddpcr_missing_rate

ddpcr_cn    0.0
dtype: float64

In [15]:
# Target: productivity drop per clone, read from the stability_test table.
LABEL_QUERY = """
SELECT clone_id, productivity_drop_pct
FROM stability_test
"""
y = pd.read_sql_query(sql=LABEL_QUERY, con=conn)

# Inner join: only clones that have both features and a label are kept.
dataset_v2 = pd.merge(X_v2, y, on="clone_id", how="inner")
print("dataset_v2 shape:", dataset_v2.shape)
dataset_v2.head()

dataset_v2 shape: (2000, 47)


Unnamed: 0,clone_id,titer_mean,titer_std,titer_min,titer_max,vcd_mean,vcd_std,vcd_min,vcd_max,viability_mean,...,qP_mean,qP_p10,titer_cv,vcd_cv,viability_cv,aggregation_cv,culture_mode_fed-batch,culture_mode_perfusion,ddpcr_cn,productivity_drop_pct
0,CLONE_0001,3.135307,0.15306,2.908198,3.361966,11158540.0,775266.9,10096740.0,12294130.0,94.340916,...,2.809783e-07,2.922963e-07,0.048818,0.069477,0.013108,0.047273,False,True,2.0,0.271237
1,CLONE_0002,1.089709,0.201227,0.881955,1.476979,14533580.0,569550.1,13359300.0,15132320.0,96.108006,...,7.497872e-08,8.253985e-08,0.184661,0.039189,0.019557,0.096506,True,False,3.0,0.52492
2,CLONE_0003,4.715356,0.202982,4.36331,4.991713,9132412.0,799242.1,7744497.0,10047470.0,93.691616,...,5.163319e-07,4.653539e-07,0.043047,0.087517,0.020809,0.061906,True,False,2.0,0.338851
3,CLONE_0004,0.729517,0.140272,0.541439,0.88702,15322590.0,1022267.0,13804880.0,16854200.0,97.318163,...,4.761053e-08,5.733401e-08,0.192281,0.066716,0.013987,0.02734,True,False,2.0,0.646568
4,CLONE_0005,2.480311,0.215895,2.122646,2.781607,11696200.0,1088390.0,9654663.0,13619380.0,95.337131,...,2.120613e-07,2.030646e-07,0.087044,0.093055,0.020933,0.146547,True,False,3.0,0.492373


In [16]:
# Optional sanity check: share of missing values per column, worst 15 first
missing_rate = dataset_v2.isna().mean()
missing_rate.sort_values(ascending=False).head(15)

clone_id                  0.0
aggregation_slope_7_10    0.0
titer_slope_7_10          0.0
titer_curvature           0.0
vcd_slope_3_6             0.0
vcd_slope_7_10            0.0
vcd_curvature             0.0
viability_slope_3_6       0.0
viability_slope_7_10      0.0
viability_curvature       0.0
aggregation_slope_3_6     0.0
aggregation_curvature     0.0
aggregation_p10           0.0
qP_mean                   0.0
qP_p10                    0.0
dtype: float64

In [17]:
# Mean of each one-hot column = share of clones in that culture mode.
mode_cols = ["culture_mode_fed-batch", "culture_mode_perfusion"]
dataset_v2[mode_cols].mean()

culture_mode_fed-batch    0.8545
culture_mode_perfusion    0.1455
dtype: float64

In [18]:
# Persist both tables: features only, and features plus the label column.
OUT_DIR = Path("../data/synthetic/processed")
OUT_DIR.mkdir(parents=True, exist_ok=True)

features_path = OUT_DIR / "cld_features_v2.csv"
labelled_path = OUT_DIR / "cld_features_with_label_v2.csv"

X_v2.to_csv(features_path, index=False)
dataset_v2.to_csv(labelled_path, index=False)

print("Saved:")
for saved_path in (features_path, labelled_path):
    print("-", saved_path)

Saved:
- ../data/synthetic/processed/cld_features_v2.csv
- ../data/synthetic/processed/cld_features_with_label_v2.csv


In [19]:
# Release the SQLite connection opened at the top of the notebook.
conn.close()
print("DB connection closed.")

DB connection closed.
