In [1]:
N_CLONES = 2000
DB_PATH = f"../data/synthetic/raw/cld_{N_CLONES}clones.db"

# Notebook 03 - ML modeling for CLD (Stability Prediction)

## Goal
Train machine learning models that predict stability using early CLD measurements.

We will build two model types:
1) **Regression**: predict 'productivity_drop_pct' (continuous)
2) **Classification**: predict 'stable vs unstable' using a threshhold

## Why both?
- Regression provides a continuous risk estimate (useful for ranking)
- Classification maps directly to a decision rule (drop or keep)

## Key constraints
- Use only early-passage-derived features (already done in Notebook 02)
- Avoid leakage (do not use late measurements)

## 01) Import libraries

In [68]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

## 02) Load ML dataset created in Notebook 02 - cell 08

In [69]:
DATA_PATH = "../data/synthetic/processed/cld_features_with_label.csv"
dataset = pd.read_csv(DATA_PATH)

dataset.head()

Unnamed: 0,clone_id,titer_mean,titer_std,titer_min,titer_max,vcd_mean,vcd_std,vcd_min,vcd_max,viability_mean,...,viability_max,aggregation_mean,aggregation_std,aggregation_min,aggregation_max,titer_slope,vcd_slope,viability_slope,aggregation_slope,productivity_drop_pct
0,CLONE_0001,2.538067,0.293036,2.223711,2.964514,11076190.0,905255.6,9779103.0,12322580.0,94.851455,...,97.386709,8.356812,0.403848,7.723455,8.89403,-0.103703,102268.665747,0.169734,-0.019225,0.387063
1,CLONE_0002,0.814721,0.213007,0.537981,1.132518,14410910.0,1047019.0,13460700.0,16053410.0,97.551824,...,99.573812,7.337188,0.449925,6.798531,8.011004,-0.005035,279733.381794,0.198861,-0.023819,0.135156
2,CLONE_0003,3.912552,0.208697,3.621956,4.261524,8684126.0,583412.0,7780120.0,9384241.0,94.390688,...,98.774683,2.235352,0.326064,1.745532,2.861836,-0.058117,51853.805966,0.505344,0.038583,0.335258
3,CLONE_0004,0.488369,0.160312,0.212916,0.747609,15117250.0,781766.5,14054530.0,16064250.0,96.380534,...,98.968096,3.86024,0.305128,3.357574,4.286918,-0.026671,278866.456374,0.504642,0.051834,0.590633
4,CLONE_0005,2.238289,0.160672,2.033612,2.459557,11171950.0,1167196.0,8874346.0,12390300.0,95.085238,...,96.870619,3.231717,0.405157,2.620058,4.001488,-0.034059,391064.583721,0.233446,-0.129212,0.291757


## 03) Prepare features (x) and target (y)

We drop clone_id from x and keep it separately for reference.

In [70]:
# Keep clone_id for later inspection
clone_id = dataset["clone_id"].copy()

# Target for regression
y_reg = dataset["productivity_drop_pct"].copy()
y_reg = y_reg.clip(lower=0.0, upper=1.0)

# Feature matrix
X = dataset.drop(columns=["clone_id", "productivity_drop_pct"])

# Simple NaN handling (should be minimal)
X = X.fillna(X.median(numeric_only=True))

print("X shape:", X.shape)
print("y_reg shape:", y_reg.shape)

X shape: (500, 20)
y_reg shape: (500,)


## 04) Train/test split

We hold out 20% for evaluation

In [71]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y_reg, test_size=0.2, random_state=42
)

## 05) Regression (baseline): Linear Regression
A simple baseline model

In [72]:
lr_scaled = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LinearRegression())
])

lr_scaled.fit(X_train, y_train)
pred_lr = lr_scaled.predict(X_test)

mae = mean_absolute_error(y_test, pred_lr)
r2 = r2_score(y_test, pred_lr)

print(f"Scaled Linear Regression - MAE: {mae:.4f}")
print(f"Scaled Linear Regression - R2: {r2:.4f}")

Scaled Linear Regression - MAE: 0.0824
Scaled Linear Regression - R2: 0.0465


## 06) Regression (stronger baseline): Random Forest Regressor

Non-linear model that can caputre interactions between features.

In [73]:
rf = RandomForestRegressor(
    n_estimators=500,
    random_state=42,
    min_samples_leaf=5,
    max_features='sqrt'
)
rf.fit(X_train, y_train)
pred_rf = rf.predict(X_test)

mae_rf = mean_absolute_error(y_test, pred_rf)
r2_rf = r2_score(y_test, pred_rf)

print("Random Forest MAE:", mae_rf)
print("Random Forest R2:", r2_rf)

Random Forest MAE: 0.08359412202009618
Random Forest R2: 0.029879706732028777


## 07) Classification label definition

We define stable vs unstable using a threshold on productivity drop.
Users can later change this threshold based on business / process requirements.

In [74]:
THRESHOLD = 0.30  # example: 30% drop cutoff

y_cls = (y_reg <= THRESHOLD).astype(int)  # 1 = stable, 0 = unstable

print("Class balance (1=stable):")
print(y_cls.value_counts(normalize=True))

Class balance (1=stable):
productivity_drop_pct
1    0.606
0    0.394
Name: proportion, dtype: float64


## 08) Classification split

In [75]:
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X, y_cls, test_size=0.2, random_state=42, stratify=y_cls
)

## 09) Classification baseline: Logistic Regression

Works well for tabular features and provides interpretable coefficients.

In [76]:
logreg = LogisticRegression(max_iter=2000, class_weight='balanced')
logreg.fit(X_train_c, y_train_c)

proba = logreg.predict_proba(X_test_c)[:, 1]
pred_c = (proba >= 0.3).astype(int)

auc = roc_auc_score(y_test_c, proba)
acc = accuracy_score(y_test_c, pred_c)
prec = precision_score(y_test_c, pred_c)
rec = recall_score(y_test_c, pred_c)

print("Logistic Regression AUC:", auc)
print("Accuracy:", acc, "Precision:", prec, "Recall:", rec)
print("Confusion matrix:\n", confusion_matrix(y_test_c, pred_c))

Logistic Regression AUC: 0.5809163514081547
Accuracy: 0.58 Precision: 0.6021505376344086 Recall: 0.9180327868852459
Confusion matrix:
 [[ 2 37]
 [ 5 56]]


## 10) Classification: Random Forest

Non-linear classifier for potentially better performance.

In [77]:
rf_c = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    class_weight='balanced'
)
rf_c.fit(X_train_c, y_train_c)

proba_rf = rf_c.predict_proba(X_test_c)[:, 1]
pred_rf_c = (proba_rf >= 0.3).astype(int)

auc_rf = roc_auc_score(y_test_c, proba_rf)
acc_rf = accuracy_score(y_test_c, pred_rf_c)
prec_rf = precision_score(y_test_c, pred_rf_c)
rec_rf = recall_score(y_test_c, pred_rf_c)

print("Random Forest AUC:", auc_rf)
print("Accuracy:", acc_rf, "Precision:", prec_rf, "Recall:", rec_rf)
print("Confusion matrix:\n", confusion_matrix(y_test_c, pred_rf_c))

Random Forest AUC: 0.7400168137873055
Accuracy: 0.59 Precision: 0.6020408163265306 Recall: 0.9672131147540983
Confusion matrix:
 [[ 0 39]
 [ 2 59]]


## 11) Feature importance (Random Forest)

This gives an initial sense of which early metrics drive predictions.

In [78]:
importances = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)
importances.head(15)

titer_slope         0.091929
vcd_slope           0.083935
titer_std           0.062727
vcd_mean            0.053703
titer_mean          0.053443
vcd_std             0.051817
aggregation_mean    0.051454
titer_min           0.047806
aggregation_max     0.047365
titer_max           0.047248
aggregation_std     0.046167
vcd_min             0.044467
aggregation_min     0.044394
viability_max       0.043754
viability_std       0.042592
dtype: float64

## Summary

We trained:
- Regression models predicting continuous stability drop ('productivity_drop_pct')
- Classification models predicting stable vs unstable clones using a threshold

Next step (Notebook 04):
- Use the model predictions to simulate **early clone drop decision-making**
- Compare baseline vs ML-guided outcomes

In [79]:
y_reg.describe()

count    500.000000
mean       0.270309
std        0.126594
min        0.000000
25%        0.178165
50%        0.270368
75%        0.355440
max        0.755038
Name: productivity_drop_pct, dtype: float64

In [80]:
tmp = dataset[["titer_mean", "productivity_drop_pct"]].dropna()
tmp.corr(numeric_only=True)

Unnamed: 0,titer_mean,productivity_drop_pct
titer_mean,1.0,-0.064024
productivity_drop_pct,-0.064024,1.0
