## Sample Dataset

In [2]:
import numpy as np, pandas as pd
rng = np.random.default_rng(42)

N = 12000     # cells
BATCHES = 12  # production batches
batch_id = rng.integers(1, BATCHES+1, size=N)

# Process parameters (think coating thickness, drying temp, calender pressure, etc.)
coat_thickness = rng.normal(80, 3, N) + (batch_id-6)*0.4
dry_temp = rng.normal(110, 5, N) + (batch_id-6)*0.8
cal_press = rng.normal(200, 10, N)
slurry_visc = rng.normal(950, 60, N)

# EOL electrical: DCIR, capacity, voltage drift, leakage current
dcir = rng.normal(35, 3, N) + 0.05*(coat_thickness-80) + 0.03*(dry_temp-110)
capacity = rng.normal(5.0, 0.15, N) - 0.004*(dcir-35) - 0.001*(slurry_visc-950)
v_drift = rng.normal(0.0, 0.015, N) + 0.0005*(dry_temp-110)
leak_uA = np.abs(rng.normal(8, 2, N) + 0.1*(dcir-35))

# Label: pass/fail based on multiple thresholds (nonlinear + noise)
z = (capacity > 4.85).astype(int) * (dcir < 39).astype(int) * (leak_uA < 12).astype(int)
y = np.where(rng.random(N) < 0.03, 1-z, z)  # inject label noise

df = pd.DataFrame({
    "batch_id": batch_id,
    "coat_thickness": coat_thickness,
    "dry_temp": dry_temp,
    "cal_press": cal_press,
    "slurry_visc": slurry_visc,
    "dcir_mOhm": dcir,
    "capacity_Ah": capacity,
    "volt_drift_V": v_drift,
    "leak_uA": leak_uA,
    "label_pass": y
})
df.head()


Unnamed: 0,batch_id,coat_thickness,dry_temp,cal_press,slurry_visc,dcir_mOhm,capacity_Ah,volt_drift_V,leak_uA,label_pass
0,2,75.289018,109.330023,199.514705,1004.697721,33.679468,5.053454,0.0075,8.205065,1
1,10,83.047906,122.499463,210.754528,934.070411,31.054683,5.046669,0.029819,7.161909,1
2,8,78.977413,104.938115,200.404815,914.567368,32.442401,4.919478,-0.007291,6.215346,1
3,6,75.052801,113.652607,209.285899,950.575466,37.257456,5.213707,-0.006751,9.180954,1
4,6,79.137587,114.834181,196.978549,947.359314,33.893907,4.949073,-0.003119,7.183038,1


## Train/validation/test split by batch (prevents leakage)

In [3]:
from sklearn.model_selection import GroupShuffleSplit

X = df.drop(columns=["label_pass"])
y = df["label_pass"].values
groups = df["batch_id"].values

gss = GroupShuffleSplit(n_splits=1, train_size=0.7, random_state=7)
train_idx, temp_idx = next(gss.split(X, y, groups))
X_train, y_train, g_train = X.iloc[train_idx], y[train_idx], groups[train_idx]

gss2 = GroupShuffleSplit(n_splits=1, train_size=0.5, random_state=7)
val_idx, test_idx = next(gss2.split(X.iloc[temp_idx], y[temp_idx], groups[temp_idx]))
val_idx, test_idx = temp_idx[val_idx], temp_idx[test_idx]

X_val, y_val = X.iloc[val_idx], y[val_idx]
X_test, y_test = X.iloc[test_idx], y[test_idx]


## Baseline modeling pipeline (scikit-learn Pipeline + PCA)

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.ensemble import RandomForestClassifier

numeric = X.columns.tolist()

pre = ColumnTransformer([
    ("scale", StandardScaler(), numeric),
], remainder="drop")

model = Pipeline([
    ("pre", pre),
    ("pca", PCA(n_components=0.95, svd_solver="full")),  # keep 95% variance
    ("rf", RandomForestClassifier(
        n_estimators=300, max_depth=None, class_weight="balanced_subsample", random_state=7
    ))
])

model.fit(X_train, y_train)
proba_val = model.predict_proba(X_val)[:,1]
print("Val ROC-AUC:", roc_auc_score(y_val, proba_val))
print("Val PR-AUC :", average_precision_score(y_val, proba_val))


Val ROC-AUC: 0.9486027053622872
Val PR-AUC : 0.9659629969009783


## GridSearchCV Tuning

In [5]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "rf__n_estimators": [200, 400],
    "rf__max_depth": [None, 10, 20],
    "rf__min_samples_leaf": [1, 3, 5]
}
gs = GridSearchCV(model, param_grid, scoring="roc_auc", cv=3, n_jobs=-1)
gs.fit(X_train, y_train)
best = gs.best_estimator_
print(gs.best_params_)


{'rf__max_depth': 20, 'rf__min_samples_leaf': 1, 'rf__n_estimators': 200}
