# Biologics Pharmacology AI/ML Workflow Notebook

This notebook provides an interactive, end-to-end walkthrough for building and validating data-driven models to support early decisions in biologics drug development.

## Notebook flow (separated stages)

1. **Imports**
2. **Preprocessing**
3. **Cleaning of data**
4. **Introducing a model**
5. **Training**
6. **Testing**
7. **Giving predictions**

This structure is intentionally separated so each step is easy to follow in Jupyter.

In [None]:
# 1) Imports
from pathlib import Path
import sys

import joblib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from IPython.display import display
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import GroupShuffleSplit

# Put the current working directory at the front of the import path so the
# project-local packages imported later in this cell can be found.
# NOTE(review): assumes the notebook is launched from the project root — confirm.
PROJECT_ROOT = Path.cwd()
if not any(entry == str(PROJECT_ROOT) for entry in sys.path):
    sys.path.insert(0, str(PROJECT_ROOT))

from biologics_pharmacology.decision_support import rank_candidates
from biologics_pharmacology.modeling import (
    BiologicsModelArtifacts,
    build_pipeline,
    run_grouped_cross_validation,
)
from biologics_pharmacology.schema import (
    CATEGORICAL_FEATURES,
    GROUP_COLUMN,
    ID_COLUMN,
    NUMERIC_FEATURES,
    TARGET_COLUMNS,
    feature_columns,
    required_columns,
    validate_dataset_schema,
)
from scripts.generate_synthetic_biologics_data import generate_dataset

# Notebook-wide presentation settings: seaborn theme for plots and pandas
# limits on how many columns / characters are shown when frames are displayed.
sns.set_theme(context="notebook", style="whitegrid")
for option_name, option_value in (
    ("display.max_columns", 120),
    ("display.width", 140),
):
    pd.set_option(option_name, option_value)

## Raw data ingestion

Load the dataset from `data/synthetic_biologics.csv` if that file already exists; otherwise generate synthetic data for demonstration and save it to that path so later runs reuse the same file.

In [None]:
# Location of the dataset CSV; make sure its folder exists before any write.
data_path = Path("data/synthetic_biologics.csv")
data_path.parent.mkdir(parents=True, exist_ok=True)

if not data_path.exists():
    # No file on disk yet: create a seeded synthetic dataset and persist it
    # so that subsequent runs load exactly the same records.
    raw_df = generate_dataset(n_samples=500, seed=42)
    raw_df.to_csv(data_path, index=False)
    source = "generated and saved"
else:
    raw_df = pd.read_csv(data_path)
    source = "loaded from existing CSV"

# Short provenance / size summary followed by a preview of the first rows.
print(f"Dataset source: {source}")
print(f"Raw shape: {raw_df.shape}")
print(f"Unique molecule families: {raw_df[GROUP_COLUMN].nunique()}")

display(raw_df.head())

## 2) Preprocessing

In this step we standardize categorical text and coerce numeric columns to numeric dtype.

In [None]:
# Work on a copy so raw_df stays untouched and this cell can be re-run.
preprocessed_df = raw_df.copy()

# Normalise categorical text (trim whitespace, lower-case) while keeping
# missing values missing. A plain ``astype(str)`` turns NaN into the literal
# text "nan", which ``isna()`` / ``fillna()`` can no longer detect, so the
# normalised text is masked back to NaN wherever the original value was missing.
for col in CATEGORICAL_FEATURES + [GROUP_COLUMN]:
    original_values = preprocessed_df[col]
    normalised_text = original_values.astype(str).str.strip().str.lower()
    preprocessed_df[col] = normalised_text.where(original_values.notna())

# Coerce numeric features and targets to numeric dtype; entries that cannot be
# parsed become NaN (errors="coerce") instead of raising.
for col in NUMERIC_FEATURES + TARGET_COLUMNS:
    preprocessed_df[col] = pd.to_numeric(preprocessed_df[col], errors="coerce")

# Report the ten required columns with the most missing entries.
print("Missing values after type coercion (top 10):")
display(
    preprocessed_df[required_columns()]
    .isna()
    .sum()
    .sort_values(ascending=False)
    .head(10)
    .to_frame("missing_count")
)

display(preprocessed_df.head())

## 3) Cleaning of data

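This step removes duplicate IDs (keeping the first occurrence), fills missing numeric values with the column median and missing categorical values with `"unknown"`, and clips percentage (`*_pct`) fields to the 0–100 range.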

In [None]:
# Start from a copy so the preprocessed frame remains available for inspection.
clean_df = preprocessed_df.copy()

# De-duplicate on the record identifier (drop_duplicates keeps the first row
# seen for each ID) and renumber the index.
rows_before = len(clean_df)
clean_df = clean_df.drop_duplicates(subset=[ID_COLUMN]).reset_index(drop=True)
rows_after = len(clean_df)

# Median-fill every numeric feature and target column.
# NOTE(review): targets are imputed here too, and the medians are computed on
# the full dataset before the train/test split — confirm this is intended.
clean_df = clean_df.assign(
    **{
        name: clean_df[name].fillna(clean_df[name].median())
        for name in NUMERIC_FEATURES + TARGET_COLUMNS
    }
)

# Missing categorical / grouping values get an explicit placeholder label.
clean_df = clean_df.assign(
    **{
        name: clean_df[name].fillna("unknown")
        for name in CATEGORICAL_FEATURES + [GROUP_COLUMN]
    }
)

# Range guard: any column whose name ends in "_pct" is clipped to [0, 100].
pct_cols = list(filter(lambda name: name.endswith("_pct"), clean_df.columns))
clean_df[pct_cols] = clean_df[pct_cols].clip(0, 100)

validate_dataset_schema(clean_df)

print(f"Rows before cleaning: {rows_before}")
print(f"Rows after cleaning:  {rows_after}")
remaining_missing = clean_df[required_columns()].isna().sum().sum()
print("Any missing values left in required columns:", remaining_missing)

display(clean_df[required_columns()].describe(include="all").T.head(12))

## 4) Introducing a model

We use a multi-output pipeline:

- Imputation + one-hot encoding
- Random forest regressor wrapped for multi-target prediction
- Targets: PK half-life, PD response, and severe AE rate

In [None]:
# Build the (still unfitted) pipeline and show its structure.
model = build_pipeline()

print("Model pipeline object:")
display(model)

# Summarise the inputs and outputs the pipeline is configured for.
print("\nFeature summary")
for summary_line in (
    f"- Numeric features: {len(NUMERIC_FEATURES)}",
    f"- Categorical features: {len(CATEGORICAL_FEATURES)}",
    f"- Total model features: {len(feature_columns())}",
    f"- Targets: {TARGET_COLUMNS}",
):
    print(summary_line)

## 5) Training

We split by molecule family using `GroupShuffleSplit` to avoid leakage across related molecules.

In [None]:
# Feature matrix, target matrix and the grouping key used for the split.
X = clean_df[feature_columns()].copy()
y = clean_df[TARGET_COLUMNS].copy()
groups = clean_df[GROUP_COLUMN]

# A single grouped shuffle split: every group lands entirely in train or in
# test, with roughly 20% held out. The fixed seed makes the split repeatable.
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(splitter.split(X, y, groups=groups))

# Materialise both partitions with fresh 0..n-1 indices.
train_df, test_df = (
    clean_df.iloc[row_positions].reset_index(drop=True)
    for row_positions in (train_idx, test_idx)
)

# Fit the multi-target pipeline on the training partition only.
model.fit(train_df[feature_columns()], train_df[TARGET_COLUMNS])

# Bundle the fitted pipeline with the column metadata needed at prediction time.
artifacts = BiologicsModelArtifacts(
    pipeline=model,
    feature_columns=feature_columns(),
    target_columns=TARGET_COLUMNS,
    group_column=GROUP_COLUMN,
)

# Persist the bundle to disk, creating the output folder if necessary.
model_path = Path("models/biologics_multitask_model.joblib")
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(artifacts, model_path)

print(f"Training rows: {len(train_df)}")
print(f"Testing rows:  {len(test_df)}")
print(f"Saved trained artifact to: {model_path}")

## 6) Testing

We evaluate on a held-out test set and also report grouped cross-validation metrics.

In [None]:
# Predict all targets for the held-out rows; the frame shares test_df's index
# so predictions line up row-for-row with the observed values.
holdout_features = test_df[feature_columns()]
test_predictions = pd.DataFrame(
    data=model.predict(holdout_features),
    index=test_df.index,
    columns=TARGET_COLUMNS,
)

# One record per target with MAE and R^2 on the hold-out partition.
holdout_metrics = [
    {
        "target": target,
        "holdout_mae": mean_absolute_error(test_df[target], test_predictions[target]),
        "holdout_r2": r2_score(test_df[target], test_predictions[target]),
    }
    for target in TARGET_COLUMNS
]

holdout_metrics_df = pd.DataFrame(holdout_metrics).round(3)
print("Holdout test metrics:")
display(holdout_metrics_df)

# Grouped cross-validation over the whole cleaned dataset, summarised as
# mean and standard deviation per target.
cv_metrics = run_grouped_cross_validation(clean_df, n_splits=5)
per_target_scores = cv_metrics.groupby("target")[["mae", "r2"]]
cv_summary = per_target_scores.agg(["mean", "std"]).round(3)
print("\nGrouped CV summary:")
display(cv_summary)

## 7) Giving predictions

Below we generate record-level predictions and a ranked candidate list for decision support.

In [None]:
# Observed targets for the held-out rows, with the model's predictions added
# as "pred_<target>" columns.
test_results = test_df[[ID_COLUMN, GROUP_COLUMN] + TARGET_COLUMNS].copy()
for target in TARGET_COLUMNS:
    test_results[f"pred_{target}"] = test_predictions[target]

# Derive the display columns from TARGET_COLUMNS rather than repeating the
# target names by hand, so this cell keeps working if the schema's target
# list changes. Observed/predicted pairs are shown side by side.
prediction_columns = [f"pred_{target}" for target in TARGET_COLUMNS]
comparison_columns = [ID_COLUMN]
for target in TARGET_COLUMNS:
    comparison_columns += [target, f"pred_{target}"]

print("Sample predictions on held-out test set:")
display(test_results[comparison_columns].head(12).round(3))

# Score every cleaned record as a candidate and keep the 15 best.
candidate_table = clean_df[[ID_COLUMN] + feature_columns()].copy()
ranked_candidates = rank_candidates(
    artifacts=artifacts,
    candidates=candidate_table,
    top_k=15,
)

# NOTE(review): assumes rank_candidates names its prediction columns
# "pred_<target>" for each entry of TARGET_COLUMNS — confirm against
# biologics_pharmacology.decision_support.
print("Top ranked candidates for early development decisions:")
display(
    ranked_candidates[[ID_COLUMN, *prediction_columns, "decision_score"]].round(3)
)

### Optional visualization: feature importance by target

Because the model is multi-output, we inspect importances separately for PK, PD, and safety.

In [None]:
# Pull the two fitted pipeline stages needed for the importance plots.
preprocessor = artifacts.pipeline.named_steps["preprocess"]
regressor = artifacts.pipeline.named_steps["regressor"]

# Names of the columns produced by the preprocessing step.
feature_names = preprocessor.get_feature_names_out()

target_importance_tables = {}
# squeeze=False makes plt.subplots always return a 2-D array of Axes; without
# it a single-target model yields a bare Axes object and axes[idx] fails.
# Flattening restores the 1-D layout used below.
fig, axes = plt.subplots(
    1, len(artifacts.target_columns), figsize=(20, 5), squeeze=False
)
axes = axes.ravel()

# One panel per target: pair each target name with its fitted estimator.
for idx, (target, estimator) in enumerate(zip(artifacts.target_columns, regressor.estimators_)):
    importance_df = pd.DataFrame(
        {
            "feature": feature_names,
            "importance": estimator.feature_importances_,
        }
    ).sort_values("importance", ascending=False)

    target_importance_tables[target] = importance_df

    # Top 15, reversed so the most important feature is drawn at the top of
    # the horizontal bar chart.
    top_importance = importance_df.head(15).iloc[::-1]
    axes[idx].barh(top_importance["feature"], top_importance["importance"], color="#4C78A8")
    axes[idx].set_title(f"Top features: {target}")
    axes[idx].set_xlabel("Importance")
    axes[idx].tick_params(axis="y", labelsize=8)

plt.tight_layout()
plt.show()

# Tabular view of the ten most important features for each target.
for target in artifacts.target_columns:
    print(f"Top 10 features for {target}:")
    display(target_importance_tables[target].head(10).reset_index(drop=True))

### Decision landscape view

This plot shows how candidates balance efficacy, safety, and PK durability.

In [None]:
# Score every candidate (top_k equals the table size, so nothing is cut off).
scored_all = rank_candidates(
    artifacts=artifacts,
    candidates=candidate_table,
    top_k=len(candidate_table),
)

# Predicted PD response vs. predicted severe AE rate; marker size encodes the
# predicted PK half-life and colour encodes the decision score.
plt.figure(figsize=(10, 7))
scatter = sns.scatterplot(
    data=scored_all,
    x="pred_clinical_pd_response_pct",
    y="pred_severe_ae_rate_pct",
    size="pred_clinical_pk_half_life_day",
    hue="decision_score",
    palette="viridis",
    sizes=(20, 220),
    alpha=0.75,
)
scatter.set_title("Candidate decision landscape")
scatter.set_xlabel("Predicted clinical PD response (%)")
scatter.set_ylabel("Predicted severe AE rate (%)")

# Label the first five rows of the scored table. The identifier is read via
# ID_COLUMN (as in the other cells) rather than a hard-coded column name, and
# the annotation is placed on the scatter plot's own Axes.
for _, row in scored_all.head(5).iterrows():
    scatter.annotate(
        row[ID_COLUMN],
        (row["pred_clinical_pd_response_pct"], row["pred_severe_ae_rate_pct"]),
        xytext=(5, 5),
        textcoords="offset points",
        fontsize=9,
    )

plt.tight_layout()
plt.show()

# Cell output: the first ten scored candidates with their predictions.
scored_all[
    [
        ID_COLUMN,
        "decision_score",
        "pred_clinical_pd_response_pct",
        "pred_clinical_pk_half_life_day",
        "pred_severe_ae_rate_pct",
    ]
].head(10).round(3)

## 8) Adapting this notebook to real R&D data

To move from synthetic to real biologics programs:

1. Replace `raw_df` with your governed integrated dataset.
2. Keep required schema columns from `biologics_pharmacology/schema.py`.
3. Preserve grouped validation by molecule family (or program-level grouping).
4. Tune model settings and decision-score weights with QP and clinical stakeholders.
5. Add uncertainty estimates before deployment into portfolio decisions.