
# 🧪 Lab: Model Monitoring with Evidently AI (v0.7 API)

This lab demonstrates **model monitoring** for a classification model using the **Evidently 0.7** API:
- **Data definition** (`Dataset`, `DataDefinition`, `BinaryClassification`)
- **Presets**: `DataDriftPreset`, `ClassificationPreset`
- **Metrics**: `DriftedColumnsCount`, `Accuracy`
- Batch monitoring with **per-batch HTML reports** and **simple alerts**

**Created:** 2025-09-15 (UTC)


## 1) Setup

In [6]:

# If needed, install Evidently. (Skip if already installed at 0.7+)
# %pip install -U evidently


## 2) Imports

In [7]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score

import evidently
from evidently import Dataset, DataDefinition, BinaryClassification, Report
from evidently.presets import DataDriftPreset, ClassificationPreset
from evidently.metrics import DriftedColumnsCount, Accuracy

print("evidently.__version__ =", evidently.__version__)


evidently.__version__ = 0.7.14


## 3) Load dataset

In [8]:

ds = load_breast_cancer(as_frame=True)
df = ds.frame.rename(columns={'target':'label'}).copy()
feature_names = [c for c in df.columns if c != 'label']
df.head()


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,label
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


## 4) Train a classifier and define **reference** data

In [9]:

ref_df, prod_pool = train_test_split(df, test_size=0.5, random_state=7, stratify=df['label'])
X_ref, y_ref = ref_df[feature_names], ref_df['label']

clf = Pipeline([('scaler', StandardScaler()), ('lr', LogisticRegression(max_iter=500, random_state=7))])
clf.fit(X_ref, y_ref)

ref_df = ref_df.copy()
ref_df['pred_proba'] = clf.predict_proba(X_ref)[:,1]
ref_df['prediction'] = (ref_df['pred_proba'] >= 0.5).astype(int)

print("Reference size:", ref_df.shape)
print("Reference AUC:", roc_auc_score(y_ref, ref_df['pred_proba']))


Reference size: (284, 33)
Reference AUC: 0.9971380114479542


## 5) Build `Dataset` with `DataDefinition` (new API)

In [10]:

# Define binary classification mapping
data_def = DataDefinition(
    classification=[BinaryClassification(
        target="label",
        prediction_labels="prediction",      # predicted class labels
        prediction_probas="pred_proba",     # predicted probability for positive class
        pos_label=1
    )]
)

# Create Evidently Dataset objects
ref_eval = Dataset.from_pandas(ref_df, data_definition=data_def)
ref_eval


<evidently.core.datasets.PandasDataset at 0x7a4fe95bb6e0>

## 6) Simulate **production batches** with drift

In [11]:

RNG = np.random.default_rng(42)

def induce_feature_shift(df_in: pd.DataFrame, shift_cols, shift_by=0.35, scale=1.2, rng=None):
    rng = rng or np.random.default_rng(0)
    df_out = df_in.copy()
    for c in shift_cols:
        if pd.api.types.is_numeric_dtype(df_out[c]):
            df_out[c] = df_out[c] + rng.normal(loc=shift_by, scale=0.1*scale, size=len(df_out))
    return df_out

def flip_labels(df_in: pd.DataFrame, flip_rate=0.1, rng=None):
    rng = rng or np.random.default_rng(0)
    df_out = df_in.copy()
    if flip_rate > 0:
        m = rng.random(len(df_out)) < flip_rate
        df_out.loc[m, 'label'] = 1 - df_out.loc[m, 'label']
    return df_out

N_BATCHES = 6
batch_size = int(np.ceil(len(prod_pool)/N_BATCHES))
batches = []
start = 0
for i in range(N_BATCHES):
    batch = prod_pool.iloc[start:start+batch_size].copy()
    start += batch_size
    if i >= 2:
        batch = induce_feature_shift(batch, feature_names[:5], rng=RNG)
    if i >= 4:
        batch = flip_labels(batch, 0.10, rng=RNG)
    Xb = batch[feature_names]
    batch['pred_proba'] = clf.predict_proba(Xb)[:,1]
    batch['prediction'] = (batch['pred_proba'] >= 0.5).astype(int)
    batch['batch_id'] = i+1
    batches.append(batch)

len(batches), [len(b) for b in batches]


(6, [48, 48, 48, 48, 48, 45])

## 7) Single-batch reports: Data Drift + Classification Presets

In [12]:

OUTPUT_DIR = Path('evidently_reports_v07')
OUTPUT_DIR.mkdir(exist_ok=True)

batch1 = Dataset.from_pandas(batches[0], data_definition=data_def)

# Data Drift (needs current and reference)
rep_drift = Report([DataDriftPreset()])
drift_eval = rep_drift.run(batch1, ref_eval)
drift_html = OUTPUT_DIR / 'data_drift_batch1_vs_ref.html'
rep_drift.save_html(str(drift_html))
print("Saved:", drift_html.resolve())

# Classification Quality (can be single dataset or compared to ref)
rep_cls = Report([ClassificationPreset()])
cls_eval = rep_cls.run(batch1, ref_eval)
cls_html = OUTPUT_DIR / 'classification_batch1_vs_ref.html'
rep_cls.save_html(str(cls_html))
print("Saved:", cls_html.resolve())


ValueError: Column (batch_id) is partially present in data

## 8) Batch monitoring loop with **metrics** and alerts

In [13]:

records = []
DRIFT_ALERT_THRESHOLD = 0.3
ACCURACY_ALERT_DROP = 0.08

ref_acc = accuracy_score(y_ref, ref_df['prediction'])
ref_auc = roc_auc_score(y_ref, ref_df['pred_proba'])
print(f"Reference Accuracy={ref_acc:.3f}, AUC={ref_auc:.3f}")

for batch_df in batches:
    batch_eval = Dataset.from_pandas(batch_df, data_definition=data_def)

    # Compute share of drifted columns
    rep_share = Report([DriftedColumnsCount()])
    share_eval = rep_share.run(batch_eval, ref_eval)
    share_json = share_eval.json()
    # simple extraction: the result object stores metrics in order; DriftedColumnsCount returns 'share'
    share_drifted = None
    try:
        share_drifted = share_json['metrics'][0]['result']['share']
    except Exception:
        # fallback in case of schema change
        share_drifted = np.nan

    # Compute accuracy on this batch using Evidently metric for consistency
    rep_acc = Report([Accuracy()])
    acc_eval = rep_acc.run(batch_eval, None)
    acc_json = acc_eval.json()
    acc = None
    try:
        acc = acc_json['metrics'][0]['result']['value']
    except Exception:
        # fallback to sklearn
        acc = accuracy_score(batch_df['label'], batch_df['prediction'])

    # Simple alert logic
    alert = False
    reasons = []
    if share_drifted is not None and not np.isnan(share_drifted) and share_drifted >= DRIFT_ALERT_THRESHOLD:
        alert = True
        reasons.append(f"drifted_share={share_drifted:.2f} ≥ {DRIFT_ALERT_THRESHOLD}")
    if acc <= (ref_acc - ACCURACY_ALERT_DROP):
        alert = True
        reasons.append(f"accuracy_drop={ref_acc-acc:.2f} ≥ {ACCURACY_ALERT_DROP}")

    # Save combined report for the batch
    rep_batch = Report([DataDriftPreset(), ClassificationPreset()])
    _ = rep_batch.run(batch_eval, ref_eval)
    out_html = OUTPUT_DIR / f"batch{int(batch_df['batch_id'].iloc[0])}_report.html"
    rep_batch.save_html(str(out_html))

    records.append({
        'batch_id': int(batch_df['batch_id'].iloc[0]),
        'rows': len(batch_df),
        'share_drifted_columns': share_drifted,
        'accuracy': float(acc),
        'alert': alert,
        'reasons': "; ".join(reasons)
    })

monitor_df = pd.DataFrame(records).sort_values('batch_id')
monitor_df


Reference Accuracy=0.989, AUC=0.997


ValueError: Column (batch_id) is partially present in data

## 9) Visualize drift share and accuracy

In [None]:

plt.figure(figsize=(7,4))
plt.plot(monitor_df['batch_id'], monitor_df['share_drifted_columns'], marker='o')
plt.axhline(0.3, linestyle='--')
plt.title('Share of Drifted Columns over Batches')
plt.xlabel('Batch ID'); plt.ylabel('Share drifted'); plt.grid(True)

plt.figure(figsize=(7,4))
plt.plot(monitor_df['batch_id'], monitor_df['accuracy'], marker='o')
plt.axhline(monitor_df['accuracy'].iloc[0]-0.08, linestyle='--')
plt.title('Accuracy over Batches')
plt.xlabel('Batch ID'); plt.ylabel('Accuracy'); plt.grid(True)

monitor_df


## 10) Exercises


1. Change drift method/thresholds, e.g.:
   ```python
   Report([DataDriftPreset(method="psi", drift_share=0.7)])
   ```
2. Add `include_tests=True` to the `Report(...)` constructor to enable pass/fail checks in the same HTML.
3. Adjust the decision threshold for classification (e.g., 0.4) and observe the impact on alerts.
4. Replace the dataset with your own production logs and re-map with `DataDefinition`.
5. Export `monitor_df` to CSV and connect to a dashboard.
