In [35]:

import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datetime import datetime
import os
import joblib

# Evidently imports
from evidently import ColumnMapping
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset, ClassificationPreset

In [36]:

# Load the monitoring inputs: the pickled dataframe, its embedding matrix,
# the trained classifier and the label encoder persisted next to it.
# NOTE(security): pd.read_pickle and joblib.load both unpickle, which can
# execute arbitrary code -- only load artifacts from a trusted source.
# NOTE(review): the recorded warnings show the model/encoder were pickled with
# scikit-learn 1.7.2 but loaded under 1.4.1.post1; align the environment with
# the training one before trusting the predictions.
data = pd.read_pickle('data_with_emb.pkl')
embeddings = np.load('../outputs/embeddings.npy')
model = joblib.load("../models/model.joblib")
le = joblib.load('../models/encoder.joblib')

# Later cells pair row i of `embeddings` with row i of `data`, so a length
# mismatch means the two files are out of sync; fail here with a clear message.
if len(data) != embeddings.shape[0]:
    raise ValueError(
        f"data has {len(data)} rows but embeddings has {embeddings.shape[0]}; "
        "the dataframe and the embedding matrix must be row-aligned"
    )

print(embeddings.shape)

(20000, 384)



Trying to unpickle estimator SVC from version 1.7.2 when using version 1.4.1.post1. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Trying to unpickle estimator LabelEncoder from version 1.7.2 when using version 1.4.1.post1. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations



In [37]:
# Encode the labels with the encoder exactly as it was persisted. Refitting it
# here (fit_transform) would rebuild the label -> integer mapping from the
# monitoring data, which can silently differ from the mapping used at training
# time. transform() keeps the stored mapping and raises on unseen labels.
y_encoded = le.transform(data['type'])

### Préparer les données pour le monitoring

In [38]:

# Build the monitoring frame in one pass: a copy of the raw data extended with
# per-row summary statistics of the embedding vectors and the encoded target.
df_monitoring = data.assign(
    emb_mean=embeddings.mean(axis=1),
    emb_std=embeddings.std(axis=1),
    emb_min=embeddings.min(axis=1),
    emb_max=embeddings.max(axis=1),
    target=y_encoded,
)

In [39]:

# Stratified split: 70% of the rows act as the reference set and the
# remaining 30% as the "current" data to compare against it.
split_options = {
    'test_size': 0.3,
    'random_state': 42,
    'stratify': df_monitoring['target'],
}
reference_data, current_data = train_test_split(df_monitoring, **split_options)

In [40]:

def add_model_outputs(frame, frame_embeddings, classifier, class_names):
    """Return a copy of ``frame`` with the classifier's outputs appended.

    Adds a ``prediction`` column (``classifier.predict``) and one
    ``proba_<class>`` column per entry of ``class_names``, filled from the
    matching column of ``classifier.predict_proba``.

    ``frame_embeddings`` must have one row per row of ``frame``, in the same
    order. Column i of ``predict_proba`` is labelled with ``class_names[i]``
    (assumes the classifier's class order matches ``class_names`` -- TODO confirm).
    """
    scored = frame.copy()
    scored['prediction'] = classifier.predict(frame_embeddings)

    probabilities = classifier.predict_proba(frame_embeddings)
    for position, class_name in enumerate(class_names):
        scored[f'proba_{class_name}'] = probabilities[:, position]
    return scored


# The split frames carry index *labels* from df_monitoring, while `embeddings`
# is a plain array addressed by row *position*. Translate labels to positions
# explicitly so the lookup stays correct even when the dataframe index is not
# a default 0..n-1 range (requires a unique index).
ref_positions = df_monitoring.index.get_indexer(reference_data.index)
curr_positions = df_monitoring.index.get_indexer(current_data.index)

ref_embeddings = embeddings[ref_positions]
curr_embeddings = embeddings[curr_positions]

print(f"   ✓ Embeddings référence : {ref_embeddings.shape}")
print(f"   ✓ Embeddings actuels : {curr_embeddings.shape}")

# Predictions and class probabilities for the reference set, then the current set
reference_data = add_model_outputs(reference_data, ref_embeddings, model, le.classes_)
current_data = add_model_outputs(current_data, curr_embeddings, model, le.classes_)

   ✓ Embeddings référence : (14000, 384)
   ✓ Embeddings actuels : (6000, 384)


In [41]:

# Fixed columns to monitor: labels, predictions and embedding statistics
base_columns = [
    'target',
    'prediction',
    'emb_mean',
    'emb_std',
    'emb_min',
    'emb_max',
]

# Every per-class probability column produced by the scoring step
proba_cols = [name for name in reference_data.columns if name.startswith('proba_')]
columns_for_monitoring = base_columns + proba_cols

# Trimmed copies holding only the monitored columns
ref_clean = reference_data.loc[:, columns_for_monitoring].copy()
curr_clean = current_data.loc[:, columns_for_monitoring].copy()

In [42]:

# Tell Evidently which columns hold the label, the model output and the
# numerical features (the embedding summary statistics).
column_mapping = ColumnMapping(
    target='target',
    prediction='prediction',
    numerical_features=['emb_mean', 'emb_std', 'emb_min', 'emb_max'],
)

print("   ✓ Mapping configuré")

   ✓ Mapping configuré


In [43]:

# Report made of the data-drift and classification presets
monitoring_metrics = [DataDriftPreset(), ClassificationPreset()]
report = Report(metrics=monitoring_metrics)

# Run the report: the current data is compared against the reference set
report.run(
    reference_data=ref_clean,
    current_data=curr_clean,
    column_mapping=column_mapping,
)

In [None]:

# Save the report as a timestamped HTML file so successive runs do not
# overwrite each other.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
report_path = f'../reports/evidently/monitoring_report_{timestamp}.html'

# The output directory is not guaranteed to exist on a fresh checkout;
# create it first so save_html does not fail on a missing folder.
os.makedirs(os.path.dirname(report_path), exist_ok=True)

report.save_html(report_path)

In [None]:

# Persist the cleaned reference set as the baseline for later comparisons.
baseline_path = '../reports/baseline_reference.csv'

# Create the target directory if needed so to_csv does not fail on a fresh checkout.
os.makedirs(os.path.dirname(baseline_path), exist_ok=True)

ref_clean.to_csv(baseline_path, index=False)