In [145]:
import pandas as pd
import numpy as np
import json
import os
from datetime import datetime
from evidently import Dataset, DataDefinition
from evidently import Report
from evidently.metrics import *
from evidently.presets import *
from pathlib import Path
from evidently.presets import DataDriftPreset, DataSummaryPreset

In [147]:
# Locations of the API log file and of the reference datasets.
LOG_PATH = "logs/api_logger.log"
INPUT_DATA_PATH = "data/input_reference.csv"
OUTPUT_DATA_PATH = "data/output_reference.csv"

# The directory named by REPORTS_DIR is created up front; no error if it already exists.
REPORTS_DIR = "reports"
Path(REPORTS_DIR).mkdir(parents=True, exist_ok=True)

In [149]:
# -----------------------------------------------------------------
# Load the API logs
# -----------------------------------------------------------------

def load_api_logs(log_path=None):
    """Parse the structured (JSON) entries of the API log file into a DataFrame.

    Every line is stripped and split on " - " at most twice; the last piece is
    decoded as JSON. Lines whose payload is not valid JSON, or whose JSON value
    is not an object (dict), are skipped.

    Parameters
    ----------
    log_path : str or path-like, optional
        File to read. When omitted, the module-level ``LOG_PATH`` is used.

    Returns
    -------
    pandas.DataFrame
        One row per JSON object found, in file order. Empty when no line
        carries a JSON object.

    Raises
    ------
    OSError
        If the log file cannot be opened (e.g. it does not exist).
    """
    if log_path is None:
        log_path = LOG_PATH

    entries = []
    with open(log_path, "r") as f:
        for line in f:
            # Keep only what follows the (up to two) " - " separated prefixes.
            payload = line.strip().split(" - ", 2)[-1]
            try:
                record = json.loads(payload)
            except ValueError:
                # json.JSONDecodeError is a ValueError: this is an unstructured
                # line, skip it. Anything else (e.g. KeyboardInterrupt)
                # propagates instead of being silently swallowed.
                continue
            if isinstance(record, dict):
                entries.append(record)
    return pd.DataFrame(entries)

# Parse the log file once; every later cell works from this DataFrame.
logs_df = load_api_logs()

if logs_df.empty:
    print("Aucun log structuré trouvé dans", LOG_PATH)
    # `exit()` is a convenience the `site` module adds for interactive shells
    # and is not meant for programs; raise SystemExit explicitly so stopping
    # here does not depend on that helper.
    raise SystemExit

In [151]:
# -----------------------------------------------------------------
# Build the input and output DataFrames from the prediction logs
# -----------------------------------------------------------------
pred_logs = logs_df.loc[logs_df["event"].eq("prediction")].copy()

# Features: flatten the logged "input_data" payloads into columns,
# re-indexed from 0.
input_df = pd.json_normalize(pred_logs["input_data"]).reset_index(drop=True)

# Outputs: encode the logged prediction label as a TARGET column
# (labels absent from the mapping become NaN), re-indexed from 0.
output_df = (
    pred_logs["prediction"]
    .map({"Solvable": 0, "Défaillant": 1})
    .to_frame(name="TARGET")
    .reset_index(drop=True)
)

# -----------------------------------------------------------------
# Load the reference (training) data
# -----------------------------------------------------------------

input_reference, output_reference = (
    pd.read_csv(csv_path) for csv_path in (INPUT_DATA_PATH, OUTPUT_DATA_PATH)
)

In [153]:
# Reference set: the reference features with the reference TARGET column
# attached (pandas aligns the two on their index).
reference_data = input_reference.assign(TARGET=output_reference["TARGET"])

# Current set: the features and encoded predictions extracted from the logs.
new_data = input_df.assign(TARGET=output_df["TARGET"])

In [155]:
# Build an Evidently report containing only the data-drift preset.
report = Report([DataDriftPreset()])
# Run it on the logged data together with the reference data.
# NOTE(review): argument order is assumed to be (current, reference) — confirm
# against the installed Evidently version's Report.run signature.
my_eval = report.run(new_data, reference_data)

In [156]:
# Save the report as a temporary HTML file (relative to the working directory).
html_path = Path("drift_report_temp.html")
my_eval.save_html(os.fspath(html_path))

In [159]:
# -----------------------------------------------------------------
# Operational analysis: error rate, latency, anomalies
# -----------------------------------------------------------------

http_logs = logs_df.loc[logs_df["event"].eq("http_request")].copy()

if not http_logs.empty:
    # Derived columns are evaluated in order, so "date" is computed from the
    # already-parsed "timestamp".
    http_logs = http_logs.assign(
        timestamp=lambda frame: pd.to_datetime(frame["timestamp"]),
        date=lambda frame: frame["timestamp"].dt.date,
        duration_ms=lambda frame: frame["duration"] * 1000,
    )
else:
    print("Aucun log http_request trouvé dans", LOG_PATH)

In [161]:
# Preview the first five HTTP request log rows.
http_logs.head(n=5)

Unnamed: 0,timestamp,request_id,method,path,status_code,duration,client_ip,event,input_data,prediction,probabilité_defaut,probabilite_defaut,date,duration_ms
0,2025-10-21 13:07:41.144901,f3c6735d-115c-4a83-82a3-5989b736c89a,GET,/,200.0,0.002045,127.0.0.1,http_request,,,,,2025-10-21,2.045393
1,2025-10-21 13:07:53.098150,e0ac2cdc-0c46-4744-ae11-542795165d01,GET,/docs,200.0,0.000524,127.0.0.1,http_request,,,,,2025-10-21,0.524044
2,2025-10-21 13:07:53.246511,5d04a28f-ceed-4307-8032-748b4b7d56f0,GET,/openapi.json,200.0,0.016629,127.0.0.1,http_request,,,,,2025-10-21,16.628981
3,2025-10-21 13:08:00.453826,5930ae28-54eb-43bd-8667-776589cdcbe3,GET,/,200.0,0.000885,127.0.0.1,http_request,,,,,2025-10-21,0.88501
5,2025-10-21 13:08:23.865180,3e47a1ff-f510-41d9-a4ab-25bcb9a26c7d,POST,/predict,200.0,0.039597,127.0.0.1,http_request,,,,,2025-10-21,39.597034


In [163]:
# --- Global metrics ---
total_requests = http_logs.shape[0]
# Share of requests whose status code is 400 or above, as a percentage.
error_rate = 100 * http_logs["status_code"].ge(400).mean()
avg_latency = http_logs["duration_ms"].mean()
p95_latency = http_logs["duration_ms"].quantile(q=0.95)

print(
    "===== Métriques globales =====",
    f"Total requêtes : {total_requests}",
    f"Taux d’erreur : {error_rate:.2f}%",
    f"Latence moyenne : {avg_latency:.2f} ms",
    f"Latence 95e percentile : {p95_latency:.2f} ms",
    sep="\n",
)

===== Métriques globales =====
Total requêtes : 338
Taux d’erreur : 7.69%
Latence moyenne : 10.31 ms
Latence 95e percentile : 27.62 ms


In [165]:
# --- Error rate per endpoint ---
def _error_rate_pct(status_codes):
    """Return the percentage of entries in ``status_codes`` that are >= 400."""
    return (status_codes >= 400).mean() * 100


error_by_path = (
    http_logs.groupby("path")["status_code"]
    .apply(_error_rate_pct)
    .reset_index(name="error_rate_%")
)

print("\n===== Taux d’erreur par endpoint =====")
print(error_by_path)

# --- Abnormal latency detection ---
# A request is flagged when its latency exceeds the mean by more than
# three standard deviations.
mean_dur = http_logs["duration_ms"].mean()
std_dur = http_logs["duration_ms"].std()

http_logs["latency_anomaly"] = http_logs["duration_ms"].gt(mean_dur + 3 * std_dur)
anomalies = http_logs.loc[http_logs["latency_anomaly"]]

if anomalies.empty:
    print("\n Aucune anomalie de latence détectée.")
else:
    print("\n===== Anomalies de latence détectées =====")
    print(anomalies[["timestamp", "method", "path", "duration_ms", "status_code"]])


===== Taux d’erreur par endpoint =====
            path  error_rate_%
0              /      0.000000
1          /docs      0.000000
2   /favicon.ico     13.333333
3          /logs      0.000000
4  /openapi.json      0.000000
5       /predict     13.636364

===== Anomalies de latence détectées =====
                     timestamp method      path  duration_ms  status_code
16  2025-10-23 17:48:03.493141   POST  /predict    60.519934        200.0
296 2025-10-31 14:18:54.387511   POST  /predict    47.524214        200.0
333 2025-10-31 14:40:11.550318   POST  /predict    63.124895        200.0
