# Drift Detector Benchmark Evaluation

**Purpose**: Unified evaluation notebook for both real-world and synthetic datasets

**Modes**:
- **Standard Mode**: Real-world datasets with ACCURACY and RUNTIME metrics (REQLABELS is optional and disabled by default; enable it via `metric3` in the configuration cell)
- **MTR Mode**: Synthetic datasets with RUNTIME and MTR (Mean Time Ratio) metrics

**Last Updated**: 2025-10-01

---

## 1. Setup and Configuration

In [None]:
# Standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from multiprocessing import Pool
import multiprocessing

# Shared configuration
from eval_config import (
    REAL_DATASETS,
    SYNTHETIC_DATASETS,
    ALL_DETECTORS,
    DETECTOR_COLORS,
    SINGLE_VARIATE_DETECTORS,
    BASE_PATH,
    setup_plot_style
)

# Setup plotting.
# `setup_plot_style` comes from the shared `eval_config` module; presumably it
# applies the project-wide matplotlib/seaborn styling — TODO confirm there.
setup_plot_style()
# IPython magic (only valid inside a notebook/IPython session): render
# matplotlib figures inline in the cell output.
%matplotlib inline

In [None]:
# ============================================================================
# CONFIGURATION: Choose evaluation mode
# ============================================================================

# False -> standard evaluation on the real-world datasets
# True  -> MTR evaluation on the synthetic datasets
MTR_MODE = False  # Change to True for MTR evaluation

# Mode-dependent settings: dataset list, metric names and a display label.
if not MTR_MODE:
    # Standard Mode: real-world datasets
    datasets = REAL_DATASETS
    metric1, metric2 = "ACCURACY", "RUNTIME"
    metric3 = ""  # Optional third metric, e.g. "REQLABELS"
    mode_name = "Standard (Accuracy/Runtime)"
else:
    # MTR Mode: synthetic datasets
    # (original note lists ["WaveformPre", "SineClustersPre"] — confirm in eval_config)
    datasets = SYNTHETIC_DATASETS
    metric1, metric2 = "RUNTIME", "MTR"
    metric3 = ""
    mode_name = "MTR (Mean Time Ratio)"

# Settings shared by both modes
detectors = ALL_DETECTORS
base_path = BASE_PATH
classifier = "HoeffdingTreeClassifier"

# Echo the active configuration so the notebook output documents the run.
metric_names = [metric1, metric2]
if metric3:
    metric_names.append(metric3)

print(f"=== Evaluation Mode: {mode_name} ===")
print(f"Datasets: {datasets}")
print(f"Detectors: {len(detectors)} configured")
print(f"Metrics: {', '.join(metric_names)}")
print(f"Base Path: {base_path}")

## 2. Data Loading and Experiment Status

In [None]:
def _count_non_timeout_logs(exp_path, run_numbers, max_runs=5,
                            max_jobs_per_run=50, max_logs=200):
    """
    Sample job logs of the most recent runs and count the non-timeout ones.

    Args:
        exp_path: Directory of one detector/dataset experiment
        run_numbers: Numeric run directory names found under ``exp_path``
        max_runs: Number of most recent runs to inspect
        max_jobs_per_run: Maximum number of job directories inspected per run
        max_logs: Stop inspecting further runs once this many logs were read

    Returns:
        Tuple of (not_in_timeout, total_checked)
    """
    not_in_timeout = 0
    total_checked = 0

    for nbr_run in sorted(run_numbers, reverse=True)[:max_runs]:
        single_runs_path = os.path.join(exp_path, str(nbr_run), "single_runs")
        if not os.path.isdir(single_runs_path):
            continue

        single_run_jobs = [
            name for name in os.listdir(single_runs_path)
            if os.path.isdir(os.path.join(single_runs_path, name))
        ]

        for job in single_run_jobs[:max_jobs_per_run]:
            log_file = os.path.join(single_runs_path, job, f"{job}_0_log.out")
            if not os.path.exists(log_file):
                continue

            total_checked += 1
            # Scheduler logs may contain arbitrary bytes; replace undecodable
            # ones instead of aborting the whole analysis on a decode error.
            with open(log_file, "r", encoding="utf-8", errors="replace") as f:
                log_content = f.read()

            # A job counts as "not a timeout" when the time-limit marker is
            # absent, or when the log also reports an out-of-memory kill.
            if "DUE TO TIME LIMIT" not in log_content or "oom_kill" in log_content:
                not_in_timeout += 1

        if total_checked >= max_logs:
            break

    return not_in_timeout, total_checked


def process_detector_dataset(dd, datasets, base_path, metric1, metric2, metric3,
                             mtr_mode=False, classifier_name=None):
    """
    Process a detector across multiple datasets and collect status information.

    For every dataset the latest numbered run directory is located, its
    ``results.csv`` is loaded, and the trials are counted. Experiments with
    many evaluations but very few completions are additionally checked for
    jobs that hit the time limit; a warning is printed when they dominate.

    Args:
        dd: Drift detector name
        datasets: List of dataset names
        base_path: Prefix of the experiment directories; it is concatenated
            directly with the experiment name, so it must already end with a
            path separator if it denotes a directory
        metric1, metric2, metric3: Metric names (``metric3`` may be empty)
        mtr_mode: Currently unused; accepted so existing callers keep working
        classifier_name: Classifier name used in the experiment directory
            name. When None, the notebook-level ``classifier`` variable is
            used, which preserves the previous behaviour.

    Returns:
        Tuple of (detector_name, status_dict)
        status_dict: {dataset: [evaluated_count, completed_count]}
        Datasets whose results are missing or unreadable keep [0, 0].
    """
    if classifier_name is None:
        # Backward-compatible fallback to the notebook-level setting.
        classifier_name = classifier

    # The metric part of the directory name is the same for every dataset.
    metric_suffix = f"{metric1}-{metric2}"
    if metric3:
        metric_suffix += f"-{metric3}"

    dd_dataset_status = {}

    for dataset in datasets:
        dd_dataset_status[dataset] = [0, 0]

        # Construct experiment path
        exp_path = f"{base_path}{dd}_{dataset}_{classifier_name}_{metric_suffix}"

        if not os.path.exists(exp_path):
            print(f"Experiment not found: {dd}, {dataset}")
            continue

        # Find latest run (run directories are named by their run number)
        try:
            nbr_runs = [int(x) for x in os.listdir(exp_path) if x.isdigit()]
        except (OSError, ValueError) as e:
            print(f"Error finding runs for {dd}, {dataset}: {e}")
            continue

        if not nbr_runs:
            continue

        run_path = os.path.join(exp_path, str(max(nbr_runs)), "results.csv")
        if not os.path.exists(run_path):
            continue

        # Best effort: a broken results file must not abort the other datasets.
        try:
            df = pd.read_csv(run_path)

            # Abandoned trials are ignored entirely
            df = df[df["trial_status"] != "ABANDONED"]
            evaluated = len(df)
            completed = int((df["trial_status"] == "COMPLETED").sum())

            dd_dataset_status[dataset][0] = evaluated
            dd_dataset_status[dataset][1] = completed

            # Many evaluations but few completions hints at timed-out jobs
            if evaluated > 500 and completed < 0.1 * evaluated and completed < 500:
                not_in_timeout, total_checked = _count_non_timeout_logs(
                    exp_path, nbr_runs
                )

                # If >80% are timeouts, flag it
                if total_checked > 0 and not_in_timeout / total_checked < 0.2:
                    print(f"⚠️  Timeout job detected: {dd} on {dataset} "
                          f"({not_in_timeout}/{total_checked} non-timeout)")

        except Exception as e:
            print(f"Error processing {dd} on {dataset}: {e}")

    return dd, dd_dataset_status

In [None]:
# Collect the experiment status of every detector, one worker task per detector.
n_workers = multiprocessing.cpu_count()

print("\n=== Counting Experiments ===")
print(f"Using {n_workers} CPUs for parallel processing\n")

# One argument tuple per detector; all other settings are shared.
job_args = [
    (dd, datasets, base_path, metric1, metric2, metric3, MTR_MODE)
    for dd in detectors
]

with Pool(n_workers) as pool:
    results = pool.starmap(process_detector_dataset, job_args)

# Each result is a (detector_name, status_dict) pair.
detector_status = dict(results)

print("\n=== Experiment Status Summary ===")
print(detector_status)

## 3. Summary Statistics

In [None]:
# Calculate summary statistics
def _percent(part, whole):
    """Return `part` as a percentage of `whole`; 0.0 when `whole` is zero."""
    return 100 * part / whole if whole else 0.0


total_combinations = len(detectors) * len(datasets)

# Flatten to one [evaluated, completed] pair per detector-dataset combination.
all_counts = [
    counts
    for status in detector_status.values()
    for counts in status.values()
]
total_evaluated = sum(1 for counts in all_counts if counts[0] > 0)    # Has evaluations
total_completed = sum(1 for counts in all_counts if counts[1] >= 500)  # Has sufficient completions

print("\n=== Overall Statistics ===")
print(f"Total detector-dataset combinations: {total_combinations}")
print(f"Combinations with evaluations: {total_evaluated} "
      f"({_percent(total_evaluated, total_combinations):.1f}%)")
print(f"Combinations with ≥500 completions: {total_completed} "
      f"({_percent(total_completed, total_combinations):.1f}%)")

# Per-detector summary
print("\n=== Per-Detector Summary ===")
detector_summary = []

for dd in detectors:
    status = detector_status.get(dd, {})
    datasets_evaluated = sum(1 for counts in status.values() if counts[0] > 0)
    datasets_completed = sum(1 for counts in status.values() if counts[1] >= 500)

    detector_summary.append({
        'Detector': dd,
        'Datasets Evaluated': datasets_evaluated,
        'Datasets Completed': datasets_completed,
        'Completion Rate': f"{100*datasets_completed/len(datasets):.1f}%" if len(datasets) > 0 else "0%"
    })

# Explicit columns keep the sort below valid even when there are no detectors.
summary_df = pd.DataFrame(
    detector_summary,
    columns=['Detector', 'Datasets Evaluated', 'Datasets Completed', 'Completion Rate']
)
summary_df = summary_df.sort_values('Datasets Completed', ascending=False)
print(summary_df.to_string(index=False))

## 4. Visualization: Experiment Completion Status

In [None]:
# Build a (detector x dataset) matrix holding the completed-trial counts.
# Combinations without any recorded status count as 0.
completion_matrix = np.array(
    [
        detector_status.get(dd, {}).get(dataset, [0, 0])[1]
        for dd in detectors
        for dataset in datasets
    ],
    dtype=float,
).reshape(len(detectors), len(datasets))

# Draw the annotated heatmap: rows are detectors, columns are datasets.
heatmap_options = {
    'xticklabels': datasets,
    'yticklabels': detectors,
    'annot': True,
    'fmt': '.0f',
    'cmap': 'YlGnBu',
    'cbar_kws': {'label': 'Completed Experiments'},
}
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(completion_matrix, ax=ax, **heatmap_options)

ax.set_title(f'Experiment Completion Status - {mode_name}', fontsize=14, pad=20)
ax.set_xlabel('Dataset', fontsize=12)
ax.set_ylabel('Drift Detector', fontsize=12)

# Slanted dataset names stay readable; detector names remain horizontal.
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

## 5. Threshold Analysis

Analyze how many detector-dataset combinations meet different completion thresholds.

In [None]:
# Analyze completion at different thresholds: 1, 50, 100, ..., 500
# (1 replaces 0 because "at least 0 completions" is trivially true).
thresholds = [1] + list(range(50, 501, 50))

# The completed count of each detector-dataset combination does not depend on
# the threshold, so look it up once instead of once per threshold.
completed_counts = [
    detector_status.get(dd, {}).get(dataset, [0, 0])[1]
    for dd in detectors
    for dataset in datasets
]
threshold_counts = [
    sum(1 for completed in completed_counts if completed >= threshold)
    for threshold in thresholds
]

# Plot threshold analysis
fig, ax = plt.subplots(figsize=(12, 6))
bars = ax.bar(range(len(thresholds)), threshold_counts, color='steelblue', alpha=0.7)

# Add value labels on bars
for bar, count in zip(bars, threshold_counts):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{count}',
            ha='center', va='bottom', fontsize=9)

ax.set_xlabel('Minimum Completed Experiments', fontsize=12)
ax.set_ylabel('Number of Detector-Dataset Combinations', fontsize=12)
ax.set_title(f'Completion Threshold Analysis - {mode_name}', fontsize=14, pad=20)
ax.set_xticks(range(len(thresholds)))
ax.set_xticklabels([str(t) for t in thresholds], rotation=45)
ax.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

print("\n=== Threshold Analysis ===")
for threshold, count in zip(thresholds, threshold_counts):
    # Guard against an empty detector or dataset list (zero combinations).
    share = 100 * count / total_combinations if total_combinations else 0.0
    print(f"≥{threshold:3d} completions: {count:3d}/{total_combinations} "
          f"({share:5.1f}%)")

## 6. Detailed Status by Dataset

In [None]:
# Print one table per dataset listing every detector that has evaluations,
# ordered by the number of completed trials (highest first).
separator = '=' * 60

for dataset in datasets:
    print(f"\n{separator}")
    print(f"Dataset: {dataset}")
    print(f"{separator}")

    rows = []
    for dd in detectors:
        evaluated, completed = detector_status.get(dd, {}).get(dataset, [0, 0])
        if not evaluated > 0:
            # Nothing evaluated for this detector on this dataset; skip it.
            continue

        rate = 100 * completed / evaluated
        rows.append({
            'Detector': dd,
            'Evaluated': evaluated,
            'Completed': completed,
            'Rate': f"{rate:.1f}%",
        })

    if not rows:
        print("No experiments found for this dataset.")
        continue

    table = pd.DataFrame(rows).sort_values('Completed', ascending=False)
    print(table.to_string(index=False))

## 7. Identify Missing or Incomplete Experiments

In [None]:
# Split the detector-dataset combinations that need attention into
#   - missing:    no evaluated trials at all
#   - incomplete: some evaluated trials but fewer than 500 completed
missing_experiments = []
incomplete_experiments = []
preview_limit = 20  # maximum number of entries printed per category

for dd in detectors:
    for dataset in datasets:
        evaluated, completed = detector_status.get(dd, {}).get(dataset, [0, 0])

        if evaluated == 0:
            missing_experiments.append((dd, dataset))
            continue

        if evaluated > 0 and completed < 500:
            incomplete_experiments.append((dd, dataset, evaluated, completed))

n_missing = len(missing_experiments)
n_incomplete = len(incomplete_experiments)

print("\n=== Missing Experiments (Not Started) ===")
print(f"Total: {n_missing}")
if not missing_experiments:
    print("  None! All experiments have been started.")
else:
    for dd, dataset in missing_experiments[:preview_limit]:  # Show first 20
        print(f"  - {dd} on {dataset}")
    if n_missing > preview_limit:
        print(f"  ... and {n_missing - preview_limit} more")

print("\n=== Incomplete Experiments (<500 completions) ===")
print(f"Total: {n_incomplete}")
if not incomplete_experiments:
    print("  None! All started experiments have ≥500 completions.")
else:
    # Least complete experiments first, since they need the most work.
    incomplete_df = pd.DataFrame(
        incomplete_experiments,
        columns=['Detector', 'Dataset', 'Evaluated', 'Completed']
    ).sort_values('Completed', ascending=True)
    print(incomplete_df.head(preview_limit).to_string(index=False))
    if n_incomplete > preview_limit:
        print(f"\n  ... and {n_incomplete - preview_limit} more")

## 8. Summary and Conclusions

In [None]:
# Final report: configuration, overall results and suggested next steps.

# Preview at most three dataset names; only add an ellipsis when names were
# actually left out.
dataset_preview = ', '.join(datasets[:3])
if len(datasets) > 3:
    dataset_preview += "..."

# Guard the percentages against an empty detector or dataset list.
pct_evaluated = 100 * total_evaluated / total_combinations if total_combinations else 0.0
pct_completed = 100 * total_completed / total_combinations if total_combinations else 0.0

print(f"\n{'='*60}")
print(f"EVALUATION SUMMARY - {mode_name}")
print(f"{'='*60}")
print("\nConfiguration:")
print(f"  - Mode: {mode_name}")
print(f"  - Datasets: {len(datasets)} ({dataset_preview})")
print(f"  - Detectors: {len(detectors)}")
print(f"  - Metrics: {metric1}, {metric2}" + (f", {metric3}" if metric3 else ""))
print("\nResults:")
print(f"  - Total combinations: {total_combinations}")
print(f"  - With evaluations: {total_evaluated} ({pct_evaluated:.1f}%)")
print(f"  - With ≥500 completions: {total_completed} ({pct_completed:.1f}%)")
print(f"  - Missing (not started): {len(missing_experiments)}")
print(f"  - Incomplete (<500): {len(incomplete_experiments)}")
print("\nNext Steps:")

# Collect the applicable steps first so they are numbered consecutively
# (a lone "continue" step is shown as step 1, not step 2).
next_steps = []
if missing_experiments:
    next_steps.append(f"Start {len(missing_experiments)} missing experiments")
if incomplete_experiments:
    next_steps.append(f"Continue {len(incomplete_experiments)} incomplete experiments")

if next_steps:
    for step_number, step in enumerate(next_steps, start=1):
        print(f"  {step_number}. {step}")
else:
    print("  ✓ All experiments complete! Ready for analysis.")
print(f"\n{'='*60}")