# Phase 4: Results Analysis

This notebook analyzes and compares evaluation results from centralized and federated PatchCore models on the AutoVI dataset.

## Contents
1. Load Evaluation Results
2. Per-Object Performance Analysis
3. Method Comparison
4. Statistical Significance Testing
5. Visualizations
6. Per-Defect Type Analysis
7. Image-Level Classification Results
8. Key Findings & Conclusions

In [None]:
import json
import os
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats

# Add project root to path so the `src` package imports below resolve.
# NOTE(review): this takes the parent of the current working directory, so it
# presumably expects the kernel to be started from a folder directly under the
# project root (e.g. notebooks/) — confirm before running from elsewhere.
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root))

from src.data.autovi_dataset import CATEGORIES
from src.evaluation.visualization import (
    plot_fpr_spro_curves,
    plot_comparison_bar_chart,
    plot_performance_heatmap,
    plot_box_comparison,
    compute_statistical_analysis,
    create_comparison_table,
)

# Set plot style: one stylesheet plus two overrides applied in a single update
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 11,
})

# Echo the resolved root and category list so a wrong working directory is
# visible right away
print(f"Project root: {project_root}")
print(f"Categories: {CATEGORIES}")

## 1. Load Evaluation Results

In [None]:
# Configuration: everything a reader might want to tune lives in this cell
METRICS_DIR = project_root.joinpath("outputs", "evaluation", "metrics")
METHODS = ["centralized", "federated_iid", "federated_category"]
FPR_LIMIT = 0.05  # Default FPR limit for analysis

# Echo the active settings
print(f"Metrics directory: {METRICS_DIR}")
print(f"Methods to analyze: {METHODS}")
print(f"FPR limit: {FPR_LIMIT}")

In [None]:
def load_metrics(metrics_dir, methods, categories=None):
    """Load per-object evaluation metrics from JSON files.

    Reads ``<metrics_dir>/<method>/<object>/metrics.json`` for every requested
    method and object category.

    Args:
        metrics_dir: Root directory holding one sub-directory per method.
            Accepts a ``str`` or any path-like object.
        methods: Iterable of method names (sub-directory names) to load.
        categories: Object names to look for under each method directory.
            Defaults to the module-level ``CATEGORIES`` list.

    Returns:
        Dict mapping method name -> {object name -> parsed metrics JSON}.
        A method is left out when its directory is missing (a warning is
        printed) or when none of its objects has a metrics file; objects
        without a metrics file are skipped.

    Raises:
        json.JSONDecodeError: If a metrics file exists but is not valid JSON.
    """
    # Coerce so that plain string paths work with the `/` operator below
    metrics_dir = Path(metrics_dir)
    if categories is None:
        categories = CATEGORIES
    results = {}

    for method in methods:
        method_dir = metrics_dir / method
        if not method_dir.exists():
            print(f"Warning: {method_dir} not found")
            continue

        method_results = {}
        for obj_name in categories:
            metrics_path = method_dir / obj_name / "metrics.json"
            if metrics_path.exists():
                # Read as UTF-8 explicitly rather than relying on the
                # platform's locale default encoding
                with open(metrics_path, encoding="utf-8") as f:
                    method_results[obj_name] = json.load(f)

        # Only report methods that contributed at least one object
        if method_results:
            results[method] = method_results
            print(f"Loaded {method}: {len(method_results)} objects")

    return results

# Read whatever metrics are present on disk for the configured methods
results = load_metrics(metrics_dir=METRICS_DIR, methods=METHODS)
print(f"\nLoaded results for {len(results)} methods")

## 2. Per-Object Performance Analysis

In [None]:
# Create comparison DataFrame from the loaded results at the configured FPR limit.
# NOTE(review): the table layout is decided by create_comparison_table in
# src.evaluation.visualization — presumably objects x methods; confirm there.
df = create_comparison_table(results, FPR_LIMIT)
print(f"\nComparison Table (AUC-sPRO @ FPR={FPR_LIMIT})")
# Last expression of the cell: the styled table, formatted with precision=4
df.style.format(precision=4)

In [None]:
# Extract values for analysis
def extract_auc_spro(results, method, fpr_limit=0.05):
    """Collect the AUC-sPRO score of every object for one method.

    Args:
        results: Nested dict ``{method: {object: metrics}}``.
        method: Method whose objects are inspected; an unknown method yields
            an empty result.
        fpr_limit: FPR limit to look up. Its ``str()`` form is used as the key
            inside each object's ``localization -> auc_spro`` mapping.

    Returns:
        Dict mapping object name -> score. Objects that carry an ``"error"``
        key, or that have no (or a ``None``) score at this limit, are omitted.
    """
    lookup_key = str(fpr_limit)
    scores = {}
    for name, entry in results.get(method, {}).items():
        # Entries flagged with an error are not scored
        if "error" in entry:
            continue
        score = entry.get("localization", {}).get("auc_spro", {}).get(lookup_key)
        if score is None:
            continue
        scores[name] = score
    return scores

# Per-method score tables, restricted to methods that were actually loaded
method_values = {}
for method in METHODS:
    if method not in results:
        continue
    method_values[method] = extract_auc_spro(results, method, FPR_LIMIT)

# List the scores object by object, in alphabetical object order
for method, values in method_values.items():
    print(f"\n{method}:")
    for obj in sorted(values):
        val = values[obj]
        print(f"  {obj}: {val:.4f}")

## 3. Method Comparison

In [None]:
# Compute aggregate statistics
# The header reports the configured FPR limit instead of a hard-coded 0.05,
# so it stays correct when FPR_LIMIT is changed in the configuration cell.
print(f"Aggregate Statistics (AUC-sPRO @ FPR={FPR_LIMIT})")
print("="*50)

for method, values in method_values.items():
    # Methods without any usable score are skipped entirely
    if values:
        vals = list(values.values())
        print(f"\n{method}:")
        print(f"  Mean:   {np.mean(vals):.4f}")
        # np.std defaults to ddof=0, i.e. the population standard deviation
        print(f"  Std:    {np.std(vals):.4f}")
        print(f"  Min:    {np.min(vals):.4f}")
        print(f"  Max:    {np.max(vals):.4f}")
        print(f"  Median: {np.median(vals):.4f}")

In [None]:
# Performance gap analysis: relative AUC-sPRO change of every other loaded
# method against the centralized baseline, per object and on average.
if "centralized" in method_values:
    print("\nPerformance Gap Analysis (vs Centralized)")
    print("="*50)

    centralized_vals = method_values["centralized"]

    # Compare every loaded method except the baseline itself
    for method in [name for name in method_values if name != "centralized"]:
        method_vals = method_values[method]

        # Label the per-object lines so the output of different methods
        # cannot be confused
        print(f"\n{method}:")

        # Only objects scored by both methods are comparable; sort them so
        # the report order is stable between runs
        common_objs = sorted(set(centralized_vals) & set(method_vals))
        gaps = []
        for obj in common_objs:
            baseline = centralized_vals[obj]
            if baseline == 0:
                # A relative gap is undefined for a zero baseline
                print(f"  {obj}: n/a (centralized score is 0)")
                continue
            gap = method_vals[obj] - baseline
            gap_pct = gap / baseline * 100
            gaps.append(gap_pct)
            print(f"  {obj}: {gap_pct:+.2f}%")

        if gaps:
            print(f"\n  Mean gap ({method}): {np.mean(gaps):+.2f}%")
        else:
            # Avoid np.mean([]) (NaN plus a RuntimeWarning)
            print("  No comparable objects")

## 4. Statistical Significance Testing

In [None]:
# Run the project's statistical analysis once; later cells reuse stats_analysis
stats_analysis = compute_statistical_analysis(results, FPR_LIMIT)

# Show the per-method descriptive block; floats are shown with 4 decimals,
# everything else is printed as-is
print("Descriptive Statistics")
print("="*50)
descriptive = stats_analysis.get("descriptive", {})
for method in descriptive:
    print(f"\n{method}:")
    for key, value in descriptive[method].items():
        if not isinstance(value, float):
            print(f"  {key}: {value}")
            continue
        print(f"  {key}: {value:.4f}")

In [None]:
# Display pairwise comparisons
print("\nPairwise Statistical Comparisons")
print("="*50)

# Significance tiers, checked from strictest to loosest
significance_levels = (
    (0.001, "*** (p < 0.001)"),
    (0.01, "** (p < 0.01)"),
    (0.05, "* (p < 0.05)"),
)

for comp_key, comp_data in stats_analysis.get("comparisons", {}).items():
    print(f"\n{comp_key.replace('_vs_', ' vs ')}:")
    print(f"  Mean difference: {comp_data['mean_diff']:.4f}")
    print(f"  Cohen's d (effect size): {comp_data['cohens_d']:.4f}")

    t_test = comp_data.get("paired_t_test") or {}
    t_stat = t_test.get("statistic")
    p_val = t_test.get("p_value")
    # A missing or NaN p-value means the test could not be evaluated. Say so,
    # rather than printing a made-up p=0 next to a "not significant" label.
    has_t_test = p_val is not None and not np.isnan(p_val)

    if has_t_test:
        t_text = "n/a" if t_stat is None else f"{t_stat:.4f}"
        print(f"  Paired t-test: t={t_text}, p={p_val:.4f}")
    else:
        print("  Paired t-test: not available")

    wilcoxon = comp_data.get("wilcoxon") or {}
    w_p_val = wilcoxon.get("p_value")
    if w_p_val is not None:
        w_stat = wilcoxon.get("statistic")
        w_text = "n/a" if w_stat is None else f"{w_stat:.4f}"
        print(f"  Wilcoxon test: W={w_text}, p={w_p_val:.4f}")

    # Significance interpretation, based on the paired t-test only
    if has_t_test:
        sig = next(
            (label for limit, label in significance_levels if p_val < limit),
            "n.s. (not significant)",
        )
    else:
        sig = "n/a (no valid t-test result)"
    print(f"  Significance: {sig}")

## 5. Visualizations

In [None]:
# Bar comparison chart of the loaded results at the configured FPR limit.
# `fig` keeps the helper's return value; it is not used again in this cell.
fig = plot_comparison_bar_chart(results, FPR_LIMIT, figsize=(14, 6))
plt.show()

In [None]:
# Performance heatmap of the loaded results at the configured FPR limit.
# `fig` keeps the helper's return value; it is not used again in this cell.
fig = plot_performance_heatmap(results, FPR_LIMIT, figsize=(12, 6))
plt.show()

In [None]:
# Box plot comparison of the loaded results at the configured FPR limit.
# `fig` keeps the helper's return value; it is not used again in this cell.
fig = plot_box_comparison(results, FPR_LIMIT, figsize=(10, 6))
plt.show()

In [None]:
# One FPR-sPRO figure per object category, comparing all loaded methods
for obj_name in CATEGORIES:
    # Metrics of this object under each method ({} where a method lacks it)
    obj_results = {
        method: method_results.get(obj_name, {})
        for method, method_results in results.items()
    }

    # Nothing to draw when every method's entry is empty
    if not any(obj_results.values()):
        continue

    fig = plot_fpr_spro_curves(obj_results, obj_name, figsize=(8, 6))
    plt.show()

## 6. Per-Defect Type Analysis

In [None]:
# Mean AUC-sPRO per method, split into structural and logical defect types
print("Performance by Defect Type")
print("="*50)

for method, method_results in results.items():
    print(f"\n{method}:")

    # Scores pooled over all objects of this method, bucketed by defect kind
    structural_vals, logical_vals = [], []

    for obj_name, obj_results in method_results.items():
        localization = obj_results.get("localization", {})
        per_defect = localization.get("per_defect_type", {})

        for defect_type, auc_spro in per_defect.items():
            val = auc_spro.get(str(FPR_LIMIT))
            if val is None:
                continue
            # "structural" is checked first, so it wins if a defect name
            # contains both words
            if "structural" in defect_type:
                structural_vals.append(val)
            elif "logical" in defect_type:
                logical_vals.append(val)

    if structural_vals:
        print(f"  Structural anomalies: {np.mean(structural_vals):.4f} (n={len(structural_vals)})")
    if logical_vals:
        print(f"  Logical anomalies:    {np.mean(logical_vals):.4f} (n={len(logical_vals)})")

## 7. Image-Level Classification Results

In [None]:
# Image-level AUC-ROC: per-object mean value and the average across objects
print("Image-Level Classification (AUC-ROC)")
print("="*50)

for method, method_results in results.items():
    print(f"\n{method}:")

    roc_values = []

    for obj_name, obj_results in method_results.items():
        classification = obj_results.get("classification", {})
        mean_roc = classification.get("auc_roc", {}).get("mean")

        # Objects without a mean AUC-ROC are left out of the listing
        if mean_roc is None:
            continue
        roc_values.append(mean_roc)
        print(f"  {obj_name}: {mean_roc:.4f}")

    if roc_values:
        print("  ---")
        print(f"  Mean: {np.mean(roc_values):.4f}")

## 8. Key Findings & Conclusions

In [None]:
# Generate summary findings
print("="*60)
print("KEY FINDINGS")
print("="*60)

# Mean AUC-sPRO per method. Built before the `if results:` guard so the name
# always exists, even when nothing was loaded.
method_means = {
    method: float(np.mean(list(values.values())))
    for method, values in method_values.items()
    if values
}

if results:
    # 1. Best performing method
    if method_means:
        best_method = max(method_means, key=method_means.get)
        print(f"\n1. BEST PERFORMING METHOD: {best_method}")
        print(f"   Mean AUC-sPRO @ FPR={FPR_LIMIT}: {method_means[best_method]:.4f}")

    # 2. Performance gap of every other method relative to the centralized mean
    if "centralized" in method_means and len(method_means) > 1:
        print("\n2. FEDERATED vs CENTRALIZED GAP:")
        cent_mean = method_means["centralized"]

        for method in method_means:
            if method == "centralized":
                continue
            gap = method_means[method] - cent_mean
            if cent_mean == 0:
                # A relative gap is undefined for a zero baseline
                print(f"   {method}: n/a ({gap:+.4f})")
            else:
                gap_pct = gap / cent_mean * 100
                print(f"   {method}: {gap_pct:+.1f}% ({gap:+.4f})")

    # 3. Statistical significance (paired t-test, alpha = 0.05)
    print("\n3. STATISTICAL SIGNIFICANCE:")
    for comp_key, comp_data in stats_analysis.get("comparisons", {}).items():
        pair_label = comp_key.replace('_vs_', ' vs ')
        p_val = (comp_data.get("paired_t_test") or {}).get("p_value")
        # Do not invent p=1 for a test that is missing or produced NaN
        if p_val is None or np.isnan(p_val):
            print(f"   {pair_label}: n/a (no valid t-test result)")
            continue
        is_sig = "YES" if p_val < 0.05 else "NO"
        print(f"   {pair_label}: {is_sig} (p={p_val:.4f})")

    # 4. Effect sizes, labelled with the thresholds 0.2 / 0.5 / 0.8
    print("\n4. EFFECT SIZES (Cohen's d):")
    for comp_key, comp_data in stats_analysis.get("comparisons", {}).items():
        pair_label = comp_key.replace('_vs_', ' vs ')
        d = comp_data.get("cohens_d")
        # A missing effect size is reported as such instead of as d=0
        if d is None or np.isnan(d):
            print(f"   {pair_label}: n/a")
            continue
        magnitude = abs(d)
        if magnitude < 0.2:
            effect = "negligible"
        elif magnitude < 0.5:
            effect = "small"
        elif magnitude < 0.8:
            effect = "medium"
        else:
            effect = "large"
        print(f"   {pair_label}: d={d:.2f} ({effect})")

print("\n" + "="*60)

In [None]:
# Save results summary
# Bundles the FPR limit, per-method means, the statistical analysis and the
# comparison table into one dict.
# The `in dir()` checks fall back to {} when `method_means` or `df` has not
# been defined in this namespace (e.g. an earlier cell was skipped);
# `stats_analysis` has no such fallback and must already exist.
summary = {
    "fpr_limit": FPR_LIMIT,
    "method_means": method_means if 'method_means' in dir() else {},
    "statistical_analysis": stats_analysis,
    "comparison_table": df.to_dict() if 'df' in dir() else {},
}

# Optionally save to file
# NOTE(review): json.dump needs every value to be JSON-serializable — confirm
# that stats_analysis holds no NumPy arrays before enabling this.
# output_path = project_root / "outputs" / "analysis_summary.json"
# with open(output_path, "w") as f:
#     json.dump(summary, f, indent=2)
# print(f"Summary saved to {output_path}")