# Model Evaluation Techniques

This notebook covers comprehensive evaluation strategies for genomic language models using the Hyena-GLT framework.

## Topics Covered:
1. Evaluation Metrics
2. Benchmarking Strategies
3. Statistical Analysis
4. Visualization
5. Performance Profiling
6. Comparison with Baselines

In [None]:
import sys
import time
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from sklearn.metrics import (
    accuracy_score,
    average_precision_score,
    confusion_matrix,
    precision_recall_curve,
    precision_recall_fscore_support,
    roc_auc_score,
    roc_curve,
)

# Make the directory above the current working directory importable.
project_root = Path.cwd().parent
sys.path.append(str(project_root))


# Plot appearance: matplotlib defaults first, then seaborn's "husl" palette.
plt.style.use('default')
sns.set_palette("husl")

print("✅ Imports successful!")

## 1. Evaluation Metrics Overview

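Different genomic tasks call for different metrics: classification tasks are scored with accuracy, precision, recall, F1 and (for binary problems) AUC, while generation tasks are scored by comparing GC content, k-mer distributions and sequence diversity: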

In [None]:
class GenomicMetrics:
    """Metrics for genomic sequence classification and generation tasks.

    Every method is a ``@staticmethod``; the class is used as a namespace.
    """

    @staticmethod
    def sequence_classification_metrics(y_true, y_pred, y_scores=None):
        """Compute support-weighted classification metrics.

        Args:
            y_true: Ground-truth labels.
            y_pred: Predicted labels.
            y_scores: Optional scores passed to ``roc_auc_score`` and
                ``average_precision_score``. Ignored unless ``y_true``
                contains exactly two distinct labels.

        Returns:
            Dict with ``accuracy``, ``precision``, ``recall`` and ``f1``
            (weighted averages), plus ``auc_roc`` and ``auc_pr`` for binary
            problems when ``y_scores`` is given.
        """
        accuracy = accuracy_score(y_true, y_pred)
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_true, y_pred, average='weighted'
        )

        metrics = {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1
        }

        # Threshold-free metrics are only computed for the binary case.
        if y_scores is not None and len(np.unique(y_true)) == 2:
            metrics['auc_roc'] = roc_auc_score(y_true, y_scores)
            metrics['auc_pr'] = average_precision_score(y_true, y_scores)

        return metrics

    @staticmethod
    def sequence_generation_metrics(generated_seqs, reference_seqs):
        """Compute quality metrics for generated sequences.

        Args:
            generated_seqs: Generated sequences (strings).
            reference_seqs: Reference sequences (strings). GC content is
                compared position-wise, i.e. ``generated_seqs[i]`` against
                ``reference_seqs[i]``.

        Returns:
            Dict with ``gc_content_mse``, ``{k}_mer_similarity`` for
            k in (3, 4, 5), and ``diversity``.
        """
        metrics = {}

        # GC content similarity (element-wise squared difference, then mean)
        gc_content_gen = np.array([GenomicMetrics._gc_content(seq) for seq in generated_seqs])
        gc_content_ref = np.array([GenomicMetrics._gc_content(seq) for seq in reference_seqs])
        metrics['gc_content_mse'] = np.mean((gc_content_gen - gc_content_ref) ** 2)

        # K-mer distribution similarity
        for k in [3, 4, 5]:
            metrics[f'{k}_mer_similarity'] = GenomicMetrics._kmer_similarity(
                generated_seqs, reference_seqs, k
            )

        # Sequence diversity
        metrics['diversity'] = GenomicMetrics._sequence_diversity(generated_seqs)

        return metrics

    @staticmethod
    def _gc_content(sequence):
        """Return the fraction of G/C characters in ``sequence`` (0 if empty).

        The comparison is case-insensitive.
        """
        if len(sequence) == 0:
            return 0
        sequence = sequence.upper()
        gc_count = sequence.count('G') + sequence.count('C')
        return gc_count / len(sequence)

    @staticmethod
    def _kmer_frequencies(sequences, k):
        """Return ``{kmer: relative frequency}`` pooled over all sequences.

        Sequences are upper-cased first so that counting is case-insensitive,
        matching ``_gc_content``. Returns an empty dict when no sequence is
        at least ``k`` characters long.
        """
        counts = Counter(
            seq[i:i + k]
            for seq in (s.upper() for s in sequences)
            for i in range(len(seq) - k + 1)
        )
        total = sum(counts.values())
        return {kmer: count / total for kmer, count in counts.items()}

    @staticmethod
    def _kmer_similarity(seqs1, seqs2, k):
        """Return the k-mer distribution similarity of two sequence sets.

        The similarity is ``1 - JSD`` where JSD is the Jensen-Shannon
        divergence computed with base-2 logarithms, so the result lies in
        ``[0, 1]``: 1.0 for identical distributions and 0.0 for distributions
        that share no k-mer.

        Edge cases: if neither set contains a k-mer the result is 1.0; if
        exactly one set contains none the result is 0.0.
        """
        freq1 = GenomicMetrics._kmer_frequencies(seqs1, k)
        freq2 = GenomicMetrics._kmer_frequencies(seqs2, k)

        if not freq1 and not freq2:
            return 1.0
        if not freq1 or not freq2:
            return 0.0

        # Sorted for a deterministic summation order across runs.
        all_kmers = sorted(freq1.keys() | freq2.keys())
        p = np.array([freq1.get(kmer, 0.0) for kmer in all_kmers])
        q = np.array([freq2.get(kmer, 0.0) for kmer in all_kmers])
        m = (p + q) / 2

        def kl_to_mixture(dist):
            # Terms with dist == 0 contribute nothing (0 * log 0 := 0), so no
            # smoothing epsilon is needed; m > 0 wherever dist > 0.
            support = dist > 0
            return np.sum(dist[support] * np.log2(dist[support] / m[support]))

        js_div = 0.5 * kl_to_mixture(p) + 0.5 * kl_to_mixture(q)
        # Clip away floating-point drift just outside [0, 1].
        return float(np.clip(1.0 - js_div, 0.0, 1.0))

    @staticmethod
    def _sequence_diversity(sequences):
        """Return the mean pairwise edit distance between sequences.

        Each sequence is compared only with the (at most) 99 sequences that
        follow it, which bounds the number of comparisons for large inputs.
        Returns 0 when fewer than two sequences are given.
        """
        count = len(sequences)
        if count <= 1:
            return 0

        distances = [
            GenomicMetrics._edit_distance(sequences[i], sequences[j])
            for i in range(count)
            for j in range(i + 1, min(i + 100, count))  # Limit comparisons
        ]
        return sum(distances) / len(distances) if distances else 0

    @staticmethod
    def _edit_distance(s1, s2):
        """Return the Levenshtein edit distance between ``s1`` and ``s2``."""
        # Keep the shorter sequence as the DP row to limit memory.
        if len(s1) > len(s2):
            s1, s2 = s2, s1

        distances = range(len(s1) + 1)
        for i2, c2 in enumerate(s2):
            new_distances = [i2 + 1]
            for i1, c1 in enumerate(s1):
                if c1 == c2:
                    new_distances.append(distances[i1])
                else:
                    # substitution, deletion, insertion
                    new_distances.append(1 + min(distances[i1], distances[i1 + 1], new_distances[-1]))
            distances = new_distances
        return distances[-1]

# Demo: run both metric families on synthetic data.
print("🔬 Genomic Metrics Overview")
print("=" * 40)

# Synthetic binary task: labels drawn 60/40, scores from a Beta(2, 5),
# predictions obtained by thresholding the scores at 0.3.
np.random.seed(42)
y_true = np.random.choice([0, 1], size=1000, p=[0.6, 0.4])
y_scores = np.random.beta(2, 5, size=1000)
y_pred = (y_scores > 0.3).astype(int)

metrics = GenomicMetrics()
classification_metrics = metrics.sequence_classification_metrics(
    y_true, y_pred, y_scores
)

print("Classification Metrics:")
for metric, value in classification_metrics.items():
    print(f"  {metric}: {value:.4f}")

# Synthetic generation task: three 80-base sequences built from repeated
# 8-base motifs; only the last motif differs between the two sets.
generated = [motif * 10 for motif in ('ATCGATCG', 'GCTAGCTA', 'TTAAGGCC')]
reference = [motif * 10 for motif in ('ATCGATCG', 'GCTAGCTA', 'AATTGGCC')]

generation_metrics = metrics.sequence_generation_metrics(generated, reference)

print("\nGeneration Metrics:")
for metric, value in generation_metrics.items():
    print(f"  {metric}: {value:.4f}")

## 2. Performance Benchmarking

Comprehensive benchmarking suite:

In [None]:
class ModelBenchmark:
    """Benchmark inference latency, throughput and CUDA memory of a model.

    Args:
        model: Callable invoked as ``model(input_ids)`` with an integer
            tensor of shape ``(batch_size, seq_len)``.
        tokenizer: Stored on the instance; not used by the methods below.
        device: Device the dummy inputs are moved to. Any value accepted by
            ``torch.device`` works (``'cpu'``, ``'cuda'``, ``'cuda:1'``, a
            ``torch.device`` object).

    Results of each test are stored in ``self.results`` under the keys
    ``'inference_speed'`` and ``'memory_usage'``.
    """

    def __init__(self, model, tokenizer, device='cuda'):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.results = {}

    def _is_cuda(self):
        """Return True when ``self.device`` refers to any CUDA device."""
        # Comparing against the literal 'cuda' would miss 'cuda:0' and
        # torch.device objects, so compare the parsed device type instead.
        return torch.device(self.device).type == 'cuda'

    def _synchronize(self):
        """Wait for pending CUDA work on the benchmark device (no-op on CPU)."""
        if self._is_cuda():
            torch.cuda.synchronize(torch.device(self.device))

    def inference_speed_test(self, sequence_lengths, batch_sizes, num_runs=10, vocab_size=100):
        """Time forward passes for every (sequence length, batch size) pair.

        Args:
            sequence_lengths: Iterable of sequence lengths to test.
            batch_sizes: Iterable of batch sizes to test.
            num_runs: Number of timed forward passes per configuration
                (three untimed warm-up passes are run first).
            vocab_size: Exclusive upper bound of the random token ids fed to
                the model.

        Returns:
            DataFrame with columns ``sequence_length``, ``batch_size``,
            ``avg_time``, ``std_time`` (seconds) and ``throughput``
            (sequences per second). Also stored in
            ``self.results['inference_speed']``.
        """
        results = []

        with torch.no_grad():
            for seq_len in sequence_lengths:
                for batch_size in batch_sizes:
                    # Generate dummy data
                    dummy_input = torch.randint(
                        0, vocab_size, (batch_size, seq_len)
                    ).to(self.device)

                    # Warmup
                    for _ in range(3):
                        _ = self.model(dummy_input)

                    # Actual timing; synchronise around each run so that
                    # asynchronous CUDA kernels are fully accounted for.
                    times = []
                    for _ in range(num_runs):
                        self._synchronize()
                        start_time = time.perf_counter()

                        _ = self.model(dummy_input)

                        self._synchronize()
                        times.append(time.perf_counter() - start_time)

                    avg_time = np.mean(times)
                    std_time = np.std(times)
                    throughput = batch_size / avg_time

                    results.append({
                        'sequence_length': seq_len,
                        'batch_size': batch_size,
                        'avg_time': avg_time,
                        'std_time': std_time,
                        'throughput': throughput
                    })

        self.results['inference_speed'] = pd.DataFrame(results)
        return self.results['inference_speed']

    def memory_usage_test(self, sequence_lengths, batch_sizes, vocab_size=100):
        """Record CUDA memory after one forward pass per configuration.

        Args:
            sequence_lengths: Iterable of sequence lengths to test.
            batch_sizes: Iterable of batch sizes to test.
            vocab_size: Exclusive upper bound of the random token ids fed to
                the model.

        Returns:
            DataFrame with columns ``sequence_length``, ``batch_size``,
            ``current_memory_gb`` and ``peak_memory_gb`` (GiB), also stored
            in ``self.results['memory_usage']``; ``None`` (after printing a
            notice) when the device is not a CUDA device.
        """
        if not self._is_cuda():
            print("Memory testing only available for CUDA")
            return None

        device = torch.device(self.device)
        results = []

        for seq_len in sequence_lengths:
            for batch_size in batch_sizes:
                torch.cuda.empty_cache()
                torch.cuda.reset_peak_memory_stats(device)

                dummy_input = torch.randint(
                    0, vocab_size, (batch_size, seq_len)
                ).to(self.device)

                # Forward pass.
                # NOTE(review): gradients are not disabled here (unlike
                # inference_speed_test), so the figures may include tensors
                # kept for a backward pass — confirm this is intended.
                output = self.model(dummy_input)

                current_memory = torch.cuda.memory_allocated(device) / 1024**3  # GB
                peak_memory = torch.cuda.max_memory_allocated(device) / 1024**3  # GB

                results.append({
                    'sequence_length': seq_len,
                    'batch_size': batch_size,
                    'current_memory_gb': current_memory,
                    'peak_memory_gb': peak_memory
                })

                del dummy_input, output

        self.results['memory_usage'] = pd.DataFrame(results)
        return self.results['memory_usage']

    def plot_benchmarks(self):
        """Plot stored results as a 2x2 grid of heatmaps.

        The top row shows throughput and latency (if the speed test was
        run); the bottom row shows peak memory and tokens per GB (if the
        memory test was run). The stored DataFrames are not modified.
        """
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))

        if 'inference_speed' in self.results:
            df = self.results['inference_speed']

            # Throughput heatmap
            pivot_throughput = df.pivot(index='sequence_length', columns='batch_size', values='throughput')
            sns.heatmap(pivot_throughput, annot=True, fmt='.1f', ax=axes[0,0], cmap='viridis')
            axes[0,0].set_title('Throughput (sequences/sec)')

            # Latency heatmap
            pivot_latency = df.pivot(index='sequence_length', columns='batch_size', values='avg_time')
            sns.heatmap(pivot_latency, annot=True, fmt='.3f', ax=axes[0,1], cmap='plasma')
            axes[0,1].set_title('Average Latency (seconds)')

        if 'memory_usage' in self.results:
            df = self.results['memory_usage']

            # Memory usage heatmap
            pivot_memory = df.pivot(index='sequence_length', columns='batch_size', values='peak_memory_gb')
            sns.heatmap(pivot_memory, annot=True, fmt='.2f', ax=axes[1,0], cmap='Reds')
            axes[1,0].set_title('Peak Memory Usage (GB)')

            # Memory efficiency (tokens per GB). Computed on a copy so that
            # plotting does not add a column to the stored results.
            with_efficiency = df.assign(
                memory_efficiency=(df['sequence_length'] * df['batch_size']) / df['peak_memory_gb']
            )
            pivot_efficiency = with_efficiency.pivot(
                index='sequence_length', columns='batch_size', values='memory_efficiency'
            )
            sns.heatmap(pivot_efficiency, annot=True, fmt='.0f', ax=axes[1,1], cmap='Blues')
            axes[1,1].set_title('Memory Efficiency (tokens/GB)')

        plt.tight_layout()
        plt.show()

# Mock benchmark (no real model is available in this notebook)
print("🚀 Model Benchmarking Demo")
print("=" * 40)

# Configuration grid to simulate
sequence_lengths = [512, 1024, 2048]
batch_sizes = [8, 16, 32]

# Fabricate one result row per (sequence length, batch size) pair
mock_results = []
for seq_len in sequence_lengths:
    for batch_size in batch_sizes:
        # Latency grows with both dimensions, with ~10% Gaussian noise
        base_time = (seq_len / 1000) * (batch_size / 10) * 0.1
        avg_time = base_time + np.random.normal(0, base_time * 0.1)
        throughput = batch_size / avg_time
        memory_gb = (seq_len * batch_size * 4) / (1024**3) * 2  # Rough estimate

        mock_results.append(dict(
            sequence_length=seq_len,
            batch_size=batch_size,
            avg_time=avg_time,
            throughput=throughput,
            peak_memory_gb=memory_gb,
        ))

df_results = pd.DataFrame(mock_results)
print("Benchmark Results:")
print(df_results.round(3))

# One heatmap per measured quantity, side by side
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

pivot_throughput, pivot_latency, pivot_memory = (
    df_results.pivot(index='sequence_length', columns='batch_size', values=column)
    for column in ('throughput', 'avg_time', 'peak_memory_gb')
)

panels = [
    (pivot_throughput, '.1f', 'viridis', 'Throughput (seq/sec)'),
    (pivot_latency, '.3f', 'plasma', 'Latency (seconds)'),
    (pivot_memory, '.3f', 'Reds', 'Memory (GB)'),
]
for ax, (table, fmt, cmap, title) in zip(axes, panels):
    sns.heatmap(table, annot=True, fmt=fmt, ax=ax, cmap=cmap)
    ax.set_title(title)

plt.tight_layout()
plt.show()

## 3. Statistical Analysis

Statistical significance testing and confidence intervals:

In [None]:
import warnings

from scipy import stats
from scipy.stats import bootstrap, mannwhitneyu, ttest_ind

warnings.filterwarnings('ignore')

class StatisticalAnalysis:
    """Statistical analysis tools for model evaluation.

    Every method is a ``@staticmethod``; the class is used as a namespace.
    """

    @staticmethod
    def compare_models(model1_scores, model2_scores, metric_name="accuracy", alpha=0.05):
        """Compare two sets of scores with a significance test.

        A Shapiro-Wilk test is run on each sample; if neither rejects
        normality at level ``alpha`` an independent two-sample t-test is
        used, otherwise a two-sided Mann-Whitney U test. Samples with fewer
        than three values cannot be tested for normality and are routed to
        Mann-Whitney U.

        Args:
            model1_scores: Scores of the first model (sequence of numbers).
            model2_scores: Scores of the second model.
            metric_name: Label of the compared metric; echoed in the result.
            alpha: Significance level, also used for the ``1 - alpha``
                confidence intervals.

        Returns:
            Dict with per-model ``mean``, ``std`` (population, ddof=0) and
            t-based ``ci``; ``effect_size`` (Cohen's d using the pooled
            sample standard deviation); ``test_statistic``, ``p_value``,
            ``test_used``, ``significant`` and ``metric_name``.
        """
        scores1 = np.asarray(model1_scores, dtype=float)
        scores2 = np.asarray(model2_scores, dtype=float)
        n1, n2 = len(scores1), len(scores2)

        # Basic statistics
        mean1, std1 = np.mean(scores1), np.std(scores1)
        mean2, std2 = np.mean(scores2), np.std(scores2)

        # Effect size (Cohen's d). The pooled variance is the sum of squared
        # deviations over the pooled degrees of freedom, i.e. it is built
        # from the unbiased (ddof=1) sample variances.
        pooled_dof = n1 + n2 - 2
        squared_dev = np.sum((scores1 - mean1) ** 2) + np.sum((scores2 - mean2) ** 2)
        if pooled_dof <= 0:
            cohens_d = float('nan')  # undefined with a single observation each
        else:
            pooled_std = np.sqrt(squared_dev / pooled_dof)
            if pooled_std > 0:
                cohens_d = (mean1 - mean2) / pooled_std
            elif mean1 == mean2:
                cohens_d = 0.0
            else:
                # Zero spread but different means: infinitely large effect.
                cohens_d = float(np.copysign(np.inf, mean1 - mean2))

        def looks_normal(sample):
            # Shapiro-Wilk needs at least three observations.
            if len(sample) < 3:
                return False
            _, p_norm = stats.shapiro(sample)
            return p_norm > alpha

        # Choose appropriate test
        if looks_normal(scores1) and looks_normal(scores2):
            # Both normal: use t-test
            stat, p_value = ttest_ind(scores1, scores2)
            test_used = "t-test"
        else:
            # Non-normal: use Mann-Whitney U
            stat, p_value = mannwhitneyu(scores1, scores2, alternative='two-sided')
            test_used = "Mann-Whitney U"

        # Confidence intervals
        ci1 = stats.t.interval(1 - alpha, n1 - 1, mean1, stats.sem(scores1))
        ci2 = stats.t.interval(1 - alpha, n2 - 1, mean2, stats.sem(scores2))

        return {
            'model1_mean': mean1,
            'model1_std': std1,
            'model1_ci': ci1,
            'model2_mean': mean2,
            'model2_std': std2,
            'model2_ci': ci2,
            'effect_size': cohens_d,
            'test_statistic': stat,
            'p_value': p_value,
            'test_used': test_used,
            'significant': bool(p_value < alpha),
            'metric_name': metric_name
        }

    @staticmethod
    def bootstrap_confidence_interval(data, statistic=np.mean, confidence_level=0.95, n_bootstrap=1000):
        """Compute a bootstrap confidence interval for ``statistic(data)``.

        Args:
            data: One-dimensional sample.
            statistic: Callable taking the sample and returning a scalar.
            confidence_level: Confidence level of the interval.
            n_bootstrap: Number of bootstrap resamples.

        Returns:
            Dict with ``statistic`` (value on the original data),
            ``confidence_interval`` (object with ``low`` / ``high``) and
            ``confidence_level``. Resampling uses a fixed seed (42).
        """
        # vectorized=False: the statistic is called with one 1-D resample at
        # a time, so it does not need to accept an ``axis`` argument.
        result = bootstrap(
            (data,),
            statistic,
            n_resamples=n_bootstrap,
            confidence_level=confidence_level,
            vectorized=False,
            random_state=42,
        )

        return {
            'statistic': statistic(data),
            'confidence_interval': result.confidence_interval,
            'confidence_level': confidence_level
        }

    @staticmethod
    def multiple_comparisons_correction(p_values, method='bonferroni'):
        """Adjust p-values for multiple comparisons.

        Args:
            p_values: Sequence of raw p-values.
            method: ``'bonferroni'`` or ``'holm'``.

        Returns:
            NumPy array of adjusted p-values, in the input order and capped
            at 1.0.

        Raises:
            ValueError: If ``method`` is not recognised.
        """
        p_values = np.asarray(p_values, dtype=float)
        n_tests = len(p_values)

        if method == 'bonferroni':
            return np.minimum(p_values * n_tests, 1.0)

        if method == 'holm':
            # Holm's step-down method: the i-th smallest p-value is scaled by
            # (m - i); a running maximum then keeps the adjusted values
            # non-decreasing, so a larger raw p-value never ends up with a
            # smaller adjusted one.
            order = np.argsort(p_values)
            scaled = p_values[order] * (n_tests - np.arange(n_tests))
            adjusted = np.minimum(np.maximum.accumulate(scaled), 1.0)

            corrected = np.empty_like(p_values)
            corrected[order] = adjusted
            return corrected

        raise ValueError(f"Unknown method: {method}")

# Demo: significance testing on simulated per-run scores.
print("📊 Statistical Analysis Demo")
print("=" * 40)

# Thirty simulated scores per model, drawn in order A, B, C from normal
# distributions with the (mean, std) pairs listed below.
np.random.seed(42)
model_a_scores, model_b_scores, model_c_scores = (
    np.random.normal(mean, std, 30)
    for mean, std in ((0.85, 0.05), (0.88, 0.04), (0.87, 0.06))
)

stats_analyzer = StatisticalAnalysis()

# Pairwise test: Model A against Model B
comparison_ab = stats_analyzer.compare_models(model_a_scores, model_b_scores, "F1-Score")

print("Model A vs Model B Comparison:")
for label, prefix in (("A", "model1"), ("B", "model2")):
    print(f"  Model {label}: {comparison_ab[prefix + '_mean']:.4f} ± {comparison_ab[prefix + '_std']:.4f}")
print(f"  Effect Size (Cohen's d): {comparison_ab['effect_size']:.4f}")
print(f"  Test Used: {comparison_ab['test_used']}")
print(f"  P-value: {comparison_ab['p_value']:.6f}")
print(f"  Significant: {comparison_ab['significant']}")

# Bootstrap confidence intervals for the mean of each model
bootstrap_a = stats_analyzer.bootstrap_confidence_interval(model_a_scores)
bootstrap_b = stats_analyzer.bootstrap_confidence_interval(model_b_scores)

print("\nBootstrap 95% CI:")
for label, boot in (("A", bootstrap_a), ("B", bootstrap_b)):
    interval = boot['confidence_interval']
    print(f"  Model {label}: [{interval.low:.4f}, {interval.high:.4f}]")

# All three pairwise comparisons, then a Bonferroni adjustment
comparisons = [
    stats_analyzer.compare_models(first, second)
    for first, second in (
        (model_a_scores, model_b_scores),
        (model_a_scores, model_c_scores),
        (model_b_scores, model_c_scores),
    )
]

p_values = [comp['p_value'] for comp in comparisons]
corrected_p = stats_analyzer.multiple_comparisons_correction(p_values, 'bonferroni')

print("\nMultiple Comparisons (Bonferroni correction):")
comparison_names = ['A vs B', 'A vs C', 'B vs C']
for name, orig_p, corr_p in zip(comparison_names, p_values, corrected_p):
    print(f"  {name}: p={orig_p:.6f} → corrected p={corr_p:.6f}")

## 4. Evaluation Visualization

Comprehensive visualization of evaluation results:

In [None]:
class EvaluationVisualizer:
    """Plotting helpers for classification evaluation results.

    Every method is a ``@staticmethod`` that draws a figure and calls
    ``plt.show()``; nothing is returned.
    """

    @staticmethod
    def plot_confusion_matrix(y_true, y_pred, class_names=None, normalize=False):
        """Plot a confusion-matrix heatmap.

        Args:
            y_true: Ground-truth labels.
            y_pred: Predicted labels.
            class_names: Optional tick labels for both axes; when omitted
                seaborn chooses the labels.
            normalize: If True, each row is divided by its total so that
                cells show per-class fractions.
        """
        cm = confusion_matrix(y_true, y_pred)

        if normalize:
            row_totals = cm.sum(axis=1, keepdims=True)
            # A row with no samples stays at 0 instead of becoming NaN.
            cm = np.divide(
                cm.astype('float'),
                row_totals,
                out=np.zeros(cm.shape, dtype=float),
                where=row_totals != 0,
            )
            fmt = '.2f'
        else:
            fmt = 'd'

        # Fall back to seaborn's default labelling when no names are given.
        tick_labels = 'auto' if class_names is None else class_names

        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt=fmt, cmap='Blues',
                    xticklabels=tick_labels, yticklabels=tick_labels)
        plt.title('Confusion Matrix' + (' (Normalized)' if normalize else ''))
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.show()

    @staticmethod
    def plot_roc_curves(models_data, title="ROC Curves"):
        """Plot one ROC curve per model on a shared axis.

        Args:
            models_data: Mapping ``model name -> (y_true, y_scores)``.
            title: Figure title.
        """
        plt.figure(figsize=(10, 8))

        for model_name, (y_true, y_scores) in models_data.items():
            fpr, tpr, _ = roc_curve(y_true, y_scores)
            auc = roc_auc_score(y_true, y_scores)
            plt.plot(fpr, tpr, label=f'{model_name} (AUC = {auc:.3f})', linewidth=2)

        # Diagonal reference line
        plt.plot([0, 1], [0, 1], 'k--', alpha=0.5)
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(title)
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.show()

    @staticmethod
    def plot_precision_recall_curves(models_data, title="Precision-Recall Curves"):
        """Plot one precision-recall curve per model on a shared axis.

        Args:
            models_data: Mapping ``model name -> (y_true, y_scores)``.
            title: Figure title.
        """
        plt.figure(figsize=(10, 8))

        for model_name, (y_true, y_scores) in models_data.items():
            precision, recall, _ = precision_recall_curve(y_true, y_scores)
            ap = average_precision_score(y_true, y_scores)
            plt.plot(recall, precision, label=f'{model_name} (AP = {ap:.3f})', linewidth=2)

        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.title(title)
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.show()

    @staticmethod
    def plot_metric_comparison(models_metrics, metrics_to_plot=None):
        """Plot a grouped bar chart and a radar chart of model metrics.

        Args:
            models_metrics: Mapping ``model name -> {metric name: value}``.
            metrics_to_plot: Metric names to include; defaults to
                accuracy, precision, recall and f1.
        """
        if metrics_to_plot is None:
            metrics_to_plot = ['accuracy', 'precision', 'recall', 'f1']

        df = pd.DataFrame(models_metrics).T
        df = df[metrics_to_plot]

        # The radar panel needs a polar projection, which must be requested
        # when the axis is created, so the two panels are added separately.
        fig = plt.figure(figsize=(15, 6))
        ax_bar = fig.add_subplot(1, 2, 1)
        ax_radar = fig.add_subplot(1, 2, 2, projection='polar')

        # Bar plot
        df.plot(kind='bar', ax=ax_bar, width=0.8)
        ax_bar.set_title('Model Performance Comparison')
        ax_bar.set_ylabel('Score')
        ax_bar.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        ax_bar.set_ylim(0, 1)
        ax_bar.grid(True, alpha=0.3)

        # Radar plot: first metric at the top, proceeding clockwise
        angles = np.linspace(0, 2*np.pi, len(metrics_to_plot), endpoint=False)
        angles = np.concatenate((angles, [angles[0]]))  # Complete the circle

        ax_radar.set_theta_offset(np.pi / 2)
        ax_radar.set_theta_direction(-1)

        for model_name, values in df.iterrows():
            values_list = values.tolist()
            values_list += [values_list[0]]  # Complete the circle
            ax_radar.plot(angles, values_list, 'o-', linewidth=2, label=model_name)
            ax_radar.fill(angles, values_list, alpha=0.25)

        ax_radar.set_xticks(angles[:-1])
        ax_radar.set_xticklabels(metrics_to_plot)
        ax_radar.set_ylim(0, 1)
        ax_radar.set_title('Model Performance Radar')
        ax_radar.legend(bbox_to_anchor=(1.3, 1.0))

        plt.tight_layout()
        plt.show()

# Demo: draw every plot type on simulated classifier outputs.
print("📈 Evaluation Visualizations")
print("=" * 40)

visualizer = EvaluationVisualizer()

# Simulated binary labels (60% negative, 40% positive)
np.random.seed(42)
n_samples = 1000
y_true = np.random.choice([0, 1], size=n_samples, p=[0.6, 0.4])

# Three simulated models; each gets scores from its own Beta(a, b), drawn in
# the order listed here.
beta_params = {
    'Hyena-GLT': (2, 3),
    'Transformer': (1.8, 3.2),
    'CNN': (1.5, 4),
}
models_data = {
    name: (y_true, np.random.beta(a, b, n_samples))
    for name, (a, b) in beta_params.items()
}

# ROC curves for all models
visualizer.plot_roc_curves(models_data, "Model Comparison: ROC Curves")

# Precision-recall curves for all models
visualizer.plot_precision_recall_curves(models_data, "Model Comparison: Precision-Recall")

# Row-normalised confusion matrix for the Hyena-GLT scores at threshold 0.3
best_model_scores = models_data['Hyena-GLT'][1]
y_pred = (best_model_scores > 0.3).astype(int)
visualizer.plot_confusion_matrix(y_true, y_pred, ['Negative', 'Positive'], normalize=True)

# Per-model metric dictionaries, using the same 0.3 threshold
models_metrics = {}
for model_name, model_outputs in models_data.items():
    y_true_model, y_scores_model = model_outputs
    y_pred_model = (y_scores_model > 0.3).astype(int)
    metrics = GenomicMetrics.sequence_classification_metrics(
        y_true_model, y_pred_model, y_scores_model
    )
    models_metrics[model_name] = metrics

visualizer.plot_metric_comparison(models_metrics)

print("\n✅ All visualizations completed!")

## 5. Comprehensive Evaluation Report

Generate a complete evaluation report:

In [None]:
class EvaluationReport:
    """Collect per-model metrics and render a ranked text report."""

    def __init__(self):
        # model name -> {'metrics': dict, 'benchmark': object or None}
        self.report_data = {}

    def add_model_results(self, model_name, metrics, benchmark_results=None):
        """Register (or overwrite) the results stored for ``model_name``."""
        entry = {'metrics': metrics, 'benchmark': benchmark_results}
        self.report_data[model_name] = entry

    def generate_summary(self):
        """Build the summary dict used by ``print_report``.

        Returns:
            Dict with ``num_models``, ``best_model`` and ``best_metric``
            (highest F1 strictly above 0, first one wins ties; ``None`` / 0
            otherwise), ``metric_comparison`` (metric -> model -> value) and
            ``recommendations`` (list of strings for the best model).
        """
        # Best model by F1 score; only a strictly larger score replaces it.
        best_model, best_metric = None, 0
        for name, entry in self.report_data.items():
            candidate = entry['metrics'].get('f1', 0)
            if candidate > best_metric:
                best_model, best_metric = name, candidate

        # Side-by-side values of the four headline metrics (0 when missing).
        metric_comparison = {}
        for metric in ('accuracy', 'precision', 'recall', 'f1'):
            per_model = {}
            for name, entry in self.report_data.items():
                per_model[name] = entry['metrics'].get(metric, 0)
            metric_comparison[metric] = per_model

        # Threshold-based recommendations for the best model only.
        recommendations = []
        if best_model:
            best_metrics = self.report_data[best_model]['metrics']

            if best_metrics.get('precision', 0) > 0.9:
                recommendations.append("High precision - good for critical applications")

            if best_metrics.get('recall', 0) > 0.9:
                recommendations.append("High recall - good for comprehensive detection")

            recommendations.append(
                "Balanced performance - suitable for production"
                if best_metrics.get('f1', 0) > 0.85
                else "Consider further optimization or data augmentation"
            )

        return {
            'num_models': len(self.report_data),
            'best_model': best_model,
            'best_metric': best_metric,
            'metric_comparison': metric_comparison,
            'recommendations': recommendations,
        }

    def print_report(self):
        """Print the summary, per-model metrics, recommendations and ranking."""
        print("🔍 COMPREHENSIVE EVALUATION REPORT")
        print("=" * 60)

        summary = self.generate_summary()

        print("\n📊 SUMMARY")
        print(f"  Models Evaluated: {summary['num_models']}")
        print(f"  Best Model: {summary['best_model']} (F1: {summary['best_metric']:.4f})")

        print("\n📈 DETAILED METRICS")
        for name, entry in self.report_data.items():
            print(f"\n  {name.upper()}:")
            for metric, value in entry['metrics'].items():
                print(f"    {metric.title()}: {value:.4f}")

        print("\n🎯 RECOMMENDATIONS")
        for position, recommendation in enumerate(summary['recommendations'], start=1):
            print(f"  {position}. {recommendation}")

        # Ranking by descending F1; models with equal F1 keep insertion order.
        print("\n🏆 PERFORMANCE RANKING (by F1 Score)")

        def f1_of(item):
            return item[1]['metrics'].get('f1', 0)

        ranked_models = sorted(self.report_data.items(), key=f1_of, reverse=True)

        for position, (name, entry) in enumerate(ranked_models, start=1):
            scores = entry['metrics']
            f1 = scores.get('f1', 0)
            acc = scores.get('accuracy', 0)
            print(f"  {position}. {name}: F1={f1:.4f}, Accuracy={acc:.4f}")

# Demo: assemble and print a report for the simulated models.
print("📋 Evaluation Report Generation")
print("=" * 40)

# Empty report to be filled from the simulated outputs
report = EvaluationReport()

# One entry per model, predictions obtained by thresholding scores at 0.3
for model_name in models_data:
    y_true_model, y_scores_model = models_data[model_name]
    y_pred_model = (y_scores_model > 0.3).astype(int)
    metrics = GenomicMetrics.sequence_classification_metrics(
        y_true_model, y_pred_model, y_scores_model
    )
    report.add_model_results(model_name, metrics)

# Render the report to stdout
report.print_report()

print("\n✅ Evaluation complete! Ready for model deployment decision.")