# In [None]:
"""
Testing & Evaluation System for Multi-Agent Architecture
This module provides comprehensive testing and evaluation capabilities for measuring
agent performance and improvement over time.
"""

import json
import time
import asyncio
import logging
from enum import Enum
from typing import Dict, List, Any, Optional, Set, Tuple
from datetime import datetime
from pydantic import BaseModel, Field
from collections import defaultdict

# Root logging setup: INFO level with timestamp, logger name, level and message.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
# Module-wide logger shared by every class in this file.
logger = logging.getLogger("testing_evaluation")

# ============================================================================
# Data Models
# ============================================================================

class MetricType(str, Enum):
    """Names of the performance metrics tracked for an agent.

    Members are also ``str`` instances, so a member compares equal to (and
    hashes like) its plain string value and can be used interchangeably with
    it as a dictionary key.
    """

    # Outcome and speed of individual test executions.
    ACCURACY = "accuracy"
    EFFICIENCY = "efficiency"

    # Longer-term behaviour of the agent.
    LEARNING_RATE = "learning_rate"
    ERROR_RECOVERY = "error_recovery"

    # How the agent works with its context, with patterns and with others.
    CONTEXT_UTILIZATION = "context_utilization"
    PATTERN_RECOGNITION = "pattern_recognition"
    COLLABORATION = "collaboration"


class TestResult(BaseModel):
    """Result of a single test case execution.

    Mutable defaults are declared with ``default_factory`` (as ``timestamp``
    already is) so that each instance explicitly gets its own container.
    """
    # Identifier of the executed test case.
    test_id: str
    # Identifier of the agent the test was run against.
    agent_id: str
    # Creation time of this record (naive local time from ``datetime.now``).
    timestamp: datetime = Field(default_factory=datetime.now)
    # Whether the test is considered successful.
    success: bool
    # Measured duration of the test, in seconds.
    execution_time: float
    # Metric name -> value computed for this execution.
    metrics: Dict[str, float]
    # Command that was sent to the agent.
    command: str
    # Output returned by the agent (empty when execution raised).
    output: str
    # Thought records reported by the agent while executing the command.
    thoughts: List[Dict[str, Any]] = Field(default_factory=list)
    # Size of the agent context recorded for this test.
    context_size: int
    # Error messages collected while running the test.
    errors: List[str] = Field(default_factory=list)


class AgentMetricSnapshot(BaseModel):
    """Snapshot of all metrics for an agent at a point in time.

    Mutable defaults are declared with ``default_factory`` (as ``timestamp``
    already is) so that each instance explicitly gets its own container.
    """
    # Identifier of the agent the snapshot describes.
    agent_id: str
    # Creation time of the snapshot (naive local time from ``datetime.now``).
    timestamp: datetime = Field(default_factory=datetime.now)
    # Metric name -> aggregated value.
    metrics: Dict[str, float]
    # Identifiers of the tests the snapshot was computed from.
    recent_tests: List[str] = Field(default_factory=list)
    # Metric name -> relative change compared with an earlier snapshot.
    improvement_rate: Dict[str, float] = Field(default_factory=dict)
    

class TestScenario(BaseModel):
    """A scenario for testing agents with specific conditions.

    Mutable defaults are declared with ``default_factory`` so that each
    instance explicitly gets its own container.
    """
    # Unique key the scenario is registered and looked up under.
    scenario_id: str
    # Human-readable name (used in log messages).
    name: str
    # Free-text description of what the scenario exercises.
    description: str
    # Difficulty of the scenario; defaults to 1.
    difficulty_level: int = 1
    # Names of the skills the scenario requires.
    required_skills: Set[str] = Field(default_factory=set)
    # Test case dictionaries; the runner reads the "id" and "command" keys
    # and, when present, "expected_time".
    test_cases: List[Dict[str, Any]] = Field(default_factory=list)
    # Metric name -> value used as evaluation criterion.
    evaluation_criteria: Dict[str, float] = Field(default_factory=dict)


class LearningAssessment(BaseModel):
    """Assessment of an agent's learning progress over time.

    Mutable defaults are declared with ``default_factory`` so that each
    instance explicitly gets its own container.
    """
    # Identifier of the assessed agent.
    agent_id: str
    # Start and end of the assessed period.
    start_date: datetime
    end_date: datetime
    # Metric name -> list of (timestamp, value) points.
    metrics_trend: Dict[str, List[Tuple[datetime, float]]] = Field(default_factory=dict)
    # Metric name -> change in value over the period.
    skill_improvements: Dict[str, float] = Field(default_factory=dict)
    # Number of patterns identified during the period.
    identified_patterns: int
    # Improvement achieved relative to the number of tests run.
    learning_efficiency: float
    # Metric names classified as strengths.
    strengths: List[str] = Field(default_factory=list)
    # Metric names classified as needing improvement.
    improvement_areas: List[str] = Field(default_factory=list)


# ============================================================================
# Test Runner
# ============================================================================

class TestRunner:
    """Executes registered test scenarios against agents and records the results.

    Every result is appended to ``self.results`` and, when a database
    connector was supplied, also passed to ``db.store_test_result``.
    """

    def __init__(self, db_connector=None):
        """Create a runner.

        Args:
            db_connector: Optional object exposing an awaitable
                ``store_test_result(result)``. When falsy, results are only
                kept in memory.
        """
        self.scenarios = {}
        self.results = []
        self.db = db_connector
        self.baseline_metrics = defaultdict(dict)

    async def register_scenario(self, scenario: TestScenario) -> None:
        """Register a test scenario under its ``scenario_id``.

        Registering a scenario whose id is already known replaces the
        previous entry.
        """
        self.scenarios[scenario.scenario_id] = scenario
        logger.info(f"Registered scenario: {scenario.name}")

    async def run_test(self, scenario_id: str, agent, context=None) -> List[TestResult]:
        """Run all test cases in a scenario against an agent.

        Args:
            scenario_id: Id of a previously registered scenario.
            agent: Object exposing ``agent_id`` and an awaitable
                ``execute_command(command, context=...)`` that returns a
                ``(success, output, thoughts)`` triple. An optional
                ``context`` attribute is used for context metrics.
            context: Passed through unchanged to ``execute_command``.

        Returns:
            One ``TestResult`` per test case, in scenario order. A test case
            whose execution raises is recorded as a failed result carrying
            the error message instead of aborting the run.

        Raises:
            ValueError: If ``scenario_id`` has not been registered.
            KeyError: If a test case lacks the ``"id"`` or ``"command"`` key.
        """
        if scenario_id not in self.scenarios:
            raise ValueError(f"Unknown scenario ID: {scenario_id}")

        scenario = self.scenarios[scenario_id]
        results = []

        logger.info(f"Running scenario '{scenario.name}' on agent {agent.agent_id}")

        for test_case in scenario.test_cases:
            # Read the mandatory keys up front so a malformed test case fails
            # with a plain KeyError rather than one raised inside the handler.
            test_id = test_case["id"]
            command = test_case["command"]

            # Captured before the clock starts so serialization cost is not
            # attributed to the agent.
            initial_context_size = self._context_size(agent)

            # perf_counter is monotonic, so durations cannot go negative when
            # the wall clock is adjusted.
            started = time.perf_counter()

            try:
                success, output, thoughts = await agent.execute_command(
                    command,
                    context=context
                )
                execution_time = time.perf_counter() - started

                # Agents may report no thoughts at all.
                thoughts = thoughts or []

                metrics = self._calculate_metrics(
                    agent, test_case, success, output, thoughts,
                    execution_time=execution_time
                )

                result = TestResult(
                    test_id=test_id,
                    agent_id=agent.agent_id,
                    success=success,
                    execution_time=execution_time,
                    metrics=metrics,
                    command=command,
                    output=output,
                    thoughts=thoughts,
                    context_size=initial_context_size,
                    errors=[]
                )

            except Exception as e:
                # Deliberately broad: any failure while executing or scoring
                # a test case is recorded as a failed result.
                execution_time = time.perf_counter() - started
                result = TestResult(
                    test_id=test_id,
                    agent_id=agent.agent_id,
                    success=False,
                    execution_time=execution_time,
                    metrics={},
                    command=command,
                    output="",
                    context_size=initial_context_size,
                    errors=[str(e)]
                )
                logger.error(f"Test failed: {str(e)}")

            results.append(result)
            self.results.append(result)

            # Store result if db connection available
            if self.db:
                await self.db.store_test_result(result)

        return results

    @staticmethod
    def _context_size(agent) -> int:
        """Return the length of the agent's serialized context (0 if it has none)."""
        if not hasattr(agent, "context"):
            return 0
        try:
            # default=str keeps the size estimate working for values json
            # cannot encode natively (datetimes, sets, custom objects).
            return len(json.dumps(agent.context, default=str))
        except (TypeError, ValueError):
            # Unsupported key types or circular references: fall back to a
            # rough size instead of aborting the whole scenario.
            return len(repr(agent.context))

    def _calculate_metrics(self, agent, test_case, success, output, thoughts,
                           execution_time=None) -> Dict[str, float]:
        """Calculate performance metrics for one test execution.

        Args:
            agent: The agent under test; its optional ``context`` attribute
                feeds the context-utilization metric.
            test_case: Test case dictionary; ``"expected_time"`` enables the
                efficiency metric.
            success: Whether the command succeeded.
            output: Command output (currently not used by any metric).
            thoughts: Thought dictionaries reported by the agent, or ``None``.
            execution_time: Measured duration of the test in seconds. When
                omitted, the elapsed time since ``agent.start_time`` is used
                if the agent has that attribute; otherwise efficiency is
                skipped.

        Returns:
            Metric name (plain string) -> value in the range 0.0 to 1.0.
        """
        thoughts = thoughts or []

        # Plain string keys match the declared Dict[str, float]; MetricType
        # members still look these entries up because they hash/compare as str.
        metrics = {MetricType.ACCURACY.value: 1.0 if success else 0.0}

        # Efficiency - expected duration relative to the measured duration.
        if "expected_time" in test_case:
            elapsed = execution_time
            if elapsed is None:
                agent_start = getattr(agent, "start_time", None)
                if agent_start is not None:
                    elapsed = time.time() - agent_start
            if elapsed is not None:
                if elapsed > 0:
                    ratio = test_case["expected_time"] / elapsed
                else:
                    # Finished within clock resolution: as efficient as it gets.
                    ratio = 1.0
                metrics[MetricType.EFFICIENCY.value] = max(0.0, min(1.0, ratio))

        # Context utilization - thoughts that used context items, relative to
        # the number of context items; capped like the other metrics.
        agent_context = getattr(agent, "context", None)
        if agent_context:
            relevant_items = sum(
                1 for thought in thoughts if thought.get("context_items_used", 0) > 0
            )
            total_items = len(agent_context)
            metrics[MetricType.CONTEXT_UTILIZATION.value] = min(
                1.0, relevant_items / max(1, total_items)
            )

        # Pattern recognition - five or more pattern-bearing thoughts score 1.0.
        pattern_count = sum(1 for thought in thoughts if "pattern" in thought)
        metrics[MetricType.PATTERN_RECOGNITION.value] = min(1.0, pattern_count / 5.0)

        return metrics


# ============================================================================
# Evaluation System
# ============================================================================

class EvaluationSystem:
    """Analyzes test results and generates improvement recommendations."""

    def __init__(self, db_connector=None):
        """Create an evaluation system.

        Args:
            db_connector: Optional object providing the awaitable snapshot
                and statistics queries used by the methods below. Snapshots
                are always kept in ``self.snapshots`` as well.
        """
        self.db = db_connector
        self.baseline_metrics = {}
        self.snapshots = []

    async def create_agent_snapshot(self, agent_id: str, test_results: List[TestResult]) -> AgentMetricSnapshot:
        """Create a performance snapshot for an agent from recent test results.

        Each metric is averaged over the results that report it. Improvement
        rates are relative to the agent's previous snapshot, which is taken
        from the database when one is configured and from the in-memory
        snapshot list otherwise.

        Args:
            agent_id: Agent the snapshot belongs to.
            test_results: Non-empty list of results to aggregate.

        Returns:
            The new snapshot (also appended to ``self.snapshots`` and stored
            in the database when available).

        Raises:
            ValueError: If ``test_results`` is empty.
        """
        if not test_results:
            raise ValueError("No test results provided for snapshot creation")

        # Aggregate metrics across test results
        aggregated_metrics = defaultdict(list)
        for result in test_results:
            for metric_name, value in result.metrics.items():
                aggregated_metrics[metric_name].append(value)

        # Lists are only created on append, so they are never empty here.
        metrics = {
            metric_name: sum(values) / len(values)
            for metric_name, values in aggregated_metrics.items()
        }

        # Must be looked up before the new snapshot is appended below.
        previous_snapshot = await self._find_previous_snapshot(agent_id)

        improvement_rate = {}
        if previous_snapshot:
            for metric_name, current_value in metrics.items():
                if metric_name in previous_snapshot.metrics:
                    previous_value = previous_snapshot.metrics[metric_name]
                    # A relative rate is undefined for a zero baseline.
                    if previous_value > 0:
                        improvement_rate[metric_name] = (current_value - previous_value) / previous_value

        snapshot = AgentMetricSnapshot(
            agent_id=agent_id,
            metrics=metrics,
            recent_tests=[result.test_id for result in test_results],
            improvement_rate=improvement_rate
        )

        self.snapshots.append(snapshot)
        if self.db:
            await self.db.store_agent_snapshot(snapshot)

        return snapshot

    async def _find_previous_snapshot(self, agent_id: str):
        """Return the agent's most recent snapshot, or ``None`` if there is none."""
        if self.db:
            return await self.db.get_latest_snapshot(agent_id)
        # No database: use the newest in-memory snapshot for this agent.
        for snapshot in reversed(self.snapshots):
            if snapshot.agent_id == agent_id:
                return snapshot
        return None

    async def assess_learning(self, agent_id: str, start_date: datetime, end_date: datetime) -> LearningAssessment:
        """Assess learning progress for an agent over a time period.

        Args:
            agent_id: Agent to assess.
            start_date: Start of the period passed to the database queries.
            end_date: End of the period passed to the database queries.

        Returns:
            A ``LearningAssessment`` where ``skill_improvements`` holds the
            absolute change of every metric present in both the earliest and
            the latest snapshot. Metrics that improved by more than 0.1 are
            listed as strengths; those below 0.05 as improvement areas.

        Raises:
            ValueError: If no database is configured or no snapshots exist
                for the period.
        """
        if not self.db:
            raise ValueError("Database connection required for learning assessment")

        snapshots = await self.db.get_agent_snapshots(
            agent_id,
            start_date=start_date,
            end_date=end_date
        )

        if not snapshots:
            raise ValueError(f"No snapshots found for agent {agent_id} in the specified period")

        # Do not rely on the connector's ordering: "first" and "last" below
        # must be chronological.
        snapshots = sorted(snapshots, key=lambda snapshot: snapshot.timestamp)

        # Calculate metric trends
        metrics_trend = defaultdict(list)
        for snapshot in snapshots:
            for metric_name, value in snapshot.metrics.items():
                metrics_trend[metric_name].append((snapshot.timestamp, value))

        first_snapshot = snapshots[0]
        last_snapshot = snapshots[-1]

        # Absolute differences are well defined for a zero baseline too, so
        # metrics that started at 0 are included.
        skill_improvements = {
            metric_name: end_value - first_snapshot.metrics[metric_name]
            for metric_name, end_value in last_snapshot.metrics.items()
            if metric_name in first_snapshot.metrics
        }

        # Tolerate a connector that returns None when it has no pattern data.
        pattern_data = await self.db.get_agent_pattern_data(agent_id, start_date, end_date)
        identified_patterns = (pattern_data or {}).get("identified_patterns", 0)

        # Learning efficiency: average improvement per executed test.
        total_tests = await self.db.count_agent_tests(agent_id, start_date, end_date)
        avg_improvement = sum(skill_improvements.values()) / max(1, len(skill_improvements))
        learning_efficiency = avg_improvement / max(1, total_tests)

        strengths = []
        improvement_areas = []

        for metric_name, improvement in skill_improvements.items():
            if improvement > 0.1:  # Significant improvement
                strengths.append(metric_name)
            elif improvement < 0.05:  # Little improvement
                improvement_areas.append(metric_name)

        return LearningAssessment(
            agent_id=agent_id,
            start_date=start_date,
            end_date=end_date,
            metrics_trend=dict(metrics_trend),
            skill_improvements=skill_improvements,
            identified_patterns=identified_patterns,
            learning_efficiency=learning_efficiency,
            strengths=strengths,
            improvement_areas=improvement_areas
        )

    async def generate_recommendations(self, assessment: LearningAssessment) -> Dict[str, Any]:
        """Generate improvement recommendations based on a learning assessment.

        Returns:
            A dictionary with the keys ``"priority_skills"``,
            ``"learning_exercises"``, ``"pattern_focus"`` and
            ``"collaboration_suggestions"``; each maps to a (possibly empty)
            list.
        """
        recommendations = {
            "priority_skills": [],
            "learning_exercises": [],
            "pattern_focus": [],
            "collaboration_suggestions": []
        }

        # One entry per improvement area. Note that "current_level" carries
        # the measured improvement for that area, not an absolute level.
        for area in assessment.improvement_areas:
            recommendations["priority_skills"].append({
                "skill": area,
                "current_level": assessment.skill_improvements.get(area, 0),
                "target_improvement": 0.2
            })

        # Exercises only for the first two improvement areas.
        for area in assessment.improvement_areas[:2]:
            exercises = await self._generate_exercises_for_skill(area)
            recommendations["learning_exercises"].extend(exercises)

        # MetricType members compare equal to their string values, so these
        # membership tests work against the plain-string area names.
        if MetricType.PATTERN_RECOGNITION in assessment.improvement_areas:
            recommendations["pattern_focus"] = [
                "Increase focus on identifying command output patterns",
                "Practice with more varied command sequences",
                "Implement pattern matching for interactive commands"
            ]

        if MetricType.COLLABORATION in assessment.improvement_areas:
            recommendations["collaboration_suggestions"] = [
                "Increase interaction frequency with specialists",
                "Implement proactive collaboration mode",
                "Practice information sharing with other programmer agents"
            ]

        return recommendations

    async def _generate_exercises_for_skill(self, skill_area: str) -> List[Dict[str, Any]]:
        """Return the learning exercises defined for a skill area.

        Only pattern recognition and error recovery have exercises; any other
        area yields an empty list.
        """
        exercises = []

        if skill_area == MetricType.PATTERN_RECOGNITION:
            exercises = [
                {
                    "name": "Pattern Identification Exercise",
                    "description": "Analyze 20 command outputs and identify recurring patterns",
                    "difficulty": "Medium",
                    "expected_improvement": 0.15
                },
                {
                    "name": "Pattern Categorization",
                    "description": "Categorize identified patterns into system states",
                    "difficulty": "Hard",
                    "expected_improvement": 0.2
                }
            ]
        elif skill_area == MetricType.ERROR_RECOVERY:
            exercises = [
                {
                    "name": "Error Recovery Simulation",
                    "description": "Practice recovering from 10 common error scenarios",
                    "difficulty": "Medium",
                    "expected_improvement": 0.18
                }
            ]
        # Add more skill-specific exercises

        return exercises


# ============================================================================
# Skill Development Tracking
# ============================================================================

class SkillTracker:
    """Tracks the development of specific skills over time."""

    def __init__(self, db_connector=None):
        """Create a tracker.

        Args:
            db_connector: Optional object providing awaitable
                ``store_skill_definition``, ``store_skill_assessment`` and
                ``get_skill_history`` methods.
        """
        self.db = db_connector
        self.skill_definitions = {}

    async def define_skill(self, skill_id: str, name: str, metrics: List[str],
                           thresholds: Dict[str, float]) -> None:
        """Define a skill to track based on metrics and thresholds.

        Args:
            skill_id: Key the definition is stored under; an existing
                definition with the same id is replaced.
            name: Human-readable skill name.
            metrics: Names of the metrics that are read from test results.
            thresholds: Metric name -> value at which that metric counts as
                fully achieved.
        """
        self.skill_definitions[skill_id] = {
            "name": name,
            "metrics": metrics,
            "thresholds": thresholds
        }

        if self.db:
            await self.db.store_skill_definition(skill_id, name, metrics, thresholds)

    async def assess_skill_level(self, agent_id: str, skill_id: str,
                                 recent_results: "List[TestResult]") -> float:
        """Assess the current skill level of an agent.

        For every threshold metric that is tracked by the skill and present
        in the results, the average value is divided by its threshold and
        clamped to the range 0.0-1.0; the skill level is the mean of those
        scores. A threshold of zero or less counts as fully met, so it can
        neither raise ``ZeroDivisionError`` nor push the level out of range.

        Args:
            agent_id: Agent being assessed (used when storing the result).
            skill_id: Id of a skill registered via ``define_skill``.
            recent_results: Results whose ``metrics`` mapping is evaluated.

        Returns:
            Skill level between 0.0 and 1.0; 0.0 when no relevant metric
            data is available.

        Raises:
            ValueError: If ``skill_id`` is not defined.
        """
        if skill_id not in self.skill_definitions:
            raise ValueError(f"Unknown skill ID: {skill_id}")

        skill_def = self.skill_definitions[skill_id]

        # Collect the values of the tracked metrics across all results.
        relevant_metrics = defaultdict(list)
        for result in recent_results:
            for metric in skill_def["metrics"]:
                if metric in result.metrics:
                    relevant_metrics[metric].append(result.metrics[metric])

        # Lists are only created on append, so they are never empty here.
        avg_metrics = {
            metric: sum(values) / len(values)
            for metric, values in relevant_metrics.items()
        }

        # Every available metric contributes with equal weight.
        scores = []
        for metric, threshold in skill_def["thresholds"].items():
            if metric not in avg_metrics:
                continue  # no data: neither rewards nor penalizes
            if threshold <= 0:
                scores.append(1.0)
            else:
                scores.append(max(0.0, min(1.0, avg_metrics[metric] / threshold)))

        skill_level = sum(scores) / len(scores) if scores else 0.0

        # Store skill assessment
        if self.db:
            await self.db.store_skill_assessment(
                agent_id, skill_id, skill_level, datetime.now()
            )

        return skill_level

    async def get_skill_history(self, agent_id: str, skill_id: str) -> List[Tuple[datetime, float]]:
        """Get historical skill levels for an agent from the database.

        Raises:
            ValueError: If no database connector is configured.
        """
        if not self.db:
            raise ValueError("Database connection required for skill history")

        return await self.db.get_skill_history(agent_id, skill_id)


# ============================================================================
# Progression System
# ============================================================================

class ProgressionSystem:
    """Moves agents through progression paths made of increasingly hard levels."""

    def __init__(self, test_runner: TestRunner, evaluation: EvaluationSystem,
                 skill_tracker: SkillTracker):
        """Keep references to the collaborating components and start with no paths."""
        self.test_runner = test_runner
        self.evaluation = evaluation
        self.skill_tracker = skill_tracker
        self.progression_paths = {}
        # agent_id -> index of the level the agent is currently on.
        # NOTE(review): the key does not include the path id, so a level looks
        # shared by all paths an agent takes part in - confirm this is intended.
        self.agent_levels = defaultdict(int)

    def _lookup_path(self, path_id: str) -> Dict[str, Any]:
        """Return the stored path definition or raise ``ValueError`` if unknown."""
        if path_id not in self.progression_paths:
            raise ValueError(f"Unknown progression path: {path_id}")
        return self.progression_paths[path_id]

    async def define_progression_path(self, path_id: str, name: str,
                                      levels: List[Dict[str, Any]]) -> None:
        """Store a progression path (a name plus its ordered level definitions)."""
        definition = {
            "name": name,
            "levels": levels
        }
        self.progression_paths[path_id] = definition

    async def get_next_scenario(self, agent_id: str, path_id: str) -> Optional[str]:
        """Return the scenario id of the agent's current level on a path.

        Returns ``None`` once the agent has passed the last level.

        Raises:
            ValueError: If ``path_id`` is not a defined path.
        """
        levels = self._lookup_path(path_id)["levels"]
        # .get() rather than indexing: reading must not create an entry.
        level_index = self.agent_levels.get(agent_id, 0)

        if level_index < len(levels):
            return levels[level_index]["scenario_id"]
        return None  # Agent has completed all levels

    async def evaluate_advancement(self, agent_id: str, path_id: str,
                                   test_results: List[TestResult]) -> bool:
        """Decide whether an agent moves on to the next level, and apply it.

        The agent advances when the share of successful results reaches the
        level's ``required_success_rate`` (0.8 when unspecified) and every
        skill listed in the level's ``required_skills`` is assessed at or
        above its required value.

        Returns:
            ``True`` if the agent was advanced, otherwise ``False`` (also when
            the agent is already past the last level).

        Raises:
            ValueError: If ``path_id`` is not a defined path.
        """
        levels = self._lookup_path(path_id)["levels"]
        level_index = self.agent_levels.get(agent_id, 0)

        if level_index >= len(levels):
            return False  # Already at max level

        requirements = levels[level_index]

        # Share of successful tests; an empty result list yields 0.
        passed = [outcome for outcome in test_results if outcome.success]
        success_rate = len(passed) / max(1, len(test_results))

        # Skills are assessed regardless of the success rate, stopping at the
        # first one that falls short.
        for skill_id, minimum in requirements.get("required_skills", {}).items():
            achieved = await self.skill_tracker.assess_skill_level(
                agent_id, skill_id, test_results
            )
            if achieved < minimum:
                skills_met = False
                break
        else:
            skills_met = True

        rate_needed = requirements.get("required_success_rate", 0.8)
        should_advance = success_rate >= rate_needed and skills_met

        if should_advance:
            self.agent_levels[agent_id] = level_index + 1

        return should_advance


# ============================================================================
# Example Usage
# ============================================================================

async def example_usage():
    """Show how the testing and evaluation components are wired together.

    Builds the four components without a database connector, registers one
    scenario, defines one skill and one two-level progression path. No agent
    is executed here.
    """
    # Components
    runner = TestRunner()
    evaluator = EvaluationSystem()
    tracker = SkillTracker()
    progression = ProgressionSystem(runner, evaluator, tracker)

    # Scenario with two basic shell commands
    ls_case = {
        "id": "ls_test",
        "command": "ls -la",
        "expected_success": True,
        "expected_time": 0.5
    }
    grep_case = {
        "id": "grep_test",
        "command": "grep 'error' /var/log/syslog",
        "expected_success": True,
        "expected_time": 1.0
    }
    basic_commands = TestScenario(
        scenario_id="basic_linux_commands",
        name="Basic Linux Commands",
        description="Tests basic Linux command execution and output processing",
        difficulty_level=1,
        required_skills={"command_execution", "output_processing"},
        test_cases=[ls_case, grep_case],
        evaluation_criteria={
            MetricType.ACCURACY: 0.8,
            MetricType.EFFICIENCY: 0.7
        }
    )
    await runner.register_scenario(basic_commands)

    # Skill built from the accuracy and efficiency metrics
    skill_metrics = [MetricType.ACCURACY, MetricType.EFFICIENCY]
    skill_thresholds = {MetricType.ACCURACY: 0.8, MetricType.EFFICIENCY: 0.7}
    await tracker.define_skill(
        "command_execution",
        "Linux Command Execution",
        skill_metrics,
        skill_thresholds
    )

    # Two-level progression path
    level_one = {
        "level": 1,
        "scenario_id": "basic_linux_commands",
        "required_success_rate": 0.8,
        "required_skills": {"command_execution": 0.7}
    }
    level_two = {
        "level": 2,
        "scenario_id": "intermediate_linux_commands",
        "required_success_rate": 0.75,
        "required_skills": {"command_execution": 0.8, "output_processing": 0.7}
    }
    await progression.define_progression_path(
        "linux_mastery",
        "Linux Command Mastery",
        [level_one, level_two]
    )

    # The rest would be implemented with actual agent instances


# Run the example only when this file is executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(example_usage())