In [None]:
# PRE-STEP: Install Required Dependencies
%pip install langchain
%pip install langgraph
%pip install langchain-openai
%pip install chromadb
%pip install python-dotenv
%pip install pydantic

In [None]:
# Cell 1: Setup and load the trained agent state from Lab 18-1
import os
import json
import pickle
import time
from datetime import datetime
from typing import Dict, List

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI

# Import the new architecture components
from coala_agent import CoALAAgent
from domain_investment.investment_advisor_agent import InvestmentAdvisorAgent
from domain_investment.investment_advisor_prompts import (
    PROMPT_MEMORY_OPTIMIZATION, GRADIENT_CRITIQUE, GRADIENT_PROPOSAL,
    METAPROMPT_SURFACE, METAPROMPT_DEEP, METAPROMPT_SYNTHESIS
)
from domain_investment.investor_test_scenarios import (
    run_prompt_memory_test,
    run_gradient_test,
    run_metaprompt_test,
    test_agents_with_queries,
    test_response_consistency,
    compare_agent_performance
)
from domain_agent import DomainProcedure

# Load variables from env.txt into the process environment.
load_dotenv(dotenv_path='env.txt')

# Fail early with an actionable message when the key is missing. The previous
# `os.environ[...] = os.getenv(...)` raised an opaque TypeError in that case
# (environment values must be strings) and was a no-op when the key was set.
if not os.getenv('OPENAI_API_KEY'):
    raise EnvironmentError(
        "OPENAI_API_KEY is not set. Add it to env.txt or export it before running this notebook."
    )

In [None]:
# Cell 2: Define function to save checkpoints
#  Create checkpoint of current state
def save_checkpoint(agent: CoALAAgent, checkpoint_name: str):
    """Save a summary of the agent's state for use as a comparison baseline.

    The summary is pickled to ``checkpoints/<checkpoint_name>.pkl`` and also
    returned to the caller.

    Args:
        agent: Agent to summarise. ``procedural_memory`` and ``vector_store``
            are both optional; missing pieces are recorded as empty/zero.
        checkpoint_name: File stem of the pickle written under ``checkpoints/``.

    Returns:
        dict with keys ``timestamp``, ``episodic_count``, ``procedural_stats``,
        ``learned_strategies`` and ``current_performance``.
    """
    # Both memories are looked up defensively; previously only procedural_memory
    # was guarded and a missing vector_store raised AttributeError.
    procedural_memory = getattr(agent, 'procedural_memory', None)
    vector_store = getattr(agent, 'vector_store', None)

    # Get procedural memory stats
    proc_stats = procedural_memory.get_stats() if procedural_memory is not None else {}

    if procedural_memory is not None:
        learned_strategies = list(getattr(procedural_memory, 'global_procedures', {}).keys())
    else:
        learned_strategies = []

    checkpoint = {
        'timestamp': datetime.now().isoformat(),
        'episodic_count': len(vector_store.get()["ids"]) if hasattr(vector_store, 'get') else 0,
        'procedural_stats': proc_stats,
        'learned_strategies': learned_strategies,
        'current_performance': proc_stats.get('avg_success_rate', 0)
    }

    # Create the target directory here so the function does not depend on a
    # separate setup statement having run first.
    os.makedirs("checkpoints", exist_ok=True)
    with open(f"checkpoints/{checkpoint_name}.pkl", "wb") as f:
        pickle.dump(checkpoint, f)

    return checkpoint

# Create checkpoint directory (save_checkpoint writes its pickles here)
os.makedirs("checkpoints", exist_ok=True)

print("🔄 Loading trained agent from Lab 18-1...")

# Create domain agent; its domain_dir locates the memory store below
domain_agent = InvestmentAdvisorAgent()

# IMPORTANT: Load the EXISTING agent from Lab 18-1, not create a new one
existing_memory_dir = os.path.join(domain_agent.domain_dir, "domain_memory_store")

# Stop here when the store is missing: everything after this cell compares
# against the state Lab 18-1 is expected to have written.
if not os.path.exists(existing_memory_dir):
    print("⚠️  No existing memory store found from Lab 18-1!")
    print("   Please run Lab 18-1 first to create the trained agent.")
    raise FileNotFoundError(f"Memory store not found at {existing_memory_dir}")

# Load the existing trained agent (picking up from Lab 18-1)
# NOTE(review): presumably persist_directory makes CoALAAgent reopen the
# on-disk store instead of starting empty - confirm in coala_agent.
baseline_agent = CoALAAgent(
    domain_agent=domain_agent,
    model_name="gpt-4.1-mini",  # Match Lab 18-1
    temperature=0,
    persist_directory=existing_memory_dir  # Use EXISTING memory from Lab 18-1
)

print(f"✅ Loaded existing agent from: {existing_memory_dir}")

# Save baseline checkpoint (also written to checkpoints/baseline_lab18_2.pkl)
baseline_checkpoint = save_checkpoint(baseline_agent, "baseline_lab18_2")

# Summarise the state the optimizers will start from
print("\n📊 Current State (from Lab 18-1):")
print(f"  Episodic memories: {baseline_checkpoint['episodic_count']}")
print(f"  Learned strategies: {len(baseline_checkpoint['learned_strategies'])}")
print(f"  Current performance: {baseline_checkpoint['current_performance']:.1%}")

In [None]:
# Cell 3: Show procedural memory breakdown by scope
# Show breakdown by scope
if hasattr(baseline_agent, 'procedural_memory'):
    stats = baseline_agent.procedural_memory.get_stats()
    print(f"\n  Strategy breakdown by scope:")
    # .get() keeps the cell running when the stats carry no per-scope section
    for scope, count in stats.get('by_scope', {}).items():
        print(f"    {scope.capitalize()}: {count}")

# Load test data for comparison - use conversations NOT seen in Lab 18-1
# Lab 18-1 used conversations 0-100, so lines with index 100-149 are read here.
TEST_START_INDEX = 100
TEST_END_INDEX = 150  # exclusive

data_dir = os.path.join(domain_agent.domain_dir, "investment_advisor_data")
test_conversations = []

if os.path.exists(f"{data_dir}/conversations.jsonl"):
    with open(f"{data_dir}/conversations.jsonl", "r") as f:
        for i, line in enumerate(f):
            if i >= TEST_END_INDEX:
                # Past the held-out window: no need to read the rest of the file
                break
            if i >= TEST_START_INDEX:
                test_conversations.append(json.loads(line))
else:
    print("⚠️  No conversation data found. Run Lab 18-1 first to generate data.")

if test_conversations:
    success_count = sum(1 for c in test_conversations if c['feedback']['success'])
    satisfaction_total = sum(c['feedback']['satisfaction_score'] for c in test_conversations)
    print(f"\n📁 Loaded {len(test_conversations)} NEW test conversations (not seen in Lab 18-1)")
    print(f"  Success rate in test set: {success_count / len(test_conversations):.1%}")
    print(f"  Avg satisfaction: {satisfaction_total / len(test_conversations):.1f}/5.0")

# Verify we have the trained model AND data to test it with
if baseline_checkpoint['episodic_count'] == 0:
    print("\n⚠️  WARNING: Agent appears to be untrained (no memories found)")
    print("   Please complete Lab 18-1 first to train the agent.")
elif not test_conversations:
    # Previously this case reported "Ready ... with 0 new conversations"
    print("\n⚠️  WARNING: No test conversations were loaded")
    print(f"   Expected lines {TEST_START_INDEX}-{TEST_END_INDEX - 1} of {data_dir}/conversations.jsonl")
else:
    print(f"\n✅ Ready for optimization testing with {len(test_conversations)} new conversations")
    print("   We'll test 3 LangMem algorithms on this trained agent")

In [None]:
# Cell 4: Implement and test prompt_memory algorithm
class PromptMemoryOptimizer:
    """
    Prompt_memory optimizer: analysis and rule synthesis in one LLM round-trip.

    Trade-offs:
    - A single chain invocation covers both pattern analysis and rule writing
    - Cheapest of the three optimizers in LLM calls
    - Quick to re-run, so suited to fast iteration
    - Aimed at simpler patterns rather than deep analysis
    """

    def __init__(self, llm):
        """Keep the model and prepare the prompt template and JSON parser."""
        self.parser = JsonOutputParser()
        # Template text comes from the investment-specific prompt module
        self.optimization_prompt = PromptTemplate.from_template(
            PROMPT_MEMORY_OPTIMIZATION
        )
        self.llm = llm

    def optimize(self, conversations: List[Dict], current_stats: Dict) -> Dict:
        """Run the single-pass optimization.

        Args:
            conversations: Conversation records; each is read for its first two
                messages and its feedback (success flag, satisfaction score).
            current_stats: Current performance stats, passed to the prompt as JSON.

        Returns:
            dict with ``algorithm``, ``patterns``, ``rules``, ``summary`` and
            ``llm_calls`` (always 1).
        """
        # Only the first 10 conversations are sent, and each response is cut
        # to 200 characters, to limit prompt size.
        sample = [
            {
                "query": conv["messages"][0]["content"],
                "response": conv["messages"][1]["content"][:200],
                "success": conv["feedback"]["success"],
                "satisfaction": conv["feedback"]["satisfaction_score"],
            }
            for conv in conversations[:10]
        ]

        # prompt -> model -> JSON parser, invoked exactly once
        pipeline = self.optimization_prompt | self.llm | self.parser
        payload = {
            "conversations": json.dumps(sample, indent=2),
            "current_performance": json.dumps(current_stats),
        }
        analysis = pipeline.invoke(payload)

        outcome = {"algorithm": "prompt_memory"}
        outcome["patterns"] = analysis.get("patterns_found", [])
        outcome["rules"] = analysis.get("procedural_rules", [])
        outcome["summary"] = analysis.get("optimization_summary", "")
        outcome["llm_calls"] = 1  # Key efficiency metric
        return outcome

# TESTING:
# Create new domain agent instance for testing
test_domain_agent = InvestmentAdvisorAgent()

# Initialize optimizer
prompt_optimizer = PromptMemoryOptimizer(
    llm=ChatOpenAI(model_name="gpt-4o-mini", temperature=0)
)

# Run the complete test
test_results = run_prompt_memory_test(
    baseline_agent,
    test_domain_agent,
    prompt_optimizer,
    test_conversations
)

# Later cells compare `prompt_memory_agent` against the baseline. Bind it here,
# because `test_results` is overwritten by the next optimizer's test cell.
prompt_memory_agent = test_results['agent'] if test_results else None

if test_results:
    prompt_result = test_results['prompt_result']

    # Display results
    print(f"\n📊 Prompt_memory Results:")
    print(f"  Time taken: {test_results['optimization_time']:.2f} seconds")
    print(f"  LLM calls: {prompt_result['llm_calls']}")
    print(f"  Patterns found: {len(prompt_result['patterns'])}")
    print(f"  Rules generated: {len(prompt_result['rules'])}")

    if prompt_result['rules']:
        sample_rule = prompt_result['rules'][0]
        # .get() on both fields: the rule comes from LLM output and may omit either
        print(f"\n  Sample rule: {sample_rule.get('rule', 'N/A')}")
        print(f"  Priority: {sample_rule.get('priority', 'N/A')}")

    print(f"\n✅ Applied {test_results['applied_rules']} rules to agent")

    # Show efficiency results
    print("\n⏱️ Testing efficiency (3 runs)...")
    eff = test_results['efficiency']
    print(f"  Average optimization time: {eff['avg_time']:.2f} seconds")
    print(f"  Efficiency: {eff['efficiency']:.1f} optimizations/second")

    # Save checkpoint
    prompt_checkpoint = save_checkpoint(prompt_memory_agent, "prompt_memory_test")
    print(f"\n💾 Checkpoint saved: {len(prompt_checkpoint['learned_strategies'])} strategies")
else:
    print("⚠️  Prompt_memory test returned no results; prompt_memory_agent is unavailable.")

In [None]:
# Cell 5: Implement and test gradient algorithm
class GradientOptimizer:
    """
    Gradient optimizer: critique first, then propose fixes for that critique.

    Trade-offs:
    - Call 1 critiques the sampled conversations
    - Call 2 turns that critique into improvement proposals
    - Intended to yield focused, actionable rules
    - Suited to pinning down specific failure modes
    """

    def __init__(self, llm):
        """Keep the model and prepare both prompt templates and the JSON parser."""
        self.parser = JsonOutputParser()

        # Investment-specific templates, one per phase
        self.proposal_prompt = PromptTemplate.from_template(GRADIENT_PROPOSAL)
        self.critique_prompt = PromptTemplate.from_template(GRADIENT_CRITIQUE)

        self.llm = llm

    def optimize(self, conversations: List[Dict], current_stats: Dict) -> Dict:
        """Run the two-phase optimization.

        Args:
            conversations: Conversation records; each is read for its first two
                messages, its feedback and its ``behavioral_signals`` mapping.
            current_stats: Accepted for interface parity with the other
                optimizers; not used by this implementation.

        Returns:
            dict with ``algorithm``, ``critique`` (phase 1 output as returned),
            ``improvements``, ``strategy`` and ``llm_calls`` (always 2).
        """
        # First 10 conversations only; responses cut to 200 characters.
        # "signals" keeps just the names of the truthy behavioral signals.
        sample = [
            {
                "query": conv["messages"][0]["content"],
                "response": conv["messages"][1]["content"][:200],
                "success": conv["feedback"]["success"],
                "satisfaction": conv["feedback"]["satisfaction_score"],
                "signals": [flag for flag, active in conv["behavioral_signals"].items() if active],
            }
            for conv in conversations[:10]
        ]

        # Phase 1: Critique
        critique = (self.critique_prompt | self.llm | self.parser).invoke(
            {"conversations": json.dumps(sample, indent=2)}
        )

        # Phase 2: Proposal, fed with the serialized critique
        proposal = (self.proposal_prompt | self.llm | self.parser).invoke(
            {"critique": json.dumps(critique, indent=2)}
        )

        outcome = {"algorithm": "gradient", "critique": critique}
        outcome["improvements"] = proposal.get("improvements", [])
        outcome["strategy"] = proposal.get("optimization_strategy", "")
        outcome["llm_calls"] = 2  # Two-phase approach
        return outcome

# TESTING:
# Create new domain agent instance for testing
gradient_domain_agent = InvestmentAdvisorAgent()

# Initialize gradient optimizer
gradient_optimizer = GradientOptimizer(
    llm=ChatOpenAI(model_name="gpt-4o-mini", temperature=0)
)

# Run the complete test
test_results = run_gradient_test(
    baseline_agent,
    gradient_domain_agent,
    gradient_optimizer,
    test_conversations
)

# Later cells compare `gradient_agent` against the baseline. Bind it here,
# because `test_results` is overwritten by the next optimizer's test cell.
gradient_agent = test_results['agent'] if test_results else None

if test_results:
    result = test_results['gradient_result']

    # Display optimization results
    print(f"\n📊 Gradient Results:")
    print(f"  Time taken: {test_results['optimization_time']:.2f} seconds")
    print(f"  LLM calls: {result['llm_calls']}")
    print(f"  Issues identified: {len(result['critique'].get('failures', []))}")
    print(f"  Successes noted: {len(result['critique'].get('successes', []))}")
    print(f"  Critical gaps: {len(result['critique'].get('critical_gaps', []))}")
    print(f"  Improvements generated: {len(result['improvements'])}")

    if result['improvements']:
        improvement = result['improvements'][0]
        print(f"\n  Sample improvement:")
        print(f"    Target: {improvement.get('target_issue', 'N/A')}")
        print(f"    Rule: {improvement.get('rule', 'N/A')}")
        print(f"    Impact: {improvement.get('expected_impact', 'N/A')}")

    print(f"\n✅ Applied {test_results['applied_improvements']} improvements to agent")

    # Display issue detection results
    print("\n🔍 Testing issue detection capability...")
    issue_detection = test_results['issue_detection']
    if issue_detection.get('critical_gaps'):
        print(f"  Identified gaps: {len(issue_detection['critical_gaps'])} issues")
        # Only the first gap is shown to keep the output short
        for gap in issue_detection['critical_gaps'][:1]:
            print(f"    • {gap.get('gap', 'Issue detected')}")

    # Save checkpoint
    gradient_checkpoint = save_checkpoint(gradient_agent, "gradient_test")
    print(f"\n💾 Checkpoint saved: {len(gradient_checkpoint['learned_strategies'])} strategies")
else:
    print("⚠️  Gradient test returned no results; gradient_agent is unavailable.")

In [None]:
# Cell 6: Implement and test metaprompt algorithm
class MetapromptOptimizer:
    """
    Metaprompt optimizer: three chained stages (surface, deep, synthesis).

    Trade-offs:
    - Each stage looks at the conversations from a different angle
    - The deep stage receives the surface findings so it can question them
    - Aims at non-obvious patterns and correlations
    - Suited to complex domains needing nuanced understanding
    """

    def __init__(self, llm):
        """Keep the model and prepare the three prompt templates and the JSON parser."""
        self.parser = JsonOutputParser()

        # Investment-specific templates, one per stage
        self.synthesis_prompt = PromptTemplate.from_template(METAPROMPT_SYNTHESIS)
        self.deep_prompt = PromptTemplate.from_template(METAPROMPT_DEEP)
        self.surface_prompt = PromptTemplate.from_template(METAPROMPT_SURFACE)

        self.llm = llm

    def optimize(self, conversations: List[Dict], current_stats: Dict) -> Dict:
        """Run the three-stage optimization.

        Args:
            conversations: Conversation records; each is read for its first two
                messages, feedback, ``behavioral_signals`` and ``metadata``.
            current_stats: Accepted for interface parity with the other
                optimizers; not used by this implementation.

        Returns:
            dict with ``algorithm``, ``surface_patterns``, ``hidden_patterns``,
            ``causal_relationships``, ``comprehensive_rules``, ``meta_insights``
            and ``llm_calls`` (always 3).
        """
        # First 10 conversations only; responses cut to 200 characters.
        # Signals and metadata are forwarded unchanged for richer context.
        sample = [
            {
                "query": conv["messages"][0]["content"],
                "response": conv["messages"][1]["content"][:200],
                "success": conv["feedback"]["success"],
                "satisfaction": conv["feedback"]["satisfaction_score"],
                "behavioral_signals": conv["behavioral_signals"],
                "metadata": conv["metadata"],
            }
            for conv in conversations[:10]
        ]
        conversations_json = json.dumps(sample, indent=2)

        # Stage 1: Surface analysis
        surface_analysis = (self.surface_prompt | self.llm | self.parser).invoke(
            {"conversations": conversations_json}
        )
        surface_json = json.dumps(surface_analysis, indent=2)

        # Stage 2: Deep analysis, given both the surface findings and the raw sample
        deep_analysis = (self.deep_prompt | self.llm | self.parser).invoke(
            {"surface_analysis": surface_json, "conversations": conversations_json}
        )

        # Stage 3: Synthesis of the two earlier analyses
        synthesis = (self.synthesis_prompt | self.llm | self.parser).invoke(
            {
                "surface_analysis": surface_json,
                "deep_analysis": json.dumps(deep_analysis, indent=2),
            }
        )

        outcome = {"algorithm": "metaprompt"}
        outcome["surface_patterns"] = surface_analysis.get("surface_patterns", [])
        outcome["hidden_patterns"] = deep_analysis.get("hidden_patterns", [])
        outcome["causal_relationships"] = deep_analysis.get("causal_relationships", [])
        outcome["comprehensive_rules"] = synthesis.get("comprehensive_rules", [])
        outcome["meta_insights"] = synthesis.get("meta_insights", "")
        outcome["llm_calls"] = 3  # Three-stage approach
        return outcome
    
# TESTING:
# (run_metaprompt_test is imported with the other test helpers in the setup cell)

# Create new domain agent instance for testing
metaprompt_domain_agent = InvestmentAdvisorAgent()

# Initialize metaprompt optimizer
metaprompt_optimizer = MetapromptOptimizer(
    llm=ChatOpenAI(model_name="gpt-4o-mini", temperature=0)
)

# Run the complete test
test_results = run_metaprompt_test(
    baseline_agent,
    metaprompt_domain_agent,
    metaprompt_optimizer,
    test_conversations
)

# Later cells compare `metaprompt_agent` against the baseline; bind it here
# so those cells do not depend on a name that was never assigned.
metaprompt_agent = test_results['agent'] if test_results else None

if test_results:
    result = test_results['metaprompt_result']

    # Display optimization results
    print(f"\n📊 Metaprompt Results:")
    print(f"  Time taken: {test_results['optimization_time']:.2f} seconds")
    print(f"  LLM calls: {result['llm_calls']}")
    print(f"  Surface patterns: {len(result['surface_patterns'])}")
    print(f"  Hidden patterns: {len(result['hidden_patterns'])}")
    print(f"  Causal relationships: {len(result['causal_relationships'])}")
    print(f"  Comprehensive rules: {len(result['comprehensive_rules'])}")

    if result['hidden_patterns']:
        hidden = result['hidden_patterns'][0]
        print(f"\n  Hidden pattern discovered:")
        print(f"    Pattern: {hidden.get('pattern', 'N/A')}")
        print(f"    Confidence: {hidden.get('confidence', 'N/A')}")

    if result['meta_insights']:
        # Truncated to 100 characters to keep the summary on one line
        print(f"\n  Meta insight: {result['meta_insights'][:100]}...")

    print(f"\n✅ Applied {test_results['applied_rules']} comprehensive rules to agent")

    # Display reflection test results
    print("\n🔍 Testing reflection capability...")
    reflection = test_results['reflection_test']

    if reflection['causal_relationships']:
        print(f"  Discovered causal relationship:")
        causal = reflection['causal_relationships'][0]
        print(f"    Cause: {causal.get('cause', 'N/A')}")
        print(f"    Effect: {causal.get('effect', 'N/A')}")
        print(f"    Mechanism: {causal.get('mechanism', 'N/A')}")

    # Save checkpoint
    metaprompt_checkpoint = save_checkpoint(metaprompt_agent, "metaprompt_test")
    print(f"\n💾 Checkpoint saved: {len(metaprompt_checkpoint['learned_strategies'])} strategies")
else:
    print("⚠️  Metaprompt test returned no results; metaprompt_agent is unavailable.")

In [None]:
# Cell 7: Run comprehensive comparison of all three algorithms
class AlgorithmComparison:
    """Compare optimization algorithms across timing, quality and efficiency.

    After ``compare_algorithms`` runs, ``self.results`` maps each optimizer
    name to the raw result dict its ``optimize`` call returned.
    """

    def __init__(self):
        self.results = {}

    @staticmethod
    def _count_outputs(algorithm: str, result: Dict) -> tuple:
        """Return ``(rule_count, pattern_count)`` using the keys each algorithm emits."""
        if algorithm == "prompt_memory":
            return len(result.get("rules", [])), len(result.get("patterns", []))
        if algorithm == "gradient":
            return (
                len(result.get("improvements", [])),
                len(result.get("critique", {}).get("successes", [])),
            )
        # metaprompt (also the fallback for unrecognised names, as before)
        return (
            len(result.get("comprehensive_rules", [])),
            len(result.get("hidden_patterns", [])) + len(result.get("surface_patterns", [])),
        )

    def compare_algorithms(self,
                          test_convs: List[Dict],
                          optimizers: Dict,
                          current_stats: Dict = None,
                          cost_per_call: float = 0.002) -> Dict:
        """Run all optimizers and compare results.

        Args:
            test_convs: Conversations handed to every optimizer.
            optimizers: Mapping of display name -> object exposing
                ``optimize(conversations, current_stats)`` that returns a dict
                containing at least ``llm_calls``.
            current_stats: Stats passed to each optimizer. Defaults to
                ``{"avg_success_rate": 0.7}``, the value used previously.
            cost_per_call: Rough cost estimate per LLM call.

        Returns:
            dict with ``timing``, ``quality`` and ``efficiency`` sections, each
            keyed by optimizer name.
        """
        stats = {"avg_success_rate": 0.7} if current_stats is None else current_stats

        comparison = {
            "timing": {},
            "quality": {},
            "efficiency": {}
        }

        for name, optimizer in optimizers.items():
            print(f"\n🔄 Testing {name}...")

            # perf_counter is monotonic; datetime.now() can jump when the
            # system clock is adjusted, which corrupts short durations.
            started = time.perf_counter()
            result = optimizer.optimize(test_convs, dict(stats))
            elapsed = time.perf_counter() - started

            llm_calls = result["llm_calls"]

            # Timing metrics (guard the per-call average against zero calls)
            comparison["timing"][name] = {
                "total_seconds": elapsed,
                "llm_calls": llm_calls,
                "seconds_per_call": elapsed / llm_calls if llm_calls else 0
            }

            # Prefer the algorithm the result reports, so metrics stay correct
            # when the optimizers dict uses custom display names.
            algorithm = result.get("algorithm", name)
            rule_count, pattern_count = self._count_outputs(algorithm, result)

            # Quality metrics
            comparison["quality"][name] = {
                "rules_generated": rule_count,
                "patterns_found": pattern_count,
                "has_examples": algorithm == "metaprompt",
                "has_causal_analysis": algorithm in ["gradient", "metaprompt"]
            }

            # Efficiency score
            comparison["efficiency"][name] = {
                "patterns_per_second": pattern_count / elapsed if elapsed > 0 else 0,
                "rules_per_second": rule_count / elapsed if elapsed > 0 else 0,
                "cost_estimate": llm_calls * cost_per_call  # Rough cost estimate
            }

            self.results[name] = result

        return comparison

# TESTING:

# Run every optimizer on the held-out conversations and collect the metrics.
# This cell previously repeated the agent tests from the next cell and never
# produced `comparison_results` / `timing_df` / `quality_df` / `efficiency_df`,
# which the decision-framework cell reads.
if test_conversations:
    comparator = AlgorithmComparison()
    comparison_results = comparator.compare_algorithms(
        test_conversations,
        {
            "prompt_memory": prompt_optimizer,
            "gradient": gradient_optimizer,
            "metaprompt": metaprompt_optimizer,
        },
    )

    # One row per algorithm, one column per metric
    timing_df = pd.DataFrame.from_dict(comparison_results["timing"], orient="index")
    quality_df = pd.DataFrame.from_dict(comparison_results["quality"], orient="index")
    efficiency_df = pd.DataFrame.from_dict(comparison_results["efficiency"], orient="index")

    print("\n" + "="*60)
    print("📊 ALGORITHM COMPARISON")
    print("="*60)

    print("\n⏱️ Timing:")
    print(timing_df.round(2).to_string())

    print("\n⭐ Quality:")
    print(quality_df.to_string())

    print("\n💰 Efficiency:")
    print(efficiency_df.round(3).to_string())
else:
    print("⚠️ No test conversations loaded; skipping algorithm comparison.")

In [None]:
# Cell 8: Test each optimized agent with real scenarios
def apply_optimization_to_agent(agent: CoALAAgent, 
                               optimization_result: Dict) -> None:
    """Write an optimizer's output into the agent's global procedures.

    Each rule/improvement becomes a ``DomainProcedure`` stored in
    ``agent.procedural_memory.global_procedures``. The dict key is the first
    30 characters of the rule text (``target_issue`` for gradient), or a
    counter-based fallback name when that text is missing or empty. Entries
    that share a key replace one another.

    Args:
        agent: Agent whose procedural memory is updated in place.
        optimization_result: Result dict from one of the optimizers; its
            ``algorithm`` value picks the branch. Any value other than
            ``prompt_memory`` or ``gradient`` is handled as metaprompt.
    """

    def _register(label, fallback_prefix, procedure):
        # Evaluated per entry, so the fallback counter reflects earlier inserts
        procedures = agent.procedural_memory.global_procedures
        key = label[:30] if label else f"{fallback_prefix}_{len(procedures)}"
        procedures[key] = procedure

    algorithm = optimization_result["algorithm"]

    if algorithm == "prompt_memory":
        # Apply prompt_memory rules: one single-step procedure per rule
        for entry in optimization_result.get("rules", []):
            built = DomainProcedure(
                strategy_pattern=entry.get("rule", "Rule"),
                steps=[entry.get("rule", "Apply rule")],
                success_rate=0.8,
                segments=["general"],
                scope="global"
            )
            _register(entry.get("rule"), "rule", built)
        return

    if algorithm == "gradient":
        # Apply gradient improvements: rule plus implementation note, keyed by target issue
        for entry in optimization_result.get("improvements", []):
            built = DomainProcedure(
                strategy_pattern=entry.get("rule", "Improvement"),
                steps=[entry.get("rule", "Apply improvement"),
                       entry.get("implementation", "Apply when appropriate")],
                success_rate=0.85,
                segments=["general"],
                scope="global"
            )
            _register(entry.get("target_issue"), "improvement", built)
        return

    # metaprompt: comprehensive rules carry an example and their own confidence
    for entry in optimization_result.get("comprehensive_rules", []):
        built = DomainProcedure(
            strategy_pattern=entry.get("rule", "Comprehensive rule"),
            steps=[entry.get("rule", "Apply rule"),
                   f"Example: {entry.get('example_application', 'N/A')}"],
            success_rate=entry.get("confidence", 0.9),
            segments=["general"],
            scope="global"
        )
        _register(entry.get("rule"), "metarule", built)

# TESTING:

# Verify procedural memory state
print("📊 Current procedural memory state:")
print(f"  Baseline procedures: {len(baseline_agent.procedural_memory.global_procedures)}")
print(f"  Prompt_memory procedures: {len(prompt_memory_agent.procedural_memory.global_procedures)}")
print(f"  Gradient procedures: {len(gradient_agent.procedural_memory.global_procedures)}")
print(f"  Metaprompt procedures: {len(metaprompt_agent.procedural_memory.global_procedures)}")

print("\n" + "="*60)
print("🧪 TESTING OPTIMIZED AGENTS")
print("="*60)

# Test all agents
agents_to_test = [
    ("baseline", baseline_agent),
    ("prompt_memory", prompt_memory_agent),
    ("gradient", gradient_agent),
    ("metaprompt", metaprompt_agent)
]

# Run tests
results_by_algo = test_agents_with_queries(agents_to_test)

# Generate comparison statistics
print("\n" + "="*60)
print("📊 RESPONSE QUALITY COMPARISON")
print("="*60)

stats = compare_agent_performance(results_by_algo)

for algo_name, algo_stats in stats.items():
    improvement = " (+improved)" if algo_stats.get("empathy_improvement", False) else ""

    print(f"\n{algo_name.upper()}:")
    print(f"  Empathy shown: {algo_stats['empathy_count']}/{algo_stats['total_queries']} responses{improvement}")
    print(f"  Specific data: {algo_stats['specifics_count']}/{algo_stats['total_queries']} responses")
    print(f"  Educational: {algo_stats['education_count']}/{algo_stats['total_queries']} responses")
    print(f"  Avg length: {algo_stats['avg_length']:.0f} characters")

# Test consistency
print("\n📈 CONSISTENCY TEST:")
consistency_results = test_response_consistency(agents_to_test)

print("Response consistency (3 runs):")
# The loop variable is deliberately not called `stats`: reusing that name
# replaced the comparison statistics above with the last consistency entry.
for algo, consistency_stats in consistency_results.items():
    print(f"  {algo}: mean={consistency_stats['mean_length']:.0f} chars, std={consistency_stats['std_length']:.1f}")

In [None]:
# BONUS Cell 9: Create decision framework for algorithm selection
class AlgorithmSelector:
    """Decision framework for selecting optimal LangMem algorithm.

    Scores the three algorithms (prompt_memory, gradient, metaprompt) against
    a requirements dict using fixed point awards and reports the top scorer.
    """

    def __init__(self, comparison_results: Dict):
        # Stored for reference; the scoring below does not read it.
        self.comparison = comparison_results
        self.recommendations = []

    def recommend_algorithm(self,
                          requirements: Dict) -> Dict:
        """Recommend best algorithm based on requirements.

        Args:
            requirements: Optional keys ``max_latency_seconds`` (number),
                ``minimize_cost``, ``needs_causal_analysis``,
                ``needs_examples``, ``real_time_learning`` (bools) and
                ``domain_complexity`` ("simple", "complex"; anything else is
                treated as medium).

        Returns:
            dict with ``recommended`` (top-scoring algorithm; ties go to the
            earlier of prompt_memory, gradient, metaprompt), ``scores``,
            ``reasoning`` (reasons behind the recommended algorithm's points)
            and ``alternative`` (runner-up).
        """
        scores = {
            "prompt_memory": 0,
            "gradient": 0,
            "metaprompt": 0
        }
        reasoning = {algorithm: [] for algorithm in scores}

        def award(algorithm: str, points: int, reason: str) -> None:
            # Every point award records its reason, so the reported reasoning
            # always accounts for the full score.
            scores[algorithm] += points
            reasoning[algorithm].append(reason)

        # Speed requirement
        max_latency = requirements.get("max_latency_seconds", float('inf'))
        if max_latency < 3:
            award("prompt_memory", 3, "Meets strict latency requirement")
        elif max_latency < 5:
            award("prompt_memory", 2, "Comfortably within latency budget")
            award("gradient", 2, "Acceptable latency")

        # Cost sensitivity
        if requirements.get("minimize_cost", False):
            award("prompt_memory", 3, "Lowest cost per optimization")

        # Quality requirements
        if requirements.get("needs_causal_analysis", False):
            award("gradient", 2, "Critique phase pinpoints specific failure modes")
            award("metaprompt", 3, "Provides deep causal analysis")

        if requirements.get("needs_examples", False):
            award("metaprompt", 3, "Generates detailed examples")

        # Domain complexity
        complexity = requirements.get("domain_complexity", "medium")
        if complexity == "simple":
            award("prompt_memory", 2, "Sufficient for simple domains")
        elif complexity == "complex":
            award("metaprompt", 3, "Best for complex pattern discovery")
        else:  # medium
            award("gradient", 2, "Balanced for medium complexity")

        # Real-time learning
        if requirements.get("real_time_learning", False):
            award("prompt_memory", 3, "Enables real-time adaptation")

        # Stable sort keeps dict order among ties, so ranked[0] matches
        # max(scores, key=scores.get).
        ranked = sorted(scores, key=scores.get, reverse=True)
        best_algo = ranked[0]

        return {
            "recommended": best_algo,
            "scores": scores,
            "reasoning": reasoning[best_algo],
            "alternative": ranked[1]
        }

# Only run if we have comparison results
if 'comparison_results' in locals():
    # Initialize selector
    selector = AlgorithmSelector(comparison_results)

    # Define different use cases
    use_cases = [
        {
            "name": "High-Frequency Trading Advisor",
            "requirements": {
                "max_latency_seconds": 2,
                "minimize_cost": True,
                "real_time_learning": True,
                "domain_complexity": "simple"
            }
        },
        {
            "name": "Retirement Planning Specialist",
            "requirements": {
                "max_latency_seconds": 10,
                "needs_causal_analysis": True,
                "needs_examples": True,
                "domain_complexity": "complex"
            }
        },
        {
            "name": "General Investment Assistant",
            "requirements": {
                "max_latency_seconds": 5,
                "minimize_cost": False,
                "needs_causal_analysis": True,
                "domain_complexity": "medium"
            }
        }
    ]

    print("\n" + "="*60)
    print("🎯 ALGORITHM RECOMMENDATIONS BY USE CASE")
    print("="*60)

    # Generate recommendations for each use case
    recommendations_summary = []

    # Width of the score bar; scores above it are drawn as a full bar
    BAR_WIDTH = 10

    for use_case in use_cases:
        recommendation = selector.recommend_algorithm(use_case["requirements"])

        print(f"\n📋 {use_case['name']}")
        print("-" * 40)
        print(f"  Recommended: {recommendation['recommended'].upper()}")
        print(f"  Alternative: {recommendation['alternative']}")
        print(f"  Reasoning:")
        for reason in recommendation['reasoning']:
            print(f"    • {reason}")

        print(f"\n  Algorithm scores:")
        for algo, score in sorted(recommendation['scores'].items(), key=lambda x: x[1], reverse=True):
            # Clamp the filled part: a score above BAR_WIDTH (the first use
            # case reaches 11) used to draw an over-long, misaligned bar.
            filled = min(score, BAR_WIDTH)
            bar = "█" * filled + "░" * (BAR_WIDTH - filled)
            print(f"    {algo:15} {bar} ({score})")

        recommendations_summary.append({
            "use_case": use_case["name"],
            "recommended": recommendation['recommended'],
            "score": recommendation['scores'][recommendation['recommended']]
        })

    # Create final summary matrix
    print("\n" + "="*60)
    print("📊 FINAL RECOMMENDATION MATRIX")
    print("="*60)

    matrix_df = pd.DataFrame([
        {
            "Algorithm": "prompt_memory",
            "Best For": "Real-time, High-frequency, Cost-sensitive",
            "Speed": "⚡ Fast",
            "Quality": "★★★☆☆",
            "Cost": "$",
            "When to Use": "Production systems with <3s latency requirements"
        },
        {
            "Algorithm": "gradient",
            "Best For": "Balanced optimization, Targeted improvements",
            "Speed": "⚡⚡ Medium",
            "Quality": "★★★★☆",
            "Cost": "$$",
            "When to Use": "Most general-purpose applications"
        },
        {
            "Algorithm": "metaprompt",
            "Best For": "Complex domains, Deep insights, Research",
            "Speed": "⚡⚡⚡ Slow",
            "Quality": "★★★★★",
            "Cost": "$$$",
            "When to Use": "Complex advisory, Offline optimization, Quality critical"
        }
    ])

    print("\n" + matrix_df.to_string(index=False))

    # Performance-based recommendation
    print("\n" + "="*60)
    print("💡 PERFORMANCE-BASED RECOMMENDATIONS")
    print("="*60)

    # The measured frames are indexed by algorithm name, so idxmin/idxmax
    # return the algorithm directly.
    if 'timing_df' in locals() and 'quality_df' in locals():
        # Find fastest algorithm
        fastest = timing_df['total_seconds'].idxmin()
        print(f"\n  ⚡ Fastest: {fastest}")
        print(f"     Average time: {timing_df.loc[fastest, 'total_seconds']:.2f}s")

        # Find highest quality
        if not quality_df.empty:
            quality_scores = quality_df['rules_generated'] + quality_df['patterns_found']
            best_quality = quality_scores.idxmax()
            print(f"\n  ⭐ Highest Quality: {best_quality}")
            print(f"     Rules + Patterns: {quality_scores[best_quality]:.0f}")

        # Find best value (quality/time ratio)
        if 'efficiency_df' in locals() and not efficiency_df.empty:
            best_efficiency = efficiency_df['patterns_per_second'].idxmax()
            print(f"\n  💰 Best Value: {best_efficiency}")
            print(f"     Patterns/second: {efficiency_df.loc[best_efficiency, 'patterns_per_second']:.2f}")

    print("\n✅ Decision framework complete")
else:
    # The comparison is produced in Cell 7 (Cell 5 is the gradient test)
    print("⚠️ No comparison results available. Run Cell 7 first to generate comparison data.")

In [None]:
# BONUS Cell 10: Implement dynamic algorithm switching
class DynamicOptimizer:
    """
    Dynamically select optimization algorithm based on context.
    Demonstrates production pattern for adaptive learning.

    Selection rules, checked in order by ``select_optimizer``:
      1. ``context["user_waiting"]`` is truthy            -> "prompt_memory"
      2. ``context["optimization_type"] == "scheduled"``  -> "metaprompt"
      3. ``context["failure_rate"]`` above the threshold  -> "gradient"
      4. anything else                                    -> "gradient"

    Attributes:
        optimizers: Mapping of algorithm name to an object exposing
            ``optimize(conversations, stats)``.
        usage_stats: Per-algorithm ``{"calls", "total_time"}`` counters,
            updated by ``optimize_with_context``.
    """

    # Algorithm names that select_optimizer may return; all must be present.
    REQUIRED_ALGORITHMS = ("prompt_memory", "gradient", "metaprompt")
    # Failure rate above which the "targeted improvements" branch is taken.
    FAILURE_RATE_THRESHOLD = 0.3
    # Baseline stats handed to the optimizer when the caller supplies none.
    DEFAULT_STATS = {"avg_success_rate": 0.7}

    def __init__(self, optimizers: Dict = None):
        """Create the selector.

        Args:
            optimizers: Optional mapping with the keys "prompt_memory",
                "gradient" and "metaprompt". When omitted, the notebook-level
                ``prompt_optimizer``, ``gradient_optimizer`` and
                ``metaprompt_optimizer`` variables are used.

        Raises:
            NameError: If ``optimizers`` is omitted and one of the
                notebook-level optimizer variables is not defined.
            ValueError: If a required algorithm name is missing from the
                mapping.
        """
        if optimizers is None:
            optimizers = {
                "prompt_memory": prompt_optimizer,
                "gradient": gradient_optimizer,
                "metaprompt": metaprompt_optimizer
            }

        missing = [name for name in self.REQUIRED_ALGORITHMS if name not in optimizers]
        if missing:
            raise ValueError(f"Missing optimizers for: {', '.join(missing)}")

        # Copy so later changes to the caller's mapping do not affect selection.
        self.optimizers = dict(optimizers)
        self.usage_stats = {algo: {"calls": 0, "total_time": 0} for algo in self.optimizers}

    def select_optimizer(self, context: Dict) -> tuple:
        """Select best optimizer for current context.

        Args:
            context: Dict that may contain "user_waiting",
                "optimization_type" and "failure_rate".

        Returns:
            ``(algorithm_name, optimizer)`` tuple.
        """
        # Decision logic based on context
        if context.get("user_waiting", False):
            # User is waiting for response - use fastest
            return "prompt_memory", self.optimizers["prompt_memory"]

        if context.get("optimization_type") == "scheduled":
            # Scheduled batch optimization - use best quality
            return "metaprompt", self.optimizers["metaprompt"]

        # A missing or None failure rate counts as 0 instead of raising on `>`.
        failure_rate = context.get("failure_rate") or 0
        if failure_rate > self.FAILURE_RATE_THRESHOLD:
            # High failure rate - need targeted improvements
            return "gradient", self.optimizers["gradient"]

        # Default to balanced approach (currently the same choice as above)
        return "gradient", self.optimizers["gradient"]

    def optimize_with_context(self,
                             conversations: List[Dict],
                             context: Dict,
                             current_stats: Dict = None) -> Dict:
        """Optimize using context-appropriate algorithm.

        Args:
            conversations: Conversations passed through to the optimizer.
            context: Selection context (see ``select_optimizer``).
            current_stats: Stats dict passed as the optimizer's second
                argument. Defaults to a fresh copy of ``DEFAULT_STATS``.

        Returns:
            The optimizer's result dict, extended with "selected_algorithm"
            and "selection_reason" (a copy of ``context``).
        """
        algo_name, optimizer = self.select_optimizer(context)

        print(f"Selected {algo_name} based on context:")
        for key, value in context.items():
            print(f"   - {key}: {value}")

        # Fresh copy per call so an optimizer mutating it cannot alter the default.
        stats = dict(self.DEFAULT_STATS) if current_stats is None else current_stats

        # Run optimization
        start_time = datetime.now()
        result = optimizer.optimize(conversations, stats)
        elapsed = (datetime.now() - start_time).total_seconds()

        # Track usage
        self.usage_stats[algo_name]["calls"] += 1
        self.usage_stats[algo_name]["total_time"] += elapsed

        result["selected_algorithm"] = algo_name
        # Snapshot the context: later caller-side edits must not rewrite the record.
        result["selection_reason"] = dict(context)

        return result

# Only run if the test conversations and all three optimizers exist.
# Every name is looked up in globals() before `test_conversations` is evaluated,
# so a fresh kernel reaches the message in the else-branch instead of raising
# NameError on the very first operand of the condition.
if all(
    name in globals()
    for name in ('test_conversations', 'prompt_optimizer', 'gradient_optimizer', 'metaprompt_optimizer')
) and test_conversations:
    # Test dynamic optimization
    dynamic_opt = DynamicOptimizer()

    # Different contexts requiring different algorithms
    contexts = [
        {
            "scenario": "User asking question in chat",
            "context": {"user_waiting": True, "response_needed": "immediate"}
        },
        {
            "scenario": "Nightly batch optimization",
            "context": {"optimization_type": "scheduled", "time_constraint": "none"}
        },
        {
            "scenario": "High failure rate detected",
            "context": {"failure_rate": 0.4, "urgent": True}
        }
    ]

    print("\n" + "="*60)
    print("DYNAMIC ALGORITHM SELECTION DEMO")
    print("="*60)

    for ctx in contexts:
        print(f"\nScenario: {ctx['scenario']}")
        # Only the first five conversations are used for each demo run.
        result = dynamic_opt.optimize_with_context(
            test_conversations[:5],
            ctx['context']
        )
        print(f"Optimization complete using {result['selected_algorithm']}")
        # .get keeps the demo running if an optimizer result omits 'llm_calls'.
        print(f"   LLM calls: {result.get('llm_calls', 'n/a')}")
        # Report whichever result key this optimizer populated (first match wins).
        if result.get('rules'):
            print(f"   Rules generated: {len(result['rules'])}")
        elif result.get('improvements'):
            print(f"   Improvements: {len(result['improvements'])}")
        elif result.get('comprehensive_rules'):
            print(f"   Comprehensive rules: {len(result['comprehensive_rules'])}")

    # Display usage statistics
    print("\n" + "="*60)
    print("ALGORITHM USAGE STATISTICS")
    print("="*60)
    for algo, stats in dynamic_opt.usage_stats.items():
        # Skip algorithms that were never selected to avoid dividing by zero.
        if stats["calls"] > 0:
            avg_time = stats["total_time"] / stats["calls"]
            print(f"  {algo:15} - Calls: {stats['calls']}, Avg Time: {avg_time:.2f}s")
else:
    print("Prerequisites not available. Run cells 2-4 first to create optimizers.")

# Wrap-up section: banner first, then the guidance text.
# One print call with newline separators yields the same three banner lines.
print("\n" + "=" * 60, "KEY TAKEAWAYS AND BEST PRACTICES", "=" * 60, sep="\n")

# The guidance is assembled line by line; the empty strings at both ends of the
# tuple give the text its leading and trailing newline.
best_practices = "\n".join((
    "",
    "1. ALGORITHM SELECTION GUIDELINES:",
    "   - prompt_memory: Production systems, real-time learning, cost-sensitive",
    "   - gradient: General purpose, balanced performance, failure analysis",
    "   - metaprompt: Research, complex domains, quality-critical applications",
    "",
    "2. HYBRID APPROACHES:",
    "   - Use prompt_memory for immediate updates",
    "   - Run gradient weekly for targeted improvements",
    "   - Apply metaprompt monthly for deep insights",
    "",
    "3. OPTIMIZATION TRIGGERS:",
    "   - Conversation volume threshold (e.g., every 100 conversations)",
    "   - Performance degradation (success rate drops below threshold)",
    "   - Scheduled intervals (hourly, daily, weekly)",
    "   - User feedback signals (low satisfaction scores)",
    "",
    "4. PRODUCTION PATTERNS:",
    "   - Start with prompt_memory for MVP/prototype",
    "   - Graduate to gradient as system matures",
    "   - Use metaprompt for periodic deep analysis",
    "   - Implement dynamic selection for optimal results",
    "",
    "5. MONITORING AND EVALUATION:",
    "   - Track optimization frequency and cost",
    "   - Measure performance improvement over time",
    "   - A/B test different algorithms",
    "   - Monitor user satisfaction changes",
    "",
))

print(best_practices)

# Save final comparison report
def make_json_safe(value):
    """Return a copy of ``value`` that the ``json`` module can serialize.

    Walks dicts, lists and tuples to any depth and converts NumPy objects to
    native Python ones:

    - NumPy scalars (ints, floats, bools, ...) via ``.item()``, so integer
      counts stay integers instead of being coerced to float.
    - NumPy arrays via ``.tolist()``.
    - NumPy scalar dict keys via ``.item()``.

    Everything else is returned unchanged.
    """
    if isinstance(value, dict):
        return {
            (key.item() if isinstance(key, np.generic) else key): make_json_safe(item)
            for key, item in value.items()
        }
    if isinstance(value, (list, tuple)):
        return [make_json_safe(item) for item in value]
    if isinstance(value, np.ndarray):
        return value.tolist()
    if isinstance(value, np.generic):
        return value.item()
    return value


if 'comparison_results' in globals():
    report_path = "optimization_comparison_report.json"
    # Convert numpy types to native Python types for JSON serialization
    clean_results = make_json_safe(comparison_results)
    # Serialize completely before touching the file: if anything is still not
    # serializable, the error surfaces here and no truncated report is written.
    report_text = json.dumps(clean_results, indent=2)
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write(report_text)
    print(f"\nComparison report saved to: {report_path}")