In [1]:
%pip install --upgrade --quiet openai

Note: you may need to restart the kernel to use updated packages.


In [2]:
import nest_asyncio
nest_asyncio.apply()

In [3]:
import os
from PIL import Image
from IPython.display import HTML, Image, Markdown, display

In [4]:
import json
import time
import re
from typing import Dict, List, Any, Set, Optional
import PIL.Image
from pydantic import BaseModel, ValidationError
import asyncio
import aiohttp
import base64
from openai import OpenAI


In [None]:
os.environ["OPENAI_API_KEY"] = 'add_secret_key'

In [6]:
MODEL_ID = "o3-2025-04-16"

The last thing we need to configure is to fill in the model ID. Currently the latest version of APS April 17 official name is as in cell below. All model names can be checked in the model card on Kaggle.

In [7]:
import json
import pandas as pd
def load_public_data():
    # with open('///mnt/c/Personal/Competitions/ICML_Track2/input/mini.json', 'r') as file:
    with open('///mnt/c/Personal/Competitions/ICML_Track2/input/starting_kit_latest/total.json', 'r') as file:
        data = json.load(file)
    data = pd.DataFrame(data)
    problems = data.to_dict('records')
    return problems
problems = load_public_data()
# problems

In [8]:
# !pip install langgraph 
# !pip install langchain
# !pip install langchain-core
# !pip install langchain-openai

In [9]:
import os
import json
import time
import base64
import re
import logging
from typing import Dict, List, Any, Optional, TypedDict, Annotated, Union, Set, Tuple
from enum import Enum
from dataclasses import dataclass
from tqdm.notebook import tqdm

from langgraph.graph import StateGraph, END
from langgraph.checkpoint.memory import MemorySaver
from langchain_core.messages import HumanMessage, AIMessage, SystemMessage, BaseMessage
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field
from langchain_core.runnables import RunnableConfig
import operator

In [10]:
# Setup logging
def setup_logging(log_file: str = "total_physics_solver.log", verbose: bool = True):
    """Setup logging configuration"""
    os.makedirs(os.path.dirname(log_file) if os.path.dirname(log_file) else '.', exist_ok=True)
    
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(log_file, encoding='utf-8'),
            logging.StreamHandler() if verbose else logging.NullHandler()
        ],
        force=True
    )
    return logging.getLogger(__name__)

In [11]:
# Physics Domain Definitions (same as before)
PHYSICS_SUBJECTS = {
    'O': 'Optics (Basic)',
    'OPT': 'Optics (Extended/Advanced)', 
    'EM': 'Electromagnetism',
    'CM': 'Classical Mechanics',
    'TSM': 'Thermodynamics & Statistical Mechanics',
    'QMIT': 'Quantum Mechanics & Information Theory',
    'ACG': 'Astrophysics, Cosmology & Gravitation',
    'AMONP': 'Atomic, Molecular, Optical & Nuclear Physics'
}

PHYSICS_CATEGORIES = {
    'MECHANICS': [
        'static_force_analysis', 'spring_force', 'circular_motion', 
        'linear_motion', 'coordinate_system', 'simple_harmonic_motion', 
        'projectile_motion'
    ],
    'ELECTROMAGNETISM': [
        'circuit_diagram', 'charge_distribution', 'magnetic_circuit', 
        'electromagnetic_field', 'capacitance_resistance'
    ],
    'WAVES_OPTICS': [
        'optical_path', 'wave_motion', 'photoelectric_effect', 'acoustics'
    ],
    'THERMAL': ['thermodynamics'],
    'MODERN': [
        'atomic_physics', 'quantum_mechanics', 'relativity_gravity', 
        'feynman_diagram', 'astrophysics'
    ]
}

CATEGORY_EXPERTISE = {
    'static_force_analysis': {
        'en': 'Force equilibrium, vector analysis, torque calculations, structural mechanics, friction, constraint forces',
        'zh': '力平衡，矢量分析，力矩计算，结构力学，摩擦力，约束力'
    },
    'spring_force': {
        'en': 'Hooke\'s law, elastic energy, harmonic oscillations, coupled systems, resonance',
        'zh': '胡克定律，弹性能，谐振荡，耦合系统，共振'
    },
    'circular_motion': {
        'en': 'Centripetal force, angular momentum, rotational dynamics, orbital mechanics',
        'zh': '向心力，角动量，转动动力学，轨道力学'
    },
    'linear_motion': {
        'en': 'Kinematics, dynamics, momentum conservation, collision analysis',
        'zh': '运动学，动力学，动量守恒，碰撞分析'
    },
    'coordinate_system': {
        'en': 'Graph analysis, data interpretation, coordinate transformations, vector fields',
        'zh': '图形分析，数据解释，坐标变换，矢量场'
    },
    'simple_harmonic_motion': {
        'en': 'Oscillation equations, period analysis, energy methods, damping effects',
        'zh': '振动方程，周期分析，能量方法，阻尼效应'
    },
    'projectile_motion': {
        'en': 'Parabolic trajectories, range calculations, angle optimization, air resistance',
        'zh': '抛物轨迹，射程计算，角度优化，空气阻力'
    },
    'circuit_diagram': {
        'en': 'Ohm\'s law, Kirchhoff\'s laws, impedance analysis, AC/DC circuits, network analysis',
        'zh': '欧姆定律，基尔霍夫定律，阻抗分析，交直流电路，网络分析'
    },
    'charge_distribution': {
        'en': 'Electric fields, Gauss\'s law, potential calculations, boundary conditions',
        'zh': '电场，高斯定律，电势计算，边界条件'
    },
    'magnetic_circuit': {
        'en': 'Magnetic flux, inductance, transformer principles, magnetic coupling',
        'zh': '磁通量，电感，变压器原理，磁耦合'
    },
    'electromagnetic_field': {
        'en': 'Maxwell equations, wave propagation, Lorentz force, field interactions',
        'zh': '麦克斯韦方程，波传播，洛伦兹力，场相互作用'
    },
    'capacitance_resistance': {
        'en': 'Dielectric properties, resistance networks, Hall effect, field distributions',
        'zh': '介电性质，电阻网络，霍尔效应，场分布'
    },
    'optical_path': {
        'en': 'Ray tracing, lens systems, interference, diffraction, polarization',
        'zh': '光线追迹，透镜系统，干涉，衍射，偏振'
    },
    'wave_motion': {
        'en': 'Wave equations, superposition, standing waves, Doppler effect',
        'zh': '波动方程，叠加原理，驻波，多普勒效应'
    },
    'photoelectric_effect': {
        'en': 'Einstein equation, photon energy, quantum nature of light, work function',
        'zh': '爱因斯坦方程，光子能量，光的量子性，功函数'
    },
    'acoustics': {
        'en': 'Sound propagation, acoustic resonance, sound intensity, echo analysis',
        'zh': '声传播，声共振，声强，回声分析'
    },
    'thermodynamics': {
        'en': 'Laws of thermodynamics, heat transfer, phase transitions, statistical mechanics',
        'zh': '热力学定律，传热，相变，统计力学'
    },
    'atomic_physics': {
        'en': 'Nuclear structure, radioactive decay, particle interactions, cross-sections',
        'zh': '核结构，放射性衰变，粒子相互作用，截面'
    },
    'quantum_mechanics': {
        'en': 'Schrödinger equation, wave functions, quantum states, uncertainty principle',
        'zh': '薛定谔方程，波函数，量子态，不确定性原理'
    },
    'relativity_gravity': {
        'en': 'Special/general relativity, spacetime, gravitational effects, reference frames',
        'zh': '狭义/广义相对论，时空，引力效应，参考系'
    },
    'feynman_diagram': {
        'en': 'Particle interactions, conservation laws, quantum field theory, decay processes',
        'zh': '粒子相互作用，守恒定律，量子场论，衰变过程'
    },
    'astrophysics': {
        'en': 'Stellar physics, cosmology, orbital mechanics, astronomical observations',
        'zh': '恒星物理，宇宙学，轨道力学，天文观测'
    }
}

SUBJECT_GUIDANCE = {
    'O': {
        'en': 'Focus on basic geometric optics, ray tracing, and fundamental optical phenomena',
        'zh': '专注于基础几何光学、光线追踪和基本光学现象'
    },
    'OPT': {
        'en': 'Apply advanced optics: wave optics, interference, diffraction, quantum optics',
        'zh': '应用高级光学：波动光学、干涉、衍射、量子光学'
    },
    'EM': {
        'en': 'Emphasize electromagnetic fields, circuits, Maxwell equations, wave propagation',
        'zh': '强调电磁场、电路、麦克斯韦方程、波传播'
    },
    'CM': {
        'en': 'Focus on classical mechanics: forces, motion, energy, momentum conservation',
        'zh': '专注于经典力学：力、运动、能量、动量守恒'
    },
    'TSM': {
        'en': 'Apply thermodynamic laws, statistical mechanics, heat transfer principles',
        'zh': '应用热力学定律、统计力学、传热原理'
    },
    'QMIT': {
        'en': 'Use quantum mechanics, wave-particle duality, quantum information theory',
        'zh': '使用量子力学、波粒二象性、量子信息理论'
    },
    'ACG': {
        'en': 'Apply gravitational physics, cosmology, relativity, astronomical principles',
        'zh': '应用引力物理、宇宙学、相对论、天文学原理'
    },
    'AMONP': {
        'en': 'Focus on atomic structure, nuclear physics, particle interactions',
        'zh': '专注于原子结构、核物理、粒子相互作用'
    }
}


In [12]:
class PipelineSource(str, Enum):
    OPENAI = "openai"
    GEMINI = "gemini"
    SYNTHESIS = "synthesis"

class CorrectnessDecision(str, Enum):
    CORRECT = "correct"
    INCORRECT = "incorrect"

class SelectionDecision(str, Enum):
    CHOOSE_OPENAI = "choose_openai"
    CHOOSE_GEMINI = "choose_gemini"
    SYNTHESIZE = "synthesize"

class SingleEvaluationResult(BaseModel):
    """Evaluation result for a single prediction"""
    pipeline_source: PipelineSource
    is_correct: CorrectnessDecision
    confidence_assessment: float = Field(ge=0, le=1)
    physics_accuracy_score: float = Field(ge=0, le=1)
    mathematical_accuracy_score: float = Field(ge=0, le=1)
    reasoning_quality_score: float = Field(ge=0, le=1)
    specific_errors: List[str] = Field(default_factory=list)
    strengths: List[str] = Field(default_factory=list)
    detailed_feedback: str
    class Config:
        extra = "forbid"

class ComparisonEvaluationResult(BaseModel):
    """Combined evaluation result for both pipelines"""
    openai_evaluation: SingleEvaluationResult
    gemini_evaluation: SingleEvaluationResult
    overall_comparison: str
    recommendation_rationale: str
    class Config:
        extra = "forbid"

class SelectionResult(BaseModel):
    """Final selection decision with audit trail"""
    decision: SelectionDecision
    selected_prediction: Optional[str] = None
    selected_source: Optional[PipelineSource] = None
    decision_rationale: str
    openai_score: float = 0.0
    gemini_score: float = 0.0
    synthesis_required: bool = False
    audit_trail: Dict[str, Any] = Field(default_factory=dict)
    class Config:
        extra = "forbid"

class SynthesisResult(BaseModel):
    """Result from synthesis generation"""
    synthesized_prediction: str
    synthesis_confidence: float = Field(ge=0, le=1)
    synthesis_quality: float = Field(ge=0, le=1)
    context_used: Optional[Dict[str, str]] = None
    generation_notes: str
    class Config:
        extra = "forbid"  # Required for OpenAI structured output
        
# Additional Pydantic models for better structure
class ProblemData(BaseModel):
    """Structured problem data"""
    index: int
    question: str
    subject: str = ""
    image_path: List[str] = Field(default_factory=list)
    img_category: str = ""
    vision_relevance: str = ""
    language: str = "English"
    level: int = 1
    sig_figs: str = ""
    caption: str = ""

class PipelineResult(BaseModel):
    """Structured pipeline result"""
    prediction: str
    confidence_score: float = 0.5
    quality_score: float = 0.5
    iterations_used: int = 0
    total_generations: int = 0
    generation_history: List[str] = Field(default_factory=list)
    final_decision: str = "unknown"
    evaluation_feedback: str = ""

class ComparisonMetadata(BaseModel):
    """Metadata about the comparison process"""
    decision: str
    decision_rationale: str
    openai_score: float
    gemini_score: float
    synthesis_required: bool
    audit_trail: Dict[str, Any] = Field(default_factory=dict)
    evaluation_summary: str

class PipelineComparison(BaseModel):
    """Comparison data for pipeline results"""
    prediction: str
    confidence: float
    quality: float
    correctness: str

class FinalResult(BaseModel):
    """Final structured result with complete audit trail"""
    # Original problem data
    index: int
    question: str
    subject: str = ""
    image_path: List[str] = Field(default_factory=list)
    img_category: str = ""
    vision_relevance: str = ""
    language: str = "English"
    level: int = 1
    sig_figs: str = ""
    caption: str = ""
    
    # Final selected result
    prediction: str
    source: str
    confidence_score: float
    quality_score: float
    generation_notes: Optional[str] = None
    
    # Comparison metadata
    comparison_metadata: ComparisonMetadata
    pipeline_results: Dict[str, PipelineComparison] = Field(default_factory=dict)
    
    # Error handling
    error: Optional[str] = None
    comparison_error: Optional[str] = None

class ComparisonState(TypedDict):
    """State for the comparison pipeline"""
    problem: ProblemData
    openai_result: PipelineResult
    gemini_result: PipelineResult
    evaluation_result: Optional[ComparisonEvaluationResult]
    selection_result: Optional[SelectionResult]
    synthesis_result: Optional[SynthesisResult]
    final_result: Optional[FinalResult]

class MultiPipelineComparator:
    """
    Compares results from OpenAI and Gemini pipelines and synthesizes when needed
    Uses different models for evaluation vs synthesis for cost optimization
    """
    
    def __init__(self, api_key: str, 
                 evaluator_model: str = "o3-mini-2025-01-31", 
                 generator_model: str = "o3-2025-04-16",
                 images_base_path: str = "", 
                 logger: Optional[logging.Logger] = None):
        # Separate clients for evaluation and generation
        self.evaluator_client = ChatOpenAI(api_key=api_key, model=evaluator_model)
        self.generator_client = ChatOpenAI(api_key=api_key, model=generator_model)
        self.api_key = api_key
        self.evaluator_model = evaluator_model
        self.generator_model = generator_model
        self.images_base_path = images_base_path
        self.logger = logger or logging.getLogger(__name__)
    
    def _get_subject_context(self, subject_code: str, language: str = "en") -> tuple:
        """Get subject context and guidance"""
        if subject_code and subject_code in PHYSICS_SUBJECTS:
            subject_name = PHYSICS_SUBJECTS[subject_code]
            subject_context = f"{subject_code} ({subject_name})"
            subject_guidance = SUBJECT_GUIDANCE.get(subject_code, {}).get(language, "Apply general physics principles")
        else:
            if language == "zh":
                subject_context = "未指定学科"
                subject_guidance = "应用一般物理原理"
            else:
                subject_context = "Unspecified subject"
                subject_guidance = "Apply general physics principles"
        
        return subject_context, subject_guidance
    
    def _parse_problem_data(self, raw_problem: Dict[str, Any]) -> ProblemData:
        """Parse raw problem data into Pydantic model"""
        try:
            return ProblemData(**raw_problem)
        except Exception as e:
            self.logger.warning(f"⚠️ Problem data parsing failed, using defaults: {e}")
            # Create minimal valid problem
            return ProblemData(
                index=raw_problem.get("index", 0),
                question=raw_problem.get("question", ""),
                subject=raw_problem.get("subject", ""),
                language=raw_problem.get("language", "English")
            )
    
    def _parse_pipeline_result(self, raw_result: Dict[str, Any]) -> PipelineResult:
        """Parse raw pipeline result into Pydantic model"""
        try:
            return PipelineResult(**{
                k: v for k, v in raw_result.items() 
                if k in PipelineResult.model_fields
            })
        except Exception as e:
            self.logger.warning(f"⚠️ Pipeline result parsing failed, using defaults: {e}")
            return PipelineResult(
                prediction=raw_result.get("prediction", ""),
                confidence_score=raw_result.get("confidence_score", 0.5),
                quality_score=raw_result.get("quality_score", 0.5)
            )
    
    def _prepare_image_content(self, problem: ProblemData) -> List[Dict]:
        """Prepare image content for API calls"""
        content = []
        
        vision_relevance = problem.vision_relevance
        if vision_relevance in ['necessary', 'helpful', 'essential', 'optional']:
            image_paths = problem.image_path
            
            for img_path in image_paths:
                full_path = os.path.join(self.images_base_path, img_path) if self.images_base_path else img_path
                
                if os.path.exists(full_path):
                    try:
                        base64_image = self._encode_image_to_base64(full_path)
                        if base64_image:
                            mime_type = self._get_image_mime_type(full_path)
                            content.append({
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:{mime_type};base64,{base64_image}",
                                    "detail": "high"
                                }
                            })
                            self.logger.debug(f"✓ Added image: {img_path}")
                    except Exception as e:
                        self.logger.warning(f"⚠️ Failed to load image {img_path}: {e}")
        
        return content
    
    def _get_subject_context(self, subject_code: str, language: str = "en") -> tuple:
        """Get subject context and guidance"""
        if subject_code and subject_code in PHYSICS_SUBJECTS:
            subject_name = PHYSICS_SUBJECTS[subject_code]
            subject_context = f"{subject_code} ({subject_name})"
            subject_guidance = SUBJECT_GUIDANCE.get(subject_code, {}).get(language, "Apply general physics principles")
        else:
            if language == "zh":
                subject_context = "未指定学科"
                subject_guidance = "应用一般物理原理"
            else:
                subject_context = "Unspecified subject"
                subject_guidance = "Apply general physics principles"
        
        return subject_context, subject_guidance
    
    def _encode_image_to_base64(self, image_path: str) -> Optional[str]:
        """Encode image to base64"""
        try:
            with open(image_path, "rb") as image_file:
                return base64.b64encode(image_file.read()).decode('utf-8')
        except Exception as e:
            self.logger.error(f"Error encoding image {image_path}: {e}")
            return None
    
    def _get_image_mime_type(self, file_path: str) -> str:
        """Get MIME type for image"""
        ext = os.path.splitext(file_path)[1].lower()
        mime_types = {
            '.png': 'image/png',
            '.jpg': 'image/jpeg',
            '.jpeg': 'image/jpeg',
            '.gif': 'image/gif',
            '.webp': 'image/webp'
        }
        return mime_types.get(ext, 'image/png')
    
    async def _stage1_evaluation(self, state: ComparisonState) -> ComparisonState:
        """
        Stage 1: Evaluate both predictions for correctness
        SINGLE O3-MINI CALL - Evaluation only
        """
        problem = state["problem"]
        openai_result = state["openai_result"]
        gemini_result = state["gemini_result"]
        
        # Extract predictions (ignore final_decision as specified)
        openai_prediction = openai_result.prediction
        gemini_prediction = gemini_result.prediction
        
        subject_context, subject_guidance = self._get_subject_context(
            problem.subject, "zh" if problem.language == "Chinese" else "en"
        )
        
        # Create evaluation prompt
        evaluation_prompt = self._create_evaluation_prompt(
            problem, openai_prediction, gemini_prediction, subject_context, subject_guidance
        )
        
        try:
            # Prepare content with images - SINGLE O3-MINI CALL
            content = [{"type": "text", "text": evaluation_prompt}]
            image_content = self._prepare_image_content(problem)
            content.extend(image_content)
            
            # Get structured evaluation using EVALUATOR model (o3-mini)
            response = await self.evaluator_client.with_structured_output(ComparisonEvaluationResult).ainvoke([
                SystemMessage(content=self._get_evaluation_system_prompt(problem.language)),
                HumanMessage(content=content)
            ])
            
            state["evaluation_result"] = response
            
            self.logger.info(f"📊 Stage 1 Complete (1 {self.evaluator_model} call) - OpenAI: {response.openai_evaluation.is_correct.value}, "
                           f"Gemini: {response.gemini_evaluation.is_correct.value}")
            
        except Exception as e:
            self.logger.error(f"❌ Stage 1 evaluation failed: {e}")
            # Create fallback evaluation using Pydantic
            state["evaluation_result"] = ComparisonEvaluationResult(
                openai_evaluation=SingleEvaluationResult(
                    pipeline_source=PipelineSource.OPENAI,
                    is_correct=CorrectnessDecision.INCORRECT,
                    confidence_assessment=0.5,
                    physics_accuracy_score=0.5,
                    mathematical_accuracy_score=0.5,
                    reasoning_quality_score=0.5,
                    detailed_feedback=f"Evaluation failed: {e}"
                ),
                gemini_evaluation=SingleEvaluationResult(
                    pipeline_source=PipelineSource.GEMINI,
                    is_correct=CorrectnessDecision.INCORRECT,
                    confidence_assessment=0.5,
                    physics_accuracy_score=0.5,
                    mathematical_accuracy_score=0.5,
                    reasoning_quality_score=0.5,
                    detailed_feedback=f"Evaluation failed: {e}"
                ),
                overall_comparison="Evaluation failed",
                recommendation_rationale="Default to synthesis due to evaluation failure"
            )
        
        return state
    
    async def _stage2_selection(self, state: ComparisonState) -> ComparisonState:
        """
        Stage 2: Make selection decision based on evaluation
        ZERO O3 CALLS - Pure logic only
        """
        eval_result = state["evaluation_result"]
        openai_result = state["openai_result"]
        gemini_result = state["gemini_result"]
        
        openai_correct = eval_result.openai_evaluation.is_correct == CorrectnessDecision.CORRECT
        gemini_correct = eval_result.gemini_evaluation.is_correct == CorrectnessDecision.CORRECT
        
        # Calculate composite scores using Pydantic models
        openai_score = self._calculate_composite_score(openai_result, eval_result.openai_evaluation)
        gemini_score = self._calculate_composite_score(gemini_result, eval_result.gemini_evaluation)
        
        # Apply decision logic - NO O3 CALLS
        if openai_correct and gemini_correct:
            # Both correct → Choose higher score
            if openai_score >= gemini_score:
                decision = SelectionDecision.CHOOSE_OPENAI
                selected_prediction = openai_result.prediction
                selected_source = PipelineSource.OPENAI
                rationale = f"Both correct, OpenAI score ({openai_score:.3f}) >= Gemini score ({gemini_score:.3f})"
            else:
                decision = SelectionDecision.CHOOSE_GEMINI
                selected_prediction = gemini_result.prediction
                selected_source = PipelineSource.GEMINI
                rationale = f"Both correct, Gemini score ({gemini_score:.3f}) > OpenAI score ({openai_score:.3f})"
        
        elif openai_correct and not gemini_correct:
            # Only OpenAI correct
            decision = SelectionDecision.CHOOSE_OPENAI
            selected_prediction = openai_result.prediction
            selected_source = PipelineSource.OPENAI
            rationale = "Only OpenAI prediction is correct"
        
        elif not openai_correct and gemini_correct:
            # Only Gemini correct
            decision = SelectionDecision.CHOOSE_GEMINI
            selected_prediction = gemini_result.prediction
            selected_source = PipelineSource.GEMINI
            rationale = "Only Gemini prediction is correct"
        
        else:
            # Both incorrect → Synthesis
            decision = SelectionDecision.SYNTHESIZE
            selected_prediction = None
            selected_source = None
            rationale = "Both predictions incorrect, synthesis required"
        
        # Create structured selection result using Pydantic
        selection_result = SelectionResult(
            decision=decision,
            selected_prediction=selected_prediction,
            selected_source=selected_source,
            decision_rationale=rationale,
            openai_score=openai_score,
            gemini_score=gemini_score,
            synthesis_required=(decision == SelectionDecision.SYNTHESIZE),
            audit_trail={
                "openai_correct": openai_correct,
                "gemini_correct": gemini_correct,
                "openai_score": openai_score,
                "gemini_score": gemini_score,
                "evaluation_summary": eval_result.overall_comparison
            }
        )
        
        state["selection_result"] = selection_result
        
        self.logger.info(f"🎯 Stage 2 Complete (0 O3 calls - pure logic) - Decision: {decision.value}, Rationale: {rationale}")
        
        return state
    
    async def _stage3_synthesis(self, state: ComparisonState) -> ComparisonState:
        """
        Stage 3: Generate synthesis when both predictions are incorrect
        SINGLE O3 CALL - Synthesis only (when needed)
        """
        if not state["selection_result"].synthesis_required:
            self.logger.info("🎯 Stage 3 Skipped - No synthesis required (0 calls)")
            return state
        
        problem = state["problem"]
        openai_result = state["openai_result"]
        gemini_result = state["gemini_result"]
        eval_result = state["evaluation_result"]
        
        # Create comprehensive synthesis prompt using Pydantic models
        synthesis_prompt = self._create_synthesis_prompt(
            problem, openai_result, gemini_result, eval_result
        )
        
        try:
            # Prepare content with images - SINGLE O3 CALL (full model for synthesis)
            content = [{"type": "text", "text": synthesis_prompt}]
            image_content = self._prepare_image_content(problem)
            content.extend(image_content)
            
            # Generate synthesis with Pydantic structure using GENERATOR model (o3)
            response = await self.generator_client.with_structured_output(SynthesisResult).ainvoke([
                SystemMessage(content=self._get_synthesis_system_prompt(problem.language)),
                HumanMessage(content=content)
            ])
            
            state["synthesis_result"] = response
            
            self.logger.info(f"🔬 Stage 3 Complete (1 {self.generator_model} call) - Synthesis generated with confidence: {response.synthesis_confidence:.3f}")
            
        except Exception as e:
            self.logger.error(f"❌ Stage 3 synthesis failed: {e}")
            # Create fallback synthesis using Pydantic
            state["synthesis_result"] = SynthesisResult(
                synthesized_prediction=f"Synthesis failed: {str(e)}",
                synthesis_confidence=0.0,
                synthesis_quality=0.0,
                context_used={},
                generation_notes=f"Synthesis generation failed: {e}"
            )
        
        return state
    
    def _calculate_composite_score(self, pipeline_result: PipelineResult, 
                                 evaluation: SingleEvaluationResult) -> float:
        """Calculate composite score from Pydantic pipeline result and evaluation"""
        # Get scores from pipeline using Pydantic
        confidence_score = pipeline_result.confidence_score
        quality_score = pipeline_result.quality_score
        
        # Get scores from evaluation using Pydantic
        physics_score = evaluation.physics_accuracy_score
        math_score = evaluation.mathematical_accuracy_score
        reasoning_score = evaluation.reasoning_quality_score
        
        # Weighted combination
        composite = (
            0.25 * confidence_score +
            0.25 * quality_score +
            0.30 * physics_score +
            0.10 * math_score +
            0.10 * reasoning_score
        )
        
        return composite
    
    def _create_evaluation_prompt(self, problem: ProblemData, openai_pred: str, 
                                gemini_pred: str, subject_context: str, subject_guidance: str) -> str:
        """Create prompt for Stage 1 evaluation using Pydantic models"""
        
        if problem.language == "Chinese":
            prompt = f"""**物理问题评估任务**

**原问题信息：**
• 问题: {problem.question}
• 难度等级: {problem.level}/10
• 物理学科: {subject_context}
• 问题类别: {problem.img_category}
• 语言: {problem.language}
{f"• 图像说明: {problem.caption}" if problem.caption else ""}
{f"• 有效数字要求: {problem.sig_figs}" if problem.sig_figs else ""}

**专业指导:**
{subject_guidance}

**需要评估的两个解答:**

**OpenAI 管道解答:**
{openai_pred}

**Gemini 管道解答:**
{gemini_pred}

**评估要求:**
对每个解答进行详细评估，判断其正确性并提供具体分析。重点关注：
1. 物理原理应用的正确性
2. 数学推导的准确性  
3. 推理过程的逻辑性
4. 最终答案的合理性
5. 如有图像，视觉信息解读的准确性

请提供结构化的评估结果。"""
        else:
            prompt = f"""**PHYSICS PROBLEM EVALUATION TASK**

**Original Problem Information:**
• Question: {problem.question}
• Difficulty Level: {problem.level}/10
• Physics Subject: {subject_context}
• Problem Category: {problem.img_category}
• Language: {problem.language}
{f"• Image Caption: {problem.caption}" if problem.caption else ""}
{f"• Required Significant Figures: {problem.sig_figs}" if problem.sig_figs else ""}

**Expert Guidance:**
{subject_guidance}

**Two Predictions to Evaluate:**

**OpenAI Pipeline Prediction:**
{openai_pred}

**Gemini Pipeline Prediction:**
{gemini_pred}

**Evaluation Requirements:**
Thoroughly evaluate each prediction for correctness and provide detailed analysis. Focus on:
1. Correctness of physics principle applications
2. Accuracy of mathematical derivations
3. Logical consistency of reasoning process
4. Reasonableness of final answers
5. If images provided: accuracy of visual information interpretation

Please provide structured evaluation results."""
        
        return prompt
    
    def _create_synthesis_prompt(self, problem: ProblemData, openai_result: PipelineResult,
                               gemini_result: PipelineResult, eval_result: ComparisonEvaluationResult) -> str:
        """Create comprehensive synthesis prompt for Stage 3 using Pydantic models"""
        
        subject_context, subject_guidance = self._get_subject_context(
            problem.subject, "zh" if problem.language == "Chinese" else "en"
        )
        
        # Get evaluation feedback using Pydantic
        openai_feedback = eval_result.openai_evaluation.detailed_feedback
        gemini_feedback = eval_result.gemini_evaluation.detailed_feedback
        openai_errors = ", ".join(eval_result.openai_evaluation.specific_errors) if eval_result.openai_evaluation.specific_errors else "None specified"
        gemini_errors = ", ".join(eval_result.gemini_evaluation.specific_errors) if eval_result.gemini_evaluation.specific_errors else "None specified"
        
        if problem.language == "Chinese":
            prompt = f"""**物理问题综合解答生成任务**

**原始问题:**
• 问题: {problem.question}
• 难度等级: {problem.level}/10
• 物理学科: {subject_context}
• 问题类别: {problem.img_category}
• 语言: {problem.language}
{f"• 图像说明: {problem.caption}" if problem.caption else ""}
{f"• 有效数字要求: {problem.sig_figs}" if problem.sig_figs else ""}

**专业指导:**
{subject_guidance}

**之前的错误尝试分析:**

**OpenAI 管道失败分析:**
• 最终预测: {openai_result.prediction}...
• 信心分数: {openai_result.confidence_score}
• 质量分数: {openai_result.quality_score}
• 评估反馈: {openai_feedback}
• 具体错误: {openai_errors}
• 尝试次数: {len(openai_result.generation_history)}

**Gemini 管道失败分析:**
• 最终预测: {gemini_result.prediction}...
• 信心分数: {gemini_result.confidence_score}
• 质量分数: {gemini_result.quality_score}
• 评估反馈: {gemini_feedback}
• 具体错误: {gemini_errors}
• 尝试次数: {len(gemini_result.generation_history)}

**综合评估建议:**
{eval_result.overall_comparison}

**生成要求:**
基于上述失败分析，生成一个正确的物理解答。避免之前发现的所有错误，确保：
1. 物理原理应用正确
2. 数学推导准确无误
3. 推理逻辑清晰完整
4. 最终答案合理可信
5. 如有图像，正确解读视觉信息

请用标准格式回答：
<think>
[详细的物理分析和推导过程]
</think>

<answer>
[最终的数值答案]
</answer>"""
        
        else:
            prompt = f"""**PHYSICS PROBLEM SYNTHESIS GENERATION TASK**

**Original Problem:**
• Question: {problem.question}
• Difficulty Level: {problem.level}/10
• Physics Subject: {subject_context}
• Problem Category: {problem.img_category}
• Language: {problem.language}
{f"• Image Caption: {problem.caption}" if problem.caption else ""}
{f"• Required Significant Figures: {problem.sig_figs}" if problem.sig_figs else ""}

**Expert Guidance:**
{subject_guidance}

**Analysis of Previous Failed Attempts:**

**OpenAI Pipeline Failure Analysis:**
• Final Prediction: {openai_result.prediction}...
• Confidence Score: {openai_result.confidence_score}
• Quality Score: {openai_result.quality_score}
• Evaluation Feedback: {openai_feedback}
• Specific Errors: {openai_errors}
• Number of Attempts: {len(openai_result.generation_history)}

**Gemini Pipeline Failure Analysis:**
• Final Prediction: {gemini_result.prediction}...
• Confidence Score: {gemini_result.confidence_score}
• Quality Score: {gemini_result.quality_score}
• Evaluation Feedback: {gemini_feedback}
• Specific Errors: {gemini_errors}
• Number of Attempts: {len(gemini_result.generation_history)}

**Overall Evaluation Insights:**
{eval_result.overall_comparison}

**Generation Requirements:**
Based on the failure analysis above, generate a correct physics solution. Avoid all previously identified errors and ensure:
1. Correct physics principle applications
2. Accurate mathematical derivations
3. Clear and complete logical reasoning
4. Reasonable and credible final answers
5. If images provided: correct interpretation of visual information

Please respond in standard format:
<think>
[Detailed physics analysis and derivation process]
</think>

<answer>
[Final numerical answer]
</answer>"""
        
        return prompt
    
    def _get_evaluation_system_prompt(self, language: str) -> str:
        """Get system prompt for evaluation"""
        if language == "Chinese":
            return """你是一位专业的物理学专家和解答评估师。你的任务是评估两个不同AI系统生成的物理解答质量。

**评估目标：** 准确判断每个解答的正确性，并提供详细的分析反馈。

**评估标准：**
- 物理原理应用是否正确
- 数学推导是否准确
- 推理过程是否完整逻辑
- 最终答案是否合理
- 视觉信息解读是否准确（如适用）

**输出要求：** 提供结构化的评估结果，包含每个解答的详细分析。"""
        else:
            return """You are an expert physics specialist and solution evaluator. Your task is to assess physics solutions generated by two different AI systems.

**EVALUATION GOAL:** Accurately determine the correctness of each solution and provide detailed analytical feedback.

**EVALUATION CRITERIA:**
- Correctness of physics principle applications
- Accuracy of mathematical derivations
- Completeness and logic of reasoning process
- Reasonableness of final answers
- Accuracy of visual information interpretation (if applicable)

**OUTPUT REQUIREMENTS:** Provide structured evaluation results with detailed analysis for each solution."""
    
    def _get_synthesis_system_prompt(self, language: str) -> str:
        """Get system prompt for synthesis generation"""
        if language == "Chinese":
            return """你是一位顶级的物理学专家。你的任务是基于对之前失败尝试的详细分析，生成一个正确的物理解答。

**生成目标：** 创建一个避免所有已识别错误的高质量物理解答。

**核心要求：**
- 应用正确的物理原理和定律
- 进行准确的数学推导
- 提供清晰完整的推理过程
- 给出合理可信的最终答案
- 确保与问题要求完全吻合

你有机会纠正之前的所有错误并展示真正的物理专业水平。"""
        else:
            return """You are a top-tier physics expert. Your task is to generate a correct physics solution based on detailed analysis of previous failed attempts.

**GENERATION GOAL:** Create a high-quality physics solution that avoids all identified errors.

**CORE REQUIREMENTS:**
- Apply correct physics principles and laws
- Perform accurate mathematical derivations
- Provide clear and complete reasoning processes
- Give reasonable and credible final answers
- Ensure complete alignment with problem requirements

You have the opportunity to correct all previous errors and demonstrate true physics expertise."""
    
    async def compare_and_select(self, problem_data: Dict[str, Any], 
                                openai_result_data: Dict[str, Any], 
                                gemini_result_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Main function to compare pipeline results and select/synthesize the best solution
        MAX 2 O3 CALLS: 1 for evaluation (always) + 1 for synthesis (if both incorrect)
        """
        
        # Parse inputs into Pydantic models for validation and structure
        problem = self._parse_problem_data(problem_data)
        openai_result = self._parse_pipeline_result(openai_result_data)
        gemini_result = self._parse_pipeline_result(gemini_result_data)
        
        initial_state: ComparisonState = {
            "problem": problem,
            "openai_result": openai_result,
            "gemini_result": gemini_result,
            "evaluation_result": None,
            "selection_result": None,
            "synthesis_result": None,
            "final_result": None
        }
        
        try:
            self.logger.info(f"🔍 Starting comparison for problem {problem.index} (MAX 2 O3 calls)")
            
            # Stage 1: Evaluation (1 O3 call - always)
            state = await self._stage1_evaluation(initial_state)
            
            # Stage 2: Selection Decision (0 O3 calls - pure logic)
            state = await self._stage2_selection(state)
            
            # Stage 3: Synthesis (1 O3 call - only if both incorrect)
            state = await self._stage3_synthesis(state)
            
            # Prepare final result using Pydantic
            final_result = self._prepare_final_result(state)
            state["final_result"] = final_result
            
            # Count actual O3 calls made
            calls_made = 1  # Always 1 for evaluation
            if state["selection_result"].synthesis_required:
                calls_made += 1  # +1 for synthesis
            
            self.logger.info(f"✅ Comparison complete for problem {problem.index} ({calls_made}/2 O3 calls used)")
            
            return final_result.dict()
            
        except Exception as e:
            self.logger.error(f"❌ Comparison failed for problem {problem.index}: {e}")
            
            # Return fallback result
            return self._create_fallback_result(problem_data, openai_result_data, gemini_result_data, str(e))
    
    def _prepare_final_result(self, state: ComparisonState) -> FinalResult:
        """Prepare final result with complete audit trail using Pydantic"""
        problem = state["problem"]
        selection = state["selection_result"]
        synthesis = state["synthesis_result"]
        evaluation = state["evaluation_result"]
        
        # Determine final prediction and source
        if selection.decision == SelectionDecision.SYNTHESIZE and synthesis:
            prediction = synthesis.synthesized_prediction
            source = PipelineSource.SYNTHESIS.value
            confidence_score = synthesis.synthesis_confidence
            quality_score = synthesis.synthesis_quality
            generation_notes = synthesis.generation_notes
        elif selection.selected_prediction:
            prediction = selection.selected_prediction
            source = selection.selected_source.value
            generation_notes = None
            # Use scores from selected pipeline
            if selection.selected_source == PipelineSource.OPENAI:
                confidence_score = state["openai_result"].confidence_score
                quality_score = state["openai_result"].quality_score
            else:
                confidence_score = state["gemini_result"].confidence_score
                quality_score = state["gemini_result"].quality_score
        else:
            # Fallback to OpenAI
            prediction = state["openai_result"].prediction
            source = PipelineSource.OPENAI.value
            confidence_score = state["openai_result"].confidence_score
            quality_score = state["openai_result"].quality_score
            generation_notes = "Fallback to OpenAI"
        
        # Create comparison metadata using Pydantic
        comparison_metadata = ComparisonMetadata(
            decision=selection.decision.value,
            decision_rationale=selection.decision_rationale,
            openai_score=selection.openai_score,
            gemini_score=selection.gemini_score,
            synthesis_required=selection.synthesis_required,
            audit_trail=selection.audit_trail,
            evaluation_summary=evaluation.overall_comparison if evaluation else "N/A"
        )
        
        # Create pipeline comparison data using Pydantic
        pipeline_results = {
            "openai": PipelineComparison(
                prediction=state["openai_result"].prediction + "...",
                confidence=state["openai_result"].confidence_score,
                quality=state["openai_result"].quality_score,
                correctness=evaluation.openai_evaluation.is_correct.value if evaluation else "unknown"
            ),
            "gemini": PipelineComparison(
                prediction=state["gemini_result"].prediction + "...",
                confidence=state["gemini_result"].confidence_score,
                quality=state["gemini_result"].quality_score,
                correctness=evaluation.gemini_evaluation.is_correct.value if evaluation else "unknown"
            )
        }
        
        # Create final result using Pydantic
        final_result = FinalResult(
            # Original problem data
            index=problem.index,
            question=problem.question,
            subject=problem.subject,
            image_path=problem.image_path,
            img_category=problem.img_category,
            vision_relevance=problem.vision_relevance,
            language=problem.language,
            level=problem.level,
            sig_figs=problem.sig_figs,
            caption=problem.caption,
            
            # Final selected result
            prediction=prediction,
            source=source,
            confidence_score=confidence_score,
            quality_score=quality_score,
            generation_notes=generation_notes,
            
            # Metadata
            comparison_metadata=comparison_metadata,
            pipeline_results=pipeline_results
        )
        
        return final_result
    
    def _create_fallback_result(self, problem: Dict[str, Any], openai_result: Dict[str, Any], 
                              gemini_result: Dict[str, Any], error_msg: str) -> Dict[str, Any]:
        """Create fallback result when comparison fails"""
        result = problem.copy()
        
        # Default to OpenAI result
        result["prediction"] = openai_result.get("prediction", "Comparison failed")
        result["source"] = PipelineSource.OPENAI.value
        result["confidence_score"] = openai_result.get("confidence_score", 0.0)
        result["quality_score"] = openai_result.get("quality_score", 0.0)
        result["error"] = error_msg
        
        result["comparison_metadata"] = {
            "decision": "fallback_to_openai",
            "decision_rationale": f"Comparison failed: {error_msg}",
            "openai_score": 0.0,
            "gemini_score": 0.0,
            "synthesis_required": False,
            "audit_trail": {"error": error_msg},
            "evaluation_summary": "Comparison process failed"
        }
        
        return result

async def run_multi_pipeline_comparison(openai_results: List[Dict[str, Any]], 
                                      gemini_results: List[Dict[str, Any]],
                                      output_file: str = "comparison_results.json",
                                      images_base_path: str = "",
                                      api_key: Optional[str] = None,
                                      evaluator_model: str = "o3-mini-2025-01-31",
                                      generator_model: str = "o3-2025-04-16",
                                      batch_size: int = 5,
                                      log_file: str = "ensemble.log") -> List[Dict]:
    """
    Main function to run multi-pipeline comparison
    COST OPTIMIZED: o3-mini for evaluation + o3 for synthesis (when both incorrect)
    """
    
    # Setup logging
    logger = logging.getLogger(__name__)
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(log_file, encoding='utf-8'),
            logging.StreamHandler()
        ]
    )
    
    # Get API key
    if not api_key:
        api_key = os.environ.get("OPENAI_API_KEY")
        if not api_key:
            raise ValueError("OPENAI_API_KEY environment variable not set")
    
    # Create index mappings for easy lookup using Pydantic validation
    openai_by_index = {}
    gemini_by_index = {}
    
    for r in openai_results:
        try:
            idx = r.get("index")
            if idx is not None:
                openai_by_index[idx] = r
        except Exception as e:
            logger.warning(f"⚠️ Skipping invalid OpenAI result: {e}")
    
    for r in gemini_results:
        try:
            idx = r.get("index")
            if idx is not None:
                gemini_by_index[idx] = r
        except Exception as e:
            logger.warning(f"⚠️ Skipping invalid Gemini result: {e}")
    
    # Find common problems
    common_indices = set(openai_by_index.keys()) & set(gemini_by_index.keys())
    common_indices = sorted([idx for idx in common_indices if idx is not None])

    existing_results = []
    processed_indices = set()

    if os.path.exists(output_file):
        logger.info(f"📁 Found existing comparison results: {output_file}")
        try:
            with open(output_file, 'r', encoding='utf-8') as f:
                existing_results = json.load(f)
            processed_indices = {result.get('index') for result in existing_results}
            logger.info(f"✅ Loaded {len(existing_results)} existing comparison results")
        except Exception as e:
            logger.warning(f"⚠️ Error loading existing results: {e}")

    # Filter to only unprocessed problems
    common_indices = [idx for idx in common_indices if idx not in processed_indices]
    logger.info(f"🎯 Remaining problems to compare: {len(common_indices)}")

    logger.info(f"🔍 Found {len(common_indices)} common problems to compare")
    logger.info(f"📊 OpenAI results: {len(openai_results)}, Gemini results: {len(gemini_results)}")
    logger.info(f"🧠 Models: Evaluator={evaluator_model}, Generator={generator_model}")
    logger.info(f"💰 Cost: {evaluator_model} for evaluation + {generator_model} for synthesis (when needed)")
    
    if not common_indices:
        logger.warning("⚠️ No common problems found between pipelines!")
        return []
    
    # Initialize comparator with separate models
    comparator = MultiPipelineComparator(
        api_key=api_key,
        evaluator_model=evaluator_model,
        generator_model=generator_model,
        images_base_path=images_base_path,
        logger=logger
    )
    
    # Process in batches
    results = []
    results = existing_results.copy()
    batches = [common_indices[i:i + batch_size] for i in range(0, len(common_indices), batch_size)]
    
    total_evaluator_calls = 0
    total_generator_calls = 0
    
    for batch_num, batch_indices in enumerate(tqdm(batches, desc="Comparing Batches"), 1):
        logger.info(f"🚀 Processing batch {batch_num}/{len(batches)} with {len(batch_indices)} problems")
        
        batch_tasks = []
        for idx in batch_indices:
            openai_result = openai_by_index[idx]
            gemini_result = gemini_by_index[idx]
            
            # Extract problem data (same in both results) - use Pydantic parsing
            problem_data = {k: v for k, v in openai_result.items() 
                           if k not in ['prediction', 'iterations_used', 'total_generations', 
                                       'generation_history', 'final_decision', 'confidence_score', 
                                       'quality_score', 'evaluation_feedback']}
            
            task = comparator.compare_and_select(problem_data, openai_result, gemini_result)
            batch_tasks.append(task)
        
        # Execute batch
        try:
            batch_results = await asyncio.gather(*batch_tasks, return_exceptions=True)
            
            # Process results and count calls
            batch_evaluator_calls = 0
            batch_generator_calls = 0
            
            for i, result in enumerate(batch_results):
                if isinstance(result, Exception):
                    logger.error(f"❌ Problem {batch_indices[i]} failed: {result}")
                    # Create error result using Pydantic fallback
                    problem_idx = batch_indices[i]
                    error_result = openai_by_index[problem_idx].copy()
                    error_result["comparison_error"] = str(result)
                    error_result["source"] = "error_fallback"
                    results.append(error_result)
                    batch_evaluator_calls += 1  # At least 1 evaluator call was attempted
                else:
                    results.append(result)
                    # Count actual calls: 1 evaluator + 1 generator if synthesis was required
                    batch_evaluator_calls += 1  # Always 1 evaluator call
                    if result.get("comparison_metadata", {}).get("synthesis_required", False):
                        batch_generator_calls += 1
            
            total_evaluator_calls += batch_evaluator_calls
            total_generator_calls += batch_generator_calls
            
            logger.info(f"✅ Batch {batch_num} completed: {len(batch_results)} results")
            logger.info(f"   Calls: {batch_evaluator_calls} {evaluator_model} + {batch_generator_calls} {generator_model}")
            
            # Save progress
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(results, f, indent=2, ensure_ascii=False)
            
            # Small delay between batches
            await asyncio.sleep(5)
            
        except Exception as e:
            logger.error(f"❌ Batch {batch_num} failed completely: {e}")
            continue
    
    # Final save and statistics using Pydantic validation
    results.sort(key=lambda x: x.get('index', 0))
    
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    
    # Generate statistics
    total = len(results)
    source_counts = {}
    decision_counts = {}
    synthesis_count = 0
    
    for result in results:
        source = result.get("source", "unknown")
        source_counts[source] = source_counts.get(source, 0) + 1
        
        decision = result.get("comparison_metadata", {}).get("decision", "unknown")
        decision_counts[decision] = decision_counts.get(decision, 0) + 1
        
        if result.get("comparison_metadata", {}).get("synthesis_required", False):
            synthesis_count += 1
    
    total_calls = total_evaluator_calls + total_generator_calls
    avg_calls_per_problem = total_calls / total if total > 0 else 0
    
    logger.info(f"\n📊 Multi-Pipeline Comparison Results:")
    logger.info(f"   Total Problems Compared: {total}")
    logger.info(f"   Model Usage:")
    logger.info(f"     • {evaluator_model}: {total_evaluator_calls} calls (evaluation)")
    logger.info(f"     • {generator_model}: {total_generator_calls} calls (synthesis)")
    logger.info(f"   Total API Calls: {total_calls}")
    logger.info(f"   Average Calls/Problem: {avg_calls_per_problem:.2f}")
    logger.info(f"   Synthesis Required: {synthesis_count}/{total} ({synthesis_count/total*100:.1f}%)")
    logger.info(f"   Source Distribution: {source_counts}")
    logger.info(f"   Decision Distribution: {decision_counts}")
    logger.info(f"   Results saved to: {output_file}")
    
    return results

# Wrapper function for Jupyter use
def run_comparison(openai_results: List[Dict[str, Any]], 
                  gemini_results: List[Dict[str, Any]],
                  output_file: str = "comparison_results.json",
                  images_base_path: str = "",
                  evaluator_model: str = "o4-mini-2025-04-16",
                  generator_model: str = "o3-2025-04-16",
                  batch_size: int = 5):
    """
    Wrapper function to run comparison in Jupyter notebooks
    COST OPTIMIZED: Uses o3-mini for evaluation and o3 for synthesis
    """
    return asyncio.run(run_multi_pipeline_comparison(
        openai_results=openai_results,
        gemini_results=gemini_results,
        output_file=output_file,
        images_base_path=images_base_path,
        evaluator_model=evaluator_model,
        generator_model=generator_model,
        batch_size=batch_size
    ))

In [13]:
# Load your results
openai_results = json.load(open("///mnt/c/Personal/Competitions/ICML_Track2/NB0019_async/final_prediction.json"))
gemini_results = json.load(open("///mnt/c/Personal/Competitions/ICML_Track2/NB0021_fixed/final_prediction.json"))

# Run comparison
final_results = run_comparison(
    openai_results=openai_results,
    gemini_results=gemini_results,
    output_file="ensemble_results_old.json",
    images_base_path="///mnt/c/Personal/Competitions/ICML_Track2/input/starting_kit_latest/",
    batch_size=3
)

2025-06-29 00:17:50,461 - INFO - 📁 Found existing comparison results: ensemble_results_old.json
2025-06-29 00:17:50,711 - INFO - ✅ Loaded 1934 existing comparison results
2025-06-29 00:17:50,712 - INFO - 🎯 Remaining problems to compare: 66
2025-06-29 00:17:50,713 - INFO - 🔍 Found 66 common problems to compare
2025-06-29 00:17:50,713 - INFO - 📊 OpenAI results: 2000, Gemini results: 2000
2025-06-29 00:17:50,714 - INFO - 🧠 Models: Evaluator=o4-mini-2025-04-16, Generator=o3-2025-04-16
2025-06-29 00:17:50,714 - INFO - 💰 Cost: o4-mini-2025-04-16 for evaluation + o3-2025-04-16 for synthesis (when needed)


Comparing Batches:   0%|          | 0/22 [00:00<?, ?it/s]

2025-06-29 00:17:50,986 - INFO - 🚀 Processing batch 1/22 with 3 problems
2025-06-29 00:17:50,987 - INFO - 🔍 Starting comparison for problem 0 (MAX 2 O3 calls)
2025-06-29 00:17:50,998 - INFO - 🔍 Starting comparison for problem 1 (MAX 2 O3 calls)
2025-06-29 00:17:51,006 - INFO - 🔍 Starting comparison for problem 2 (MAX 2 O3 calls)
2025-06-29 00:18:05,342 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-29 00:18:05,355 - INFO - 📊 Stage 1 Complete (1 o4-mini-2025-04-16 call) - OpenAI: correct, Gemini: correct
2025-06-29 00:18:05,356 - INFO - 🎯 Stage 2 Complete (0 O3 calls - pure logic) - Decision: choose_gemini, Rationale: Both correct, Gemini score (0.938) > OpenAI score (0.815)
2025-06-29 00:18:05,356 - INFO - 🎯 Stage 3 Skipped - No synthesis required (0 calls)
2025-06-29 00:18:05,357 - INFO - ✅ Comparison complete for problem 2 (1/2 O3 calls used)
/tmp/ipykernel_26664/3997448401.py:783: PydanticDeprecatedSince20: The `dict` method is depre