In [1]:
%pip install --upgrade --quiet openai

Note: you may need to restart the kernel to use updated packages.


In [2]:
import nest_asyncio
# Patch asyncio so event loops may be nested. Presumably required because the
# notebook kernel already runs a loop when the async solver classes defined
# further down are driven from a cell — TODO confirm against the calling cells.
nest_asyncio.apply()

In [3]:
import os
from PIL import Image
from IPython.display import HTML, Image, Markdown, display

In [4]:
import json
import time
import re
from typing import Dict, List, Any, Set, Optional
import PIL.Image
from pydantic import BaseModel, ValidationError
import asyncio
import aiohttp
import base64
from openai import OpenAI


In [None]:
# Never store the real key in the notebook. Keep a key that is already exported
# in the environment (the previous cell content overwrote it with a placeholder)
# and otherwise ask for it interactively so it is not saved with the file.
if not os.environ.get("OPENAI_API_KEY"):
    import getpass

    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

In [6]:
# Model identifier string; the same value is the default `model` argument of
# AsyncPhysicsGenerator further down in this notebook.
MODEL_ID = "o3-2025-04-16"

The last thing to configure is the model ID. At the time of writing, the official name of the latest version (the April 17 release) is the one set in the cell above. All model names can be checked in the model card on Kaggle.

In [7]:
import json
import pandas as pd
def load_public_data(
    json_path: str = '///mnt/c/Personal/Competitions/ICML_Track2/input/starting_kit_latest/total.json',
):
    """Load the problem set from a JSON file and return it as record dicts.

    Args:
        json_path: Location of the JSON file to read. The default is the path
            this notebook was originally written against, so a bare
            ``load_public_data()`` call keeps working; pass another path (for
            example a smaller subset file) to load different data.

    Returns:
        A list with one dict per problem. The data goes through a pandas
        ``DataFrame`` round-trip, so every record carries the same keys and a
        key that is absent from some records is filled with NaN.

    Raises:
        FileNotFoundError: If ``json_path`` does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default;
    # the rest of the notebook handles Chinese-language problems.
    with open(json_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return pd.DataFrame(data).to_dict('records')
# Materialise the problem records once at notebook level
# (uncomment the last line of this cell to inspect them).
problems = load_public_data()
# problems

In [8]:
# !pip install langgraph 
# !pip install langchain
# !pip install langchain-core
# !pip install langchain-openai

In [9]:
import os
import json
import time
import base64
import re
import logging
from typing import Dict, List, Any, Optional, TypedDict, Annotated, Union, Set, Tuple
from enum import Enum
from dataclasses import dataclass
from tqdm.notebook import tqdm

from langgraph.graph import StateGraph, END
from langgraph.checkpoint.memory import MemorySaver
from langchain_core.messages import HumanMessage, AIMessage, SystemMessage, BaseMessage
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field
from langchain_core.runnables import RunnableConfig
import operator

In [10]:
# Logging bootstrap
def setup_logging(log_file: str = "total_physics_solver.log", verbose: bool = True):
    """Configure root logging to a UTF-8 log file plus an optional console stream.

    Args:
        log_file: Path of the log file; its parent directory is created when missing.
        verbose: When True a ``StreamHandler`` is attached next to the file
            handler, otherwise a ``NullHandler`` takes its place.

    Returns:
        The logger named after this module (``logging.getLogger(__name__)``).
    """
    # A bare file name has no directory part; fall back to the current directory.
    target_dir = os.path.dirname(log_file) or '.'
    os.makedirs(target_dir, exist_ok=True)

    file_sink = logging.FileHandler(log_file, encoding='utf-8')
    if verbose:
        console_sink = logging.StreamHandler()
    else:
        console_sink = logging.NullHandler()

    # force=True replaces handlers left over from an earlier call, so re-running
    # the cell does not duplicate log lines.
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[file_sink, console_sink],
        force=True,
    )
    return logging.getLogger(__name__)

# Physics domain definitions: subject code -> human-readable subject name.
# The codes are what problem records carry in their 'subject' field.
PHYSICS_SUBJECTS = dict([
    ('O', 'Optics (Basic)'),
    ('OPT', 'Optics (Extended/Advanced)'),
    ('EM', 'Electromagnetism'),
    ('CM', 'Classical Mechanics'),
    ('TSM', 'Thermodynamics & Statistical Mechanics'),
    ('QMIT', 'Quantum Mechanics & Information Theory'),
    ('ACG', 'Astrophysics, Cosmology & Gravitation'),
    ('AMONP', 'Atomic, Molecular, Optical & Nuclear Physics'),
])

# Category names grouped by broad physics area. The group keys are only
# labels; the flattened values form the set of valid category names.
PHYSICS_CATEGORIES = {
    'MECHANICS': [
        'static_force_analysis',
        'spring_force',
        'circular_motion',
        'linear_motion',
        'coordinate_system',
        'simple_harmonic_motion',
        'projectile_motion',
    ],
    'ELECTROMAGNETISM': [
        'circuit_diagram',
        'charge_distribution',
        'magnetic_circuit',
        'electromagnetic_field',
        'capacitance_resistance',
    ],
    'WAVES_OPTICS': [
        'optical_path',
        'wave_motion',
        'photoelectric_effect',
        'acoustics',
    ],
    'THERMAL': [
        'thermodynamics',
    ],
    'MODERN': [
        'atomic_physics',
        'quantum_mechanics',
        'relativity_gravity',
        'feynman_diagram',
        'astrophysics',
    ],
}

# Per-category expertise blurbs, keyed by the category names listed in
# PHYSICS_CATEGORIES. Each entry holds an English ('en') and a Chinese ('zh')
# variant; AsyncPhysicsGenerator._get_category_expertise looks the text up and
# it is inserted into the generation prompt.
CATEGORY_EXPERTISE = {
    'static_force_analysis': {
        'en': 'Force equilibrium, vector analysis, torque calculations, structural mechanics, friction, constraint forces',
        'zh': '力平衡，矢量分析，力矩计算，结构力学，摩擦力，约束力'
    },
    'spring_force': {
        'en': 'Hooke\'s law, elastic energy, harmonic oscillations, coupled systems, resonance',
        'zh': '胡克定律，弹性能，谐振荡，耦合系统，共振'
    },
    'circular_motion': {
        'en': 'Centripetal force, angular momentum, rotational dynamics, orbital mechanics',
        'zh': '向心力，角动量，转动动力学，轨道力学'
    },
    'linear_motion': {
        'en': 'Kinematics, dynamics, momentum conservation, collision analysis',
        'zh': '运动学，动力学，动量守恒，碰撞分析'
    },
    'coordinate_system': {
        'en': 'Graph analysis, data interpretation, coordinate transformations, vector fields',
        'zh': '图形分析，数据解释，坐标变换，矢量场'
    },
    'simple_harmonic_motion': {
        'en': 'Oscillation equations, period analysis, energy methods, damping effects',
        'zh': '振动方程，周期分析，能量方法，阻尼效应'
    },
    'projectile_motion': {
        'en': 'Parabolic trajectories, range calculations, angle optimization, air resistance',
        'zh': '抛物轨迹，射程计算，角度优化，空气阻力'
    },
    'circuit_diagram': {
        'en': 'Ohm\'s law, Kirchhoff\'s laws, impedance analysis, AC/DC circuits, network analysis',
        'zh': '欧姆定律，基尔霍夫定律，阻抗分析，交直流电路，网络分析'
    },
    'charge_distribution': {
        'en': 'Electric fields, Gauss\'s law, potential calculations, boundary conditions',
        'zh': '电场，高斯定律，电势计算，边界条件'
    },
    'magnetic_circuit': {
        'en': 'Magnetic flux, inductance, transformer principles, magnetic coupling',
        'zh': '磁通量，电感，变压器原理，磁耦合'
    },
    'electromagnetic_field': {
        'en': 'Maxwell equations, wave propagation, Lorentz force, field interactions',
        'zh': '麦克斯韦方程，波传播，洛伦兹力，场相互作用'
    },
    'capacitance_resistance': {
        'en': 'Dielectric properties, resistance networks, Hall effect, field distributions',
        'zh': '介电性质，电阻网络，霍尔效应，场分布'
    },
    'optical_path': {
        'en': 'Ray tracing, lens systems, interference, diffraction, polarization',
        'zh': '光线追迹，透镜系统，干涉，衍射，偏振'
    },
    'wave_motion': {
        'en': 'Wave equations, superposition, standing waves, Doppler effect',
        'zh': '波动方程，叠加原理，驻波，多普勒效应'
    },
    'photoelectric_effect': {
        'en': 'Einstein equation, photon energy, quantum nature of light, work function',
        'zh': '爱因斯坦方程，光子能量，光的量子性，功函数'
    },
    'acoustics': {
        'en': 'Sound propagation, acoustic resonance, sound intensity, echo analysis',
        'zh': '声传播，声共振，声强，回声分析'
    },
    'thermodynamics': {
        'en': 'Laws of thermodynamics, heat transfer, phase transitions, statistical mechanics',
        'zh': '热力学定律，传热，相变，统计力学'
    },
    'atomic_physics': {
        'en': 'Nuclear structure, radioactive decay, particle interactions, cross-sections',
        'zh': '核结构，放射性衰变，粒子相互作用，截面'
    },
    'quantum_mechanics': {
        'en': 'Schrödinger equation, wave functions, quantum states, uncertainty principle',
        'zh': '薛定谔方程，波函数，量子态，不确定性原理'
    },
    'relativity_gravity': {
        'en': 'Special/general relativity, spacetime, gravitational effects, reference frames',
        'zh': '狭义/广义相对论，时空，引力效应，参考系'
    },
    'feynman_diagram': {
        'en': 'Particle interactions, conservation laws, quantum field theory, decay processes',
        'zh': '粒子相互作用，守恒定律，量子场论，衰变过程'
    },
    'astrophysics': {
        'en': 'Stellar physics, cosmology, orbital mechanics, astronomical observations',
        'zh': '恒星物理，宇宙学，轨道力学，天文观测'
    }
}

# Per-subject guidance sentences, keyed by the subject codes of
# PHYSICS_SUBJECTS, each with an English ('en') and a Chinese ('zh') variant.
# AsyncPhysicsGenerator._get_subject_context reads the sentence matching the
# prompt language.
SUBJECT_GUIDANCE = {
    'O': {
        'en': 'Focus on basic geometric optics, ray tracing, and fundamental optical phenomena',
        'zh': '专注于基础几何光学、光线追踪和基本光学现象'
    },
    'OPT': {
        'en': 'Apply advanced optics: wave optics, interference, diffraction, quantum optics',
        'zh': '应用高级光学：波动光学、干涉、衍射、量子光学'
    },
    'EM': {
        'en': 'Emphasize electromagnetic fields, circuits, Maxwell equations, wave propagation',
        'zh': '强调电磁场、电路、麦克斯韦方程、波传播'
    },
    'CM': {
        'en': 'Focus on classical mechanics: forces, motion, energy, momentum conservation',
        'zh': '专注于经典力学：力、运动、能量、动量守恒'
    },
    'TSM': {
        'en': 'Apply thermodynamic laws, statistical mechanics, heat transfer principles',
        'zh': '应用热力学定律、统计力学、传热原理'
    },
    'QMIT': {
        'en': 'Use quantum mechanics, wave-particle duality, quantum information theory',
        'zh': '使用量子力学、波粒二象性、量子信息理论'
    },
    'ACG': {
        'en': 'Apply gravitational physics, cosmology, relativity, astronomical principles',
        'zh': '应用引力物理、宇宙学、相对论、天文学原理'
    },
    'AMONP': {
        'en': 'Focus on atomic structure, nuclear physics, particle interactions',
        'zh': '专注于原子结构、核物理、粒子相互作用'
    }
}

# Verdict recorded in EvaluationResult.decision. Mixing in `str` makes each
# member compare equal to its plain string value ("accept" / "reject").
class EvaluationDecision(str, Enum):
    ACCEPT = "accept"
    REJECT = "reject"

# Names of the error categories an evaluation can report. There is one member
# per list field of PhysicsErrorAnalysis; each value is that field's name in
# the singular (e.g. "calculation_error" <-> calculation_errors).
class PhysicsErrorType(str, Enum):
    PHYSICS_THEOREM = "physics_theorem_error"
    CONDITION_ANALYSIS = "condition_analysis_error"
    PROCESS_UNDERSTANDING = "process_understanding_error"
    CALCULATION = "calculation_error"
    VARIABLE_RELATIONSHIP = "variable_relationship_error"
    DIAGRAM_ANALYSIS = "diagram_analysis_error"
    BOUNDARY_CONDITIONS = "boundary_conditions_error"

# Errors found in a solution, grouped by category. Every field is a list of
# free-text descriptions that defaults to a fresh empty list (default_factory),
# so an instance with no findings has seven empty lists.
class PhysicsErrorAnalysis(BaseModel):
    physics_theorem_errors: List[str] = Field(default_factory=list, description="Wrong physics laws/formulas applied")
    condition_analysis_errors: List[str] = Field(default_factory=list, description="Misidentified forces, boundaries, setup")
    process_understanding_errors: List[str] = Field(default_factory=list, description="Misunderstood physical phenomena")
    calculation_errors: List[str] = Field(default_factory=list, description="Mathematical derivation errors")
    variable_relationship_errors: List[str] = Field(default_factory=list, description="Wrong dependencies between quantities")
    diagram_analysis_errors: List[str] = Field(default_factory=list, description="Misread visual information")
    boundary_conditions_errors: List[str] = Field(default_factory=list, description="Ignored constraints/limits")

# Structured outcome of evaluating one generated solution.
# - confidence_score / quality_score are validated to lie in [0, 1] (ge=0, le=1).
# - error_location and improvement_suggestions are optional and default to None.
# AsyncPhysicsGenerator._format_previous_attempts_feedback renders these fields
# into the feedback text used for regeneration.
class EvaluationResult(BaseModel):
    decision: EvaluationDecision
    confidence_score: float = Field(ge=0, le=1)
    quality_score: float = Field(ge=0, le=1)
    physics_errors: PhysicsErrorAnalysis
    answer_consistency: bool
    magnitude_reasonable: bool
    error_location: Optional[str] = None
    feedback_message: str
    improvement_suggestions: Optional[str] = None

# Shape of the state dict passed around while a problem is being solved:
# the problem record, the latest and final solution text, the latest
# evaluation, iteration counters and the history of generated solutions.
# NOTE(review): `messages` is annotated with operator.add — presumably the
# LangGraph reducer so message lists are concatenated rather than replaced;
# confirm against the graph wiring, which is not visible in this part of the file.
class SolutionState(TypedDict):
    problem: Dict[str, Any]
    current_solution: Optional[str]
    evaluation_result: Optional[EvaluationResult]
    final_solution: Optional[str]
    iteration_count: int
    max_iterations: int
    generation_history: List[str]
    messages: Annotated[List[BaseMessage], operator.add]

class LabeledExampleManager:
    """Manages labeled examples for few-shot enhancement.

    Labeled samples are read from a JSON file that holds a list of dict
    records. They are indexed by ``subject``, by ``img_category`` and by the
    pair of both, ranked against a target problem, and rendered as a prompt
    fragment.

    Attributes:
        samples: The loaded sample dicts (empty when nothing could be loaded).
        subject_category_index: ``"{subject}_{category}"`` -> list of samples.
        subject_index: subject -> list of samples.
        category_index: category -> list of samples.
        logger: Logger used for load/selection diagnostics.
    """

    def __init__(self, labeled_samples_path: str, logger: Optional[logging.Logger] = None):
        """Create the manager and, when the file exists, load and index it.

        Args:
            labeled_samples_path: Path of the JSON file with labeled samples.
                An empty path leaves the manager empty without any message.
            logger: Logger to use; defaults to this module's logger.
        """
        self.samples = []
        self.subject_category_index = {}
        self.subject_index = {}
        self.category_index = {}
        self.logger = logger or logging.getLogger(__name__)

        if labeled_samples_path and os.path.exists(labeled_samples_path):
            self._load_samples(labeled_samples_path)
            self._build_indices()
        elif labeled_samples_path:
            # A configured but missing file silently disabled few-shot examples before.
            self.logger.warning(f"⚠️ Labeled samples file not found: {labeled_samples_path}")

    def _load_samples(self, file_path: str):
        """Load labeled samples from a JSON file into ``self.samples``.

        Any read/parse failure, or a JSON root that is not a list, is logged
        and leaves ``self.samples`` empty. Entries that are not dicts are
        dropped because every later step calls ``.get`` on the samples.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                loaded = json.load(f)
        except Exception as e:
            self.logger.error(f"⚠️ Failed to load labeled samples: {e}")
            self.samples = []
            return

        if not isinstance(loaded, list):
            self.logger.error(
                f"⚠️ Failed to load labeled samples: expected a JSON list, got {type(loaded).__name__}"
            )
            self.samples = []
            return

        self.samples = [entry for entry in loaded if isinstance(entry, dict)]
        skipped = len(loaded) - len(self.samples)
        if skipped:
            self.logger.warning(f"⚠️ Skipped {skipped} labeled entries that are not JSON objects")
        self.logger.info(f"✅ Loaded {len(self.samples)} labeled examples")

    def _build_indices(self):
        """Rebuild the three lookup indices from ``self.samples``."""
        # Start from scratch so calling this twice does not duplicate entries.
        self.subject_category_index = {}
        self.subject_index = {}
        self.category_index = {}

        for sample in self.samples:
            subject = sample.get('subject', '')
            category = sample.get('img_category', '')

            self.subject_category_index.setdefault(f"{subject}_{category}", []).append(sample)
            self.subject_index.setdefault(subject, []).append(sample)
            self.category_index.setdefault(category, []).append(sample)

    @staticmethod
    def _level_as_number(value: Any) -> Optional[float]:
        """Return ``value`` as a float, or None when it is not numeric."""
        try:
            return float(value)
        except (TypeError, ValueError):
            return None

    def _calculate_quality_score(self, sample: Dict[str, Any], target_problem: Dict[str, Any]) -> float:
        """Score how useful ``sample`` is as an example for ``target_problem``.

        The score is the sum of four parts:
            * reasoning length: 0.3 (> 100 chars), 0.2 (> 50), 0.1 (> 20);
            * level distance: 0.3 (equal), 0.2 (<= 1), 0.1 (<= 2);
            * vision relevance: 0.2 (equal), 0.1 (both 'necessary'/'helpful');
            * language: 0.1 (equal).

        A level that cannot be read as a number (for example ``None`` or a
        non-numeric string) earns no level bonus instead of raising.
        """
        score = 0.0

        # Reasoning length/completeness (longer reasoning usually better)
        reasoning = sample.get('reasoning', '') or ''
        if len(reasoning) > 100:
            score += 0.3
        elif len(reasoning) > 50:
            score += 0.2
        elif len(reasoning) > 20:
            score += 0.1

        # Level closeness; levels may arrive as strings such as "3" in JSON.
        sample_level = self._level_as_number(sample.get('level', 1))
        target_level = self._level_as_number(target_problem.get('level', 1))
        if sample_level is not None and target_level is not None:
            level_diff = abs(sample_level - target_level)
            if level_diff == 0:
                score += 0.3
            elif level_diff <= 1:
                score += 0.2
            elif level_diff <= 2:
                score += 0.1

        # Vision relevance match
        sample_vision = sample.get('vision_relevance', '')
        target_vision = target_problem.get('vision_relevance', '')
        if sample_vision == target_vision:
            score += 0.2
        elif sample_vision in ['necessary', 'helpful'] and target_vision in ['necessary', 'helpful']:
            score += 0.1

        # Language match
        if sample.get('language', '') == target_problem.get('language', ''):
            score += 0.1

        return score

    def find_best_examples(self, problem: Dict[str, Any], max_examples: int = 3) -> List[Dict[str, Any]]:
        """Pick up to ``max_examples`` samples that best match ``problem``.

        Candidates come from the first index that has a hit, in this order:
        exact subject+category, subject only, category only. They are then
        ordered by ``_calculate_quality_score`` (highest first; ties keep
        their load order).

        Returns:
            The selected sample dicts, or an empty list when there are no
            samples, no index matches, or ``max_examples`` is not positive.
        """
        if not self.samples or max_examples <= 0:
            return []

        subject = problem.get('subject', '')
        category = problem.get('img_category', '')

        lookups = (
            ("exact", self.subject_category_index, f"{subject}_{category}"),
            ("subject", self.subject_index, subject),
            ("category", self.category_index, category),
        )
        candidates = []
        match_type = None
        for label, index, key in lookups:
            if key in index:
                candidates = index[key]
                match_type = label
                break

        if match_type is None:
            self.logger.debug(f"🔍 No examples found for subject='{subject}', category='{category}'")
            return []

        scored_candidates = [
            (self._calculate_quality_score(candidate, problem), candidate)
            for candidate in candidates
        ]
        # Sort on the score only: the dict payloads themselves are not orderable.
        scored_candidates.sort(key=lambda pair: pair[0], reverse=True)
        selected = [candidate for _, candidate in scored_candidates[:max_examples]]

        self.logger.debug(f"🎯 Found {len(selected)} examples ({match_type} match) for {subject}/{category}")

        return selected

    def format_examples_for_prompt(self, examples: List[Dict[str, Any]], language: str = "English") -> str:
        """Render ``examples`` as a prompt section.

        Args:
            examples: Sample dicts; ``question``, ``reasoning``, ``answer``,
                ``subject`` and ``img_category`` are read from each.
            language: ``"Chinese"`` selects the Chinese wording; any other
                value selects English.

        Returns:
            The formatted section, or ``""`` when ``examples`` is empty.
            Questions longer than 200 characters are cut and suffixed with "...".
        """
        if not examples:
            return ""

        lang_code = "zh" if language == "Chinese" else "en"

        if lang_code == "zh":
            examples_text = "\n**专家解题示例：**\n\n"
            examples_text += "以下是类似问题的专家解题过程，请参考其推理方法和解答模式：\n\n"
        else:
            examples_text = "\n**EXPERT SOLUTION EXAMPLES:**\n\n"
            examples_text += "Here are expert solutions to similar problems. Use these as guides for reasoning patterns and solution approaches:\n\n"

        for i, example in enumerate(examples, 1):
            # A null question would otherwise crash len(); treat it as empty.
            question = example.get('question', '') or ''
            if len(question) > 200:
                question = question[:200] + "..."
            reasoning = example.get('reasoning', '')
            answer = example.get('answer', '')
            subject = example.get('subject', '')
            category = example.get('img_category', '')

            if lang_code == "zh":
                examples_text += f"**示例 {i}** [{subject}/{category}]:\n"
                examples_text += f"问题: {question}\n"
                examples_text += f"专家推理: {reasoning}\n"
                examples_text += f"答案: {answer}\n\n"
            else:
                examples_text += f"**Example {i}** [{subject}/{category}]:\n"
                examples_text += f"Question: {question}\n"
                examples_text += f"Expert Reasoning: {reasoning}\n"
                examples_text += f"Answer: {answer}\n\n"

        if lang_code == "zh":
            examples_text += "现在请解决以下问题，参考上述专家的推理模式：\n"
        else:
            examples_text += "Now solve the following problem, following the expert reasoning patterns above:\n"

        return examples_text

class AsyncPhysicsGenerator:
    """Async implementation of physics problem generator with feedback-enhanced regeneration"""
    
    def __init__(self, api_key: str, model: str = "o3-2025-04-16", labeled_samples_path: str = "", logger: Optional[logging.Logger] = None):
        """Create the chat client and, optionally, the few-shot example manager.

        Args:
            api_key: OpenAI API key handed to the chat client.
            model: Model name handed to the chat client.
            labeled_samples_path: JSON file with labeled examples; when empty
                no example manager is created.
            logger: Logger to use; defaults to this module's logger.
        """
        # The client runs with the library's default timeout/retry settings;
        # the explicit overrides below are left disabled.
        self.client = ChatOpenAI(
            api_key=api_key, 
            model=model,
            # timeout=120,  # 2 minute timeout
            # max_retries=2
        )
        self.model = model
        self.api_key = api_key
        if labeled_samples_path:
            self.example_manager = LabeledExampleManager(labeled_samples_path, logger)
        else:
            self.example_manager = None
        self.logger = logger or logging.getLogger(__name__)
    
    def _get_all_valid_categories(self) -> Set[str]:
        """Return every category name listed under any group of PHYSICS_CATEGORIES."""
        return {
            name
            for group_members in PHYSICS_CATEGORIES.values()
            for name in group_members
        }
    
    def _validate_category(self, category: str) -> bool:
        """Validate physics category"""
        return category in self._get_all_valid_categories()
    
    def _get_subject_context(self, subject_code: str, language: str = "en") -> tuple:
        """Return ``(subject_context, subject_guidance)`` for a subject code.

        For a code present in PHYSICS_SUBJECTS the context is
        ``"<code> (<name>)"`` and the guidance comes from SUBJECT_GUIDANCE for
        ``language`` (an English generic sentence when that lookup misses).
        For an empty or unknown code a generic pair is returned, in Chinese
        when ``language`` is ``"zh"`` and in English otherwise.
        """
        if not subject_code or subject_code not in PHYSICS_SUBJECTS:
            if language == "zh":
                return "未指定学科", "应用一般物理原理"
            return "Unspecified subject", "Apply general physics principles"

        display_name = PHYSICS_SUBJECTS[subject_code]
        guidance_by_language = SUBJECT_GUIDANCE.get(subject_code, {})
        guidance = guidance_by_language.get(language, "Apply general physics principles")
        return f"{subject_code} ({display_name})", guidance
    
    def _get_category_expertise(self, category: str, language: str = "en") -> str:
        """Return the CATEGORY_EXPERTISE text for ``category``.

        ``language == "zh"`` selects the Chinese text, anything else the
        English one; an unknown category yields a generic English sentence.
        """
        if language == "zh":
            lang_key = "zh"
        else:
            lang_key = "en"
        expertise_by_language = CATEGORY_EXPERTISE.get(category, {})
        return expertise_by_language.get(lang_key, "General physics principles")
    
    def _format_previous_attempts_feedback(self, previous_attempts: List[str], evaluation: EvaluationResult, language: str = "English") -> str:
        """Format previous attempts and evaluation feedback for regeneration"""
        if not previous_attempts or not evaluation:
            return ""
        
        lang_code = "zh" if language == "Chinese" else "en"
        
        if lang_code == "zh":
            feedback = "\n**前次尝试分析：**\n\n"
            feedback += "您的前次尝试被拒绝。以下是问题所在：\n\n"
            
            # Show specific errors
            errors = evaluation.physics_errors
            if errors.physics_theorem_errors:
                feedback += f"❌ **物理定理错误**: {', '.join(errors.physics_theorem_errors)}\n"
            if errors.condition_analysis_errors:
                feedback += f"❌ **条件分析错误**: {', '.join(errors.condition_analysis_errors)}\n"
            if errors.process_understanding_errors:
                feedback += f"❌ **过程理解错误**: {', '.join(errors.process_understanding_errors)}\n"
            if errors.calculation_errors:
                feedback += f"❌ **计算错误**: {', '.join(errors.calculation_errors)}\n"
            if errors.variable_relationship_errors:
                feedback += f"❌ **变量关系错误**: {', '.join(errors.variable_relationship_errors)}\n"
            if errors.diagram_analysis_errors:
                feedback += f"❌ **图表分析错误**: {', '.join(errors.diagram_analysis_errors)}\n"
            if errors.boundary_conditions_errors:
                feedback += f"❌ **边界条件错误**: {', '.join(errors.boundary_conditions_errors)}\n"
            
            # Show evaluation metrics
            feedback += f"\n**评估结果**:\n"
            feedback += f"• 信心分数: {evaluation.confidence_score:.2f}\n"
            feedback += f"• 质量分数: {evaluation.quality_score:.2f}\n"
            feedback += f"• 答案一致性: {'是' if evaluation.answer_consistency else '否'}\n"
            feedback += f"• 数量级合理性: {'是' if evaluation.magnitude_reasonable else '否'}\n"
            
            # Show evaluation feedback
            feedback += f"\n**评估师反馈**: {evaluation.feedback_message}\n"
            
            # Show improvement suggestions
            if evaluation.improvement_suggestions:
                feedback += f"\n**改进建议**: {evaluation.improvement_suggestions}\n"
            
            # Show truncated previous solution
            last_attempt = previous_attempts[-1]
            truncated = last_attempt[:400] + "..." if len(last_attempt) > 400 else last_attempt
            feedback += f"\n**前次解答（已拒绝）**:\n```\n{truncated}\n```\n"
            feedback += "\n**重要指示**: 请在新解答中解决上述问题。避免重复相同错误。\n"
            
        else:
            feedback = "\n**PREVIOUS ATTEMPT ANALYSIS:**\n\n"
            feedback += "Your previous attempt was rejected. Here's what went wrong:\n\n"
            
            # Show specific errors
            errors = evaluation.physics_errors
            if errors.physics_theorem_errors:
                feedback += f"❌ **Physics Theorem Errors**: {', '.join(errors.physics_theorem_errors)}\n"
            if errors.condition_analysis_errors:
                feedback += f"❌ **Condition Analysis Errors**: {', '.join(errors.condition_analysis_errors)}\n"
            if errors.process_understanding_errors:
                feedback += f"❌ **Process Understanding Errors**: {', '.join(errors.process_understanding_errors)}\n"
            if errors.calculation_errors:
                feedback += f"❌ **Calculation Errors**: {', '.join(errors.calculation_errors)}\n"
            if errors.variable_relationship_errors:
                feedback += f"❌ **Variable Relationship Errors**: {', '.join(errors.variable_relationship_errors)}\n"
            if errors.diagram_analysis_errors:
                feedback += f"❌ **Diagram Analysis Errors**: {', '.join(errors.diagram_analysis_errors)}\n"
            if errors.boundary_conditions_errors:
                feedback += f"❌ **Boundary Conditions Errors**: {', '.join(errors.boundary_conditions_errors)}\n"
            
            # Show evaluation metrics
            feedback += f"\n**Evaluation Metrics**:\n"
            feedback += f"• Confidence Score: {evaluation.confidence_score:.2f}\n"
            feedback += f"• Quality Score: {evaluation.quality_score:.2f}\n"
            feedback += f"• Answer Consistency: {'Yes' if evaluation.answer_consistency else 'No'}\n"
            feedback += f"• Magnitude Reasonable: {'Yes' if evaluation.magnitude_reasonable else 'No'}\n"
            
            # Show evaluation feedback
            feedback += f"\n**Evaluator Feedback**: {evaluation.feedback_message}\n"
            
            # Show improvement suggestions
            if evaluation.improvement_suggestions:
                feedback += f"\n**Suggested Improvements**: {evaluation.improvement_suggestions}\n"
            
            # Show truncated previous solution
            last_attempt = previous_attempts[-1]
            truncated = last_attempt[:400] + "..." if len(last_attempt) > 400 else last_attempt
            feedback += f"\n**Previous Solution (REJECTED)**:\n```\n{truncated}\n```\n"
            feedback += "\n**CRITICAL INSTRUCTIONS**: Address the above issues in your new solution. Avoid repeating the same mistakes.\n"
        
        return feedback
    
    def _format_generation_prompt(self, problem: Dict[str, Any], 
                                 previous_attempts: Optional[List[str]] = None,
                                 evaluation_feedback: Optional[EvaluationResult] = None) -> str:
        """Format prompt for physics solution generation with optional feedback"""
        language = problem.get("language", "English")
        lang_code = "zh" if language == "Chinese" else "en"
        
        # Extract problem details
        question = problem.get("question", "")
        level = problem.get("level", 1)
        category = problem.get("img_category", "")
        subject_code = problem.get("subject", "")
        sig_figs = problem.get("sig_figs", "")
        caption = problem.get("caption", "")
        
        # Get contexts
        subject_context, subject_guidance = self._get_subject_context(subject_code, lang_code)
        category_expertise = self._get_category_expertise(category, lang_code)
        
        # Get examples if available
        examples_text = ""
        if self.example_manager:
            examples = self.example_manager.find_best_examples(problem, max_examples=3)
            if examples:
                self.logger.info(f"✅ Injecting {len(examples)} examples for problem {problem.get('index', 'Unknown')}.")
                # Log details of the first example for verification
                first_example = examples[0]
                example_log_message = (
                    f"  -> Example 1 ({first_example.get('subject')}/{first_example.get('img_category')}): "
                    f"Question snippet: '{first_example.get('question', '')}'"
                    f"Answer snippet: '{first_example.get('answer', '')}'"
                )
                self.logger.info(example_log_message)

                examples_text = self.example_manager.format_examples_for_prompt(examples, language)
            else:
                self.logger.info(f"❌ No examples found for problem {problem.get('index', 'Unknown')}. Using empty examples.")
        # Get feedback from previous attempts
        feedback_text = ""
        if previous_attempts and evaluation_feedback:
            feedback_text = self._format_previous_attempts_feedback(previous_attempts, evaluation_feedback, language)
        
        # Format components
        if lang_code == "zh":
            prompt = f"""你是一位专业的物理学导师，具有深厚的物理学知识。

**问题信息：**
• 难度等级：{level}/10
• 物理学科：{subject_context}
• 问题类别：{category}
• 语言：{language}
{f"• 图像说明：{caption}" if caption else ""}
{f"• 有效数字要求：精确到 {sig_figs} 位有效数字" if sig_figs else ""}

**专业指导：**
• 学科重点：{subject_guidance}
• 类别专长：{category_expertise}
{feedback_text}
{examples_text}

**问题：**
{question}

**分析要求：**
请提供完整的物理解答，包括：
1. 清晰的物理分析过程
2. 准确的数学推导
3. 最终的数值答案（带适当有效数字）

请用以下格式回答：
<think>
[在此提供详细的物理分析和推导过程]
</think>

<answer>
[在此提供最终的数值答案]
</answer>"""
        else:
            prompt = f"""You are an expert physics tutor with deep knowledge across all physics domains.

**PROBLEM INFORMATION:**
• Difficulty Level: {level}/10
• Physics Subject: {subject_context}
• Problem Category: {category}
• Language: {language}
{f"• Image Caption: {caption}" if caption else ""}
{f"• Significant Figures: Express answer to exactly {sig_figs} significant figures" if sig_figs else ""}

**EXPERT GUIDANCE:**
• Subject Focus: {subject_guidance}
• Category Expertise: {category_expertise}
{feedback_text}
{examples_text}

**PROBLEM:**
{question}

**ANALYSIS REQUIREMENTS:**
Provide a complete physics solution including:
1. Clear physical analysis process
2. Accurate mathematical derivations
3. Final numerical answer (with appropriate significant figures)

Please respond in the following format:
<think>
[Provide detailed physics analysis and derivation process here]
</think>

<answer>
[Provide final numerical answer here]
</answer>"""
        
        return prompt
    
    def _prepare_generation_content(self, problem: Dict[str, Any], images_base_path: str = "",
                                  previous_attempts: Optional[List[str]] = None,
                                  evaluation_feedback: Optional[EvaluationResult] = None) -> List[Dict]:
        """Prepare content for generation including images and feedback"""
        prompt = self._format_generation_prompt(problem, previous_attempts, evaluation_feedback)
        content = [{"type": "text", "text": prompt}]
        
        # Add images if present
        vision_relevance = problem.get('vision_relevance', '')
        if vision_relevance in ['necessary', 'helpful', 'optional']:
            image_paths = problem.get('image_path', [])
            
            for img_path in image_paths:
                full_path = os.path.join(images_base_path, img_path) if images_base_path else img_path
                
                if os.path.exists(full_path):
                    try:
                        base64_image = self._encode_image_to_base64(full_path)
                        if base64_image:
                            mime_type = self._get_image_mime_type(full_path)
                            content.append({
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:{mime_type};base64,{base64_image}",
                                    "detail": "high"
                                }
                            })
                            self.logger.debug(f"✓ Added image: {img_path}")
                    except Exception as e:
                        self.logger.warning(f"⚠️ Failed to load image {img_path}: {e}")
        
        return content
    
    def _encode_image_to_base64(self, image_path: str) -> Optional[str]:
        """Encode image to base64"""
        try:
            with open(image_path, "rb") as image_file:
                return base64.b64encode(image_file.read()).decode('utf-8')
        except Exception as e:
            self.logger.error(f"Error encoding image {image_path}: {e}")
            return None
    
    def _get_image_mime_type(self, file_path: str) -> str:
        """Get MIME type for image"""
        ext = os.path.splitext(file_path)[1].lower()
        mime_types = {
            '.png': 'image/png',
            '.jpg': 'image/jpeg',
            '.jpeg': 'image/jpeg',
            '.gif': 'image/gif',
            '.webp': 'image/webp'
        }
        return mime_types.get(ext, 'image/png')
    
    async def generate_solution(self, problem: Dict[str, Any], images_base_path: str = "",
                         previous_attempts: Optional[List[str]] = None,
                         evaluation_feedback: Optional[EvaluationResult] = None) -> str:
        """Ask the chat model for a solution to ``problem`` (async).

        The prompt, optional images and optional feedback from a rejected
        attempt are packed into a single human message.

        Returns:
            The ``content`` of the model response. When anything fails the
            error is logged and the string ``"Generation error: <message>"``
            is returned instead of raising, so callers must be prepared to
            receive that text in place of a solution.
        """
        try:
            message_parts = self._prepare_generation_content(
                problem, images_base_path, previous_attempts, evaluation_feedback
            )
            user_message = HumanMessage(content=message_parts)

            # ainvoke is the awaitable variant of the client call
            reply = await self.client.ainvoke([user_message])
            return reply.content

        except Exception as e:
            self.logger.error(f"Generation failed: {e}")
            return f"Generation error: {str(e)}"

class AsyncPhysicsEvaluator:
    """Async implementation of physics solution evaluator

    Sends a problem, a candidate solution and any attached images to a chat
    model, requests a structured ``EvaluationResult`` and then re-derives the
    accept/reject decision locally from the reported confidence and error
    lists (see ``_apply_decision_logic``).
    """

    # Minimum confidence required to accept a solution.
    CONFIDENCE_THRESHOLD = 0.6
    # Threshold applied to advanced (level >= ADVANCED_LEVEL) and vision problems.
    # NOTE(review): this equals the base threshold, so the adjustment is a
    # no-op, although the system prompts ask for a 10% increase - confirm the
    # intended value before changing it.
    ADVANCED_CONFIDENCE_THRESHOLD = 0.6
    ADVANCED_LEVEL = 7
    # Lists on ``EvaluationResult.physics_errors`` whose entries count as major errors.
    MAJOR_ERROR_FIELDS = (
        "physics_theorem_errors",
        "condition_analysis_errors",
        "process_understanding_errors",
        "calculation_errors",
        "variable_relationship_errors",
        "diagram_analysis_errors",
    )

    def __init__(self, api_key: str, model: str = "gpt-4o",
                 logger: Optional[logging.Logger] = None):
        """Create the chat client used for evaluation.

        Args:
            api_key: API key passed to ``ChatOpenAI``.
            model: Model name passed to ``ChatOpenAI``.
            logger: Logger to use; defaults to this module's logger.
        """
        # Timeout / retry settings are intentionally left at the client defaults.
        self.client = ChatOpenAI(
            api_key=api_key,
            model=model,
            # timeout=120,  # 2 minute timeout
            # max_retries=2
        )
        self.logger = logger or logging.getLogger(__name__)

    def _get_evaluator_system_prompt(self, language: str = "English") -> str:
        """Get system prompt for evaluation

        Returns the Chinese prompt when ``language == "Chinese"`` and the
        English prompt for every other value.
        """
        # NOTE(review): the English prompt lists a "30-50% cautiously accept"
        # rule but its decision logic rejects below 60%, and the Chinese prompt
        # uses 50%. The code-side threshold is CONFIDENCE_THRESHOLD. The prompt
        # text is left untouched because changing it alters model behaviour.
        if language == "Chinese":
            return """你是一位专业的物理学专家和解答评估师。你的任务是评估从零生成的物理解答质量。

**评估目标：** 决定是否接受生成的解答，或需要重新生成。

**智能接受标准：**
- **绝不接受信心 < 30%** 的解答（必须重新生成）
- **谨慎接受信心 30-50%** 的解答（仅当物理原理正确且无重大错误时）
- **优先接受信心 > 50%** 的解答（即使有小的格式问题）
- **高级问题（L7+）和视觉问题**：信心阈值提高10%

**物理错误分类：**
- 物理定理错误：应用错误的定律/公式
- 条件分析错误：误识别力、边界、系统设置
- 过程理解错误：误解物理现象发展过程
- 计算错误：数学推导中的错误
- 变量关系错误：量之间的错误依赖关系
- 图表分析错误：误读视觉信息
- 边界条件错误：忽略约束/限制

**决策逻辑：**
- 接受：信心 ≥ 50% 且无重大物理错误，或信心 30-50% 且解答基本正确
- 拒绝：信心 < 30%，或有重大物理原理错误，或答案明显不合理

请诚实评估你的信心水平，这直接影响接受/拒绝决定。"""

        else:
            return """You are an expert physics specialist and solution evaluator. Your task is to assess the quality of from-scratch generated physics solutions.

**EVALUATION GOAL:** Decide whether to accept the generated solution or regenerate.

**Smart Acceptance Criteria:**
- **NEVER ACCEPT confidence < 30%** solutions (must regenerate)
- **CAUTIOUSLY ACCEPT confidence 30-50%** solutions (only if physics principles correct with no major errors)
- **READILY ACCEPT confidence > 50%** solutions (even with minor formatting issues)
- **Advanced problems (L7+) and vision problems**: Raise confidence thresholds by 10%

**Physics Error Classification:**
- Physics theorem errors: Wrong laws/formulas applied
- Condition analysis errors: Misidentified forces, boundaries, system setup
- Process understanding errors: Misunderstood physical phenomena development
- Calculation errors: Mathematical derivation mistakes
- Variable relationship errors: Wrong dependencies between quantities
- Diagram analysis errors: Misread visual information
- Boundary conditions errors: Ignored constraints/limits

**Decision Logic:**
- REJECT: Confidence < 60% OR any major physics errors
- ACCEPT: Confidence ≥ 60% AND zero major errors

Please honestly assess your confidence level - this directly affects accept/reject decision."""

    def _format_evaluation_prompt(self, problem: Dict[str, Any], solution: str) -> str:
        """Format evaluation prompt

        Builds the user-facing text (Chinese or English, chosen from
        ``problem["language"]``) that presents the problem metadata and the
        candidate ``solution`` to the evaluator model.
        """
        language = problem.get("language", "English")

        # Get problem details
        question = problem.get("question", "N/A")
        level = problem.get("level", "Unknown")
        category = problem.get("img_category", "Unknown")
        subject_code = problem.get("subject", "")
        caption = problem.get("caption", "")
        sig_figs = problem.get("sig_figs", "")

        # Format subject info (only for subject codes known to PHYSICS_SUBJECTS)
        subject_info = ""
        if subject_code and subject_code in PHYSICS_SUBJECTS:
            subject_name = PHYSICS_SUBJECTS[subject_code]
            if language == "Chinese":
                subject_info = f"\n物理学科: {subject_code} ({subject_name})"
            else:
                subject_info = f"\nPhysics Subject: {subject_code} ({subject_name})"

        # Format caption info
        caption_info = ""
        if caption:
            if language == "Chinese":
                caption_info = f"\n图像说明: {caption}"
            else:
                caption_info = f"\nImage Caption: {caption}"

        if language == "Chinese":
            prompt = f"""**物理问题：**
问题: {question}
难度等级: {level}/10
问题类别: {category}{subject_info}
语言: {language}{caption_info}
有效数字要求: {sig_figs if sig_figs else "未指定"}

**生成的解答：**
{solution}

**评估任务：**
评估这个从零生成的物理解答质量。决定是否接受此解答或需要重新生成。

**考虑因素：**
1. 物理原理应用的正确性（特别是{subject_code}学科原理）
2. 数学推导和计算的准确性
3. 推理过程的逻辑性和完整性
4. 最终答案的合理性（数量级、单位、方向）
5. 如果有图像：视觉信息的正确解释
6. 有效数字要求的遵守（如果指定）
7. 整体解答质量和清晰度

请提供详细的评估结果。"""
        else:
            prompt = f"""**PHYSICS PROBLEM:**
Question: {question}
Level: {level}/10
Category: {category}{subject_info}
Language: {language}{caption_info}
Required Significant Figures: {sig_figs if sig_figs else "Not specified"}

**GENERATED SOLUTION:**
{solution}

**EVALUATION TASK:**
Assess the quality of this from-scratch generated physics solution. Decide whether to accept or regenerate.

**Consider:**
1. Correctness of physics principle applications (especially {subject_code} subject principles)
2. Accuracy of mathematical derivations and calculations
3. Logical consistency and completeness of reasoning
4. Reasonableness of final answer (magnitude, units, direction)
5. If images provided: correct interpretation of visual information
6. Compliance with significant figures requirements (if specified)
7. Overall solution quality and clarity

Please provide detailed evaluation results."""

        return prompt

    @staticmethod
    def _normalize_image_paths(raw_paths: Any) -> List[str]:
        """Coerce the ``image_path`` field into a list of non-empty strings.

        A single string becomes a one-element list (iterating it directly
        would yield one "path" per character). Values that are neither a
        string nor a list/tuple (e.g. ``None`` or NaN) yield an empty list.
        """
        if isinstance(raw_paths, str):
            return [raw_paths] if raw_paths else []
        if isinstance(raw_paths, (list, tuple)):
            return [path for path in raw_paths if isinstance(path, str) and path]
        return []

    def _prepare_evaluation_content(self, problem: Dict[str, Any], solution: str, images_base_path: str = "") -> List[Dict]:
        """Prepare evaluation content including images

        Returns a list whose first entry is the text prompt, followed by one
        ``image_url`` entry (base64 data URL) per readable image. Images are
        attached only when ``vision_relevance`` is necessary/helpful/optional.
        """
        prompt = self._format_evaluation_prompt(problem, solution)
        content = [{"type": "text", "text": prompt}]

        vision_relevance = problem.get('vision_relevance', '')
        if vision_relevance not in ['necessary', 'helpful', 'optional']:
            return content

        for img_path in self._normalize_image_paths(problem.get('image_path', [])):
            full_path = os.path.join(images_base_path, img_path) if images_base_path else img_path

            if not os.path.exists(full_path):
                # Surface missing files so a text-only evaluation is not a silent surprise.
                self.logger.warning(f"⚠️ Image not found, skipping: {full_path}")
                continue

            try:
                base64_image = self._encode_image_to_base64(full_path)
                if base64_image:
                    mime_type = self._get_image_mime_type(full_path)
                    content.append({
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{base64_image}",
                            "detail": "high"
                        }
                    })
            except Exception as e:
                self.logger.warning(f"⚠️ Failed to load image {img_path}: {e}")
        return content

    def _encode_image_to_base64(self, image_path: str) -> Optional[str]:
        """Encode image to base64; returns None (and logs) when reading fails"""
        try:
            with open(image_path, "rb") as image_file:
                return base64.b64encode(image_file.read()).decode('utf-8')
        except Exception as e:
            # Log instead of swallowing silently so unreadable images can be diagnosed.
            self.logger.error(f"Error encoding image {image_path}: {e}")
            return None

    def _get_image_mime_type(self, file_path: str) -> str:
        """Get MIME type for image from its extension (defaults to image/png)"""
        ext = os.path.splitext(file_path)[1].lower()
        mime_types = {
            '.png': 'image/png',
            '.jpg': 'image/jpeg',
            '.jpeg': 'image/jpeg',
            '.gif': 'image/gif',
            '.webp': 'image/webp'
        }
        return mime_types.get(ext, 'image/png')

    async def evaluate_solution(self, problem: Dict[str, Any], solution: str, images_base_path: str = "") -> "EvaluationResult":
        """Evaluate physics solution (async)

        Returns the model's structured ``EvaluationResult`` after the local
        decision logic has been applied. If anything fails, the error is
        logged and a neutral ACCEPT result (scores 0.5) is returned.
        """
        try:
            content = self._prepare_evaluation_content(problem, solution, images_base_path)
            system_prompt = self._get_evaluator_system_prompt(problem.get("language", "English"))

            # Use ainvoke for async call
            response = await self.client.with_structured_output(EvaluationResult).ainvoke([
                SystemMessage(content=system_prompt),
                HumanMessage(content=content)
            ])

            # Apply confidence and error-based decision logic
            return self._apply_decision_logic(response, problem)

        except Exception as e:
            self.logger.error(f"Evaluation failed: {e}")
            # Default to accept to avoid infinite loops
            return EvaluationResult(
                decision=EvaluationDecision.ACCEPT,
                confidence_score=0.5,
                quality_score=0.5,
                physics_errors=PhysicsErrorAnalysis(),
                answer_consistency=True,
                magnitude_reasonable=True,
                feedback_message=f"Evaluation failed: {e}"
            )

    def _apply_decision_logic(self, result: "EvaluationResult", problem: Dict[str, Any]) -> "EvaluationResult":
        """Apply enhanced decision logic with confidence thresholds

        Overwrites ``result.decision``: REJECT when the confidence is below
        the threshold or any major error is listed, ACCEPT otherwise. The
        reason is appended to ``result.feedback_message`` tagged ``Override``
        (decision changed) or ``Confirmed`` (decision unchanged).
        """
        original_decision = result.decision
        confidence = result.confidence_score
        vision_relevance = problem.get("vision_relevance", "")

        # The level may arrive as a string or be missing/None; a failed
        # comparison here would discard the whole evaluation in the caller.
        try:
            level = float(problem.get("level", 1))
        except (TypeError, ValueError):
            level = 1.0

        # Count major physics errors
        errors = result.physics_errors
        major_errors = sum(len(getattr(errors, name)) for name in self.MAJOR_ERROR_FIELDS)

        # Adjust thresholds for advanced/vision problems
        confidence_threshold = self.CONFIDENCE_THRESHOLD
        if level >= self.ADVANCED_LEVEL or vision_relevance in ['necessary', 'helpful']:
            confidence_threshold = self.ADVANCED_CONFIDENCE_THRESHOLD

        # Collect rejection reasons
        rejection_reasons = []
        if confidence < confidence_threshold:
            rejection_reasons.append(f"Low confidence ({confidence:.2f} < {confidence_threshold})")
        if major_errors > 0:
            rejection_reasons.append(f"{major_errors} major physics error(s)")

        # Make decision based on rejection reasons
        if rejection_reasons:
            final_decision = EvaluationDecision.REJECT
            override_reason = " + ".join(rejection_reasons)
        else:
            final_decision = EvaluationDecision.ACCEPT
            override_reason = f"Good confidence ({confidence:.2f}) + no major errors"

        # Update result with final decision
        result.decision = final_decision

        tag = "Override" if original_decision != final_decision else "Confirmed"
        # Tolerate a missing feedback message from the model.
        result.feedback_message = (result.feedback_message or "") + f" [{tag}: {override_reason}]"

        return result

class AsyncFromScratchSolver:
    """Async implementation of from-scratch physics solver with example injection and proper logging

    Runs a generate -> evaluate loop per problem: a rejected solution is
    regenerated with the evaluator's feedback until it is accepted or
    ``max_iterations`` cycles have been used.
    """

    # Lists on ``EvaluationResult.physics_errors`` that are counted when reporting errors.
    _MAJOR_ERROR_FIELDS = (
        "physics_theorem_errors",
        "condition_analysis_errors",
        "process_understanding_errors",
        "calculation_errors",
        "variable_relationship_errors",
        "diagram_analysis_errors",
    )

    def __init__(self, api_key: str,
                 generator_model: str = "o3-2025-04-16",
                 evaluator_model: str = "gpt-4o",
                 max_iterations: int = 3,
                 images_base_path: str = "",
                 labeled_samples_path: str = "",
                 logger: Optional[logging.Logger] = None,
                 verbose: bool = True):
        """Build the generator and evaluator used by the solving loop.

        Args:
            api_key: API key handed to both the generator and the evaluator.
            generator_model: Model name for ``AsyncPhysicsGenerator``.
            evaluator_model: Model name for ``AsyncPhysicsEvaluator``.
            max_iterations: Maximum generate/evaluate cycles per problem.
            images_base_path: Base directory passed on for image lookup.
            labeled_samples_path: Labeled-examples path passed to the generator.
            logger: Logger to use; defaults to this module's logger.
            verbose: Stored on the instance; not read by this class.
        """
        self.generator = AsyncPhysicsGenerator(api_key, generator_model, labeled_samples_path, logger)
        self.evaluator = AsyncPhysicsEvaluator(api_key=api_key, model=evaluator_model, logger=logger)
        self.max_iterations = max_iterations
        self.images_base_path = images_base_path
        self.logger = logger or logging.getLogger(__name__)
        self.verbose = verbose

    @classmethod
    def _count_major_errors(cls, errors: Any) -> int:
        """Total number of entries across the major error lists of ``errors``."""
        return sum(len(getattr(errors, name)) for name in cls._MAJOR_ERROR_FIELDS)

    async def _generator_step(self, state: "SolutionState") -> "SolutionState":
        """Generator step with feedback-enhanced regeneration (async)

        Stores the new solution in ``state["current_solution"]`` and appends
        it to ``state["generation_history"]``. On an unexpected failure the
        current solution is set to a ``"Generation error: ..."`` string.
        """
        try:
            # A non-empty history means at least one earlier attempt was rejected.
            is_regeneration = len(state["generation_history"]) > 0

            if is_regeneration:
                attempt_num = len(state["generation_history"]) + 1
                self.logger.info(f"🔄 Regeneration attempt {attempt_num} with feedback for problem {state['problem'].get('index', 'Unknown')}")
                previous_attempts = state["generation_history"]
                evaluation_feedback = state["evaluation_result"]
            else:
                self.logger.info(f"🆕 Initial generation attempt for problem {state['problem'].get('index', 'Unknown')}")
                previous_attempts = None
                evaluation_feedback = None

            # Generate solution with optional feedback
            solution = await self.generator.generate_solution(
                state["problem"],
                self.images_base_path,
                previous_attempts=previous_attempts,
                evaluation_feedback=evaluation_feedback
            )

            state["current_solution"] = solution
            state["generation_history"].append(solution)

            # Enhanced logging for regeneration
            if is_regeneration and evaluation_feedback:
                error_count = self._count_major_errors(evaluation_feedback.physics_errors)
                self.logger.info(f"✅ Regenerated solution addressing {error_count} previous errors")
                self.logger.info(f"   Previous confidence: {evaluation_feedback.confidence_score:.2f}")
            else:
                self.logger.info(f"✅ Generated solution for problem {state['problem'].get('index', 'Unknown')}")

        except Exception as e:
            self.logger.error(f"❌ Generation failed: {e}")
            state["current_solution"] = f"Generation error: {str(e)}"

        return state

    async def _evaluator_step(self, state: "SolutionState") -> "SolutionState":
        """Evaluator step (async)

        Stores an ``EvaluationResult`` in ``state["evaluation_result"]``:
        a REJECT when there is no solution, the evaluator's result otherwise,
        or a neutral ACCEPT (scores 0.5) if the evaluation itself raises.
        """
        if not state["current_solution"]:
            state["evaluation_result"] = EvaluationResult(
                decision=EvaluationDecision.REJECT,
                confidence_score=0.0,
                quality_score=0.0,
                physics_errors=PhysicsErrorAnalysis(),
                answer_consistency=False,
                magnitude_reasonable=False,
                feedback_message="No solution provided"
            )
            return state

        try:
            result = await self.evaluator.evaluate_solution(
                state["problem"],
                state["current_solution"],
                self.images_base_path
            )

            state["evaluation_result"] = result

            # Count major errors for reporting
            major_errors = self._count_major_errors(result.physics_errors)

            self.logger.info(f"📊 Evaluation: {result.decision.value.upper()} "
                           f"(Confidence: {result.confidence_score:.2f}, "
                           f"Quality: {result.quality_score:.2f}, "
                           f"Errors: {major_errors})")

        except Exception as e:
            self.logger.warning(f"⚠️ Evaluation failed: {e}")
            state["evaluation_result"] = EvaluationResult(
                decision=EvaluationDecision.ACCEPT,
                confidence_score=0.5,
                quality_score=0.5,
                physics_errors=PhysicsErrorAnalysis(),
                answer_consistency=True,
                magnitude_reasonable=True,
                feedback_message=f"Evaluation failed: {e}"
            )

        return state

    def _routing_logic(self, state: "SolutionState") -> str:
        """Routing logic after evaluation

        Returns:
            ``"accept"`` when the latest evaluation accepted the solution,
            ``"max_iterations"`` when the attempt budget is exhausted,
            ``"reject"`` otherwise (regenerate).
        """
        # Acceptance is checked first so that a solution accepted on the final
        # attempt is not reported as having run out of iterations.
        evaluation = state["evaluation_result"]
        if evaluation and evaluation.decision == EvaluationDecision.ACCEPT:
            return "accept"

        # The history is not extended when a generation step fails, so the
        # cycle counter is also consulted to guarantee the loop terminates.
        attempts = max(len(state["generation_history"]), state.get("iteration_count", 0))
        if attempts >= state["max_iterations"]:
            self.logger.warning(f"⚠️ Max iterations ({state['max_iterations']}) reached for problem {state['problem'].get('index', 'Unknown')}")
            return "max_iterations"

        return "reject"

    async def solve_problem(self, problem: Dict[str, Any]) -> Dict[str, Any]:
        """Solve a single problem (async)

        Returns a copy of ``problem`` extended with ``prediction``,
        ``iterations_used`` (generate/evaluate cycles run),
        ``total_generations``, ``generation_history`` and the final
        evaluation fields. If the workflow raises, the copy carries a
        ``"Workflow failed: ..."`` prediction and an ``error`` entry instead.
        """
        initial_state = {
            "problem": problem,
            "current_solution": None,
            "evaluation_result": None,
            "final_solution": None,
            "iteration_count": 0,
            "max_iterations": self.max_iterations,
            "generation_history": [],
            "messages": []
        }

        try:
            # Simple async workflow loop
            state = initial_state

            while True:
                # Count every cycle so iterations_used reflects the real number of rounds.
                state["iteration_count"] += 1

                state = await self._generator_step(state)
                state = await self._evaluator_step(state)

                decision = self._routing_logic(state)
                if decision in ("accept", "max_iterations"):
                    state["final_solution"] = state["current_solution"]
                    break
                # Any other decision ("reject") loops again for regeneration.

            result = problem.copy()
            result["prediction"] = state["final_solution"]
            result["iterations_used"] = state["iteration_count"]
            result["total_generations"] = len(state["generation_history"])
            result["generation_history"] = state["generation_history"]

            if state["evaluation_result"]:
                eval_result = state["evaluation_result"]
                result["final_decision"] = eval_result.decision.value
                result["confidence_score"] = eval_result.confidence_score
                result["quality_score"] = eval_result.quality_score
                result["evaluation_feedback"] = eval_result.feedback_message
            else:
                result["final_decision"] = "unknown"

            return result

        except Exception as e:
            self.logger.error(f"❌ Workflow failed for problem {problem.get('index', 'Unknown')}: {e}")
            result = problem.copy()
            result["prediction"] = f"Workflow failed: {str(e)}"
            result["error"] = str(e)
            return result

async def solve_batch_async(solver: "AsyncFromScratchSolver", batch: List[Dict[str, Any]],
                            timeout: float = 12000) -> List[Dict[str, Any]]:
    """Solve a batch of problems asynchronously with timeout protection

    Args:
        solver: Object exposing ``solve_problem(problem)`` (a coroutine) and ``logger``.
        batch: Problems to solve concurrently.
        timeout: Seconds allowed for the whole batch (default 12000 s = 200 minutes).

    Returns:
        One result dict per problem, in ``batch`` order. Problems that were
        still running at the timeout get a ``"Batch timeout error"``
        prediction; problems that finished in time keep their real result;
        problems whose task raised get an ``"Async error: ..."`` prediction.
    """
    if not batch:
        return []

    tasks = [asyncio.ensure_future(solver.solve_problem(problem)) for problem in batch]

    try:
        # asyncio.wait (unlike wait_for + gather) keeps the results of tasks
        # that already finished when the timeout expires.
        _, pending = await asyncio.wait(tasks, timeout=timeout)
    except asyncio.CancelledError:
        # asyncio.wait does not cancel its tasks when the waiter is cancelled.
        for task in tasks:
            task.cancel()
        raise

    timed_out = set(pending)
    if timed_out:
        for task in timed_out:
            task.cancel()
        # Let the cancellations settle so no task is left running in the background.
        await asyncio.gather(*timed_out, return_exceptions=True)
        solver.logger.error(
            f"❌ Batch timed out after {timeout / 60:g} minutes "
            f"({len(timed_out)}/{len(batch)} problems unfinished)"
        )

    processed_results = []
    for problem, task in zip(batch, tasks):
        failure = None
        if task.cancelled():
            if task in timed_out:
                timeout_result = problem.copy()
                timeout_result["prediction"] = "Batch timeout error"
                timeout_result["error"] = "Batch processing timed out"
                processed_results.append(timeout_result)
                continue
            failure = "Task was cancelled"
        elif task.exception() is not None:
            failure = task.exception()

        if failure is not None:
            # Convert the failure into an error result so callers always get dicts.
            error_result = problem.copy()
            error_result["prediction"] = f"Async error: {str(failure)}"
            error_result["error"] = str(failure)
            processed_results.append(error_result)
        else:
            processed_results.append(task.result())

    return processed_results

def _save_results_atomically(output_file: str, results: List[Dict[str, Any]]) -> None:
    """Write ``results`` as JSON to ``output_file`` via a temp file + rename.

    The previous file stays intact until the new one is completely written,
    so an interrupted save cannot corrupt the file used for resuming.
    """
    temp_file = f"{output_file}.tmp"
    with open(temp_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    os.replace(temp_file, output_file)


def _record_result_stats(stats: Dict[str, Any], result: Dict[str, Any]) -> None:
    """Fold one solved-problem ``result`` into the running ``stats`` counters."""
    accepted = result.get('final_decision', 'unknown') == 'accept'
    stats['accepted' if accepted else 'rejected'] += 1

    stats['total_iterations'] += result.get('iterations_used', 0)
    stats['total_generations'] += result.get('total_generations', 0)
    stats['confidence_sum'] += result.get('confidence_score', 0)
    stats['quality_sum'] += result.get('quality_score', 0)

    # The result dict carries all problem fields, so the groupings come from it.
    groupings = (
        ('by_subject', result.get('subject', 'Unknown')),
        ('by_category', result.get('img_category', 'Unknown')),
        ('by_level', result.get('level', 0)),
    )
    for stats_key, group in groupings:
        bucket = stats[stats_key].setdefault(group, {'accepted': 0, 'total': 0})
        bucket['total'] += 1
        if accepted:
            bucket['accepted'] += 1


async def run_async_from_scratch_solving(problems: List[Dict[str, Any]], 
                                  output_file: str = "total_physics_results.json",
                                  images_base_path: str = "",
                                  labeled_samples_path: str = "",
                                  generator_model: str = MODEL_ID,
                                  evaluator_model: str = MODEL_ID,
                                  max_iterations: int = 3,
                                  log_file: str = "total_physics_solver.log",
                                  verbose: bool = True,
                                  console_verbosity: str = "minimal",
                                  batch_size: int = 5,
                                  inter_batch_delay: float = 120) -> List[Dict]:
    """
    Main async function to solve physics problems in batches

    Results already present in ``output_file`` are loaded and their problems
    (matched by ``index``) are skipped, so an interrupted run can be resumed.
    Progress is saved after every batch.

    Args:
        problems: List of problems to solve
        output_file: Output JSON file path
        images_base_path: Base path for images
        labeled_samples_path: Path to labeled examples JSON
        generator_model: Model for generation
        evaluator_model: Model for evaluation
        max_iterations: Max regeneration attempts
        log_file: Log file path
        verbose: Whether to log to console
        console_verbosity: "full", "minimal", or "silent" console output
        batch_size: Number of problems to process in parallel
        inter_batch_delay: Seconds to pause between batches to limit rate
            limiting; no pause is made after the final batch

    Returns:
        Existing plus newly produced results, sorted by ``index``.

    Raises:
        ValueError: If the OPENAI_API_KEY environment variable is not set.
    """

    # Setup logging
    logger = setup_logging(log_file, verbose and console_verbosity != "silent")

    # Check for existing results
    existing_results = []
    processed_indices = set()

    if os.path.exists(output_file):
        logger.info(f"📁 Found existing results: {output_file}")
        try:
            with open(output_file, 'r', encoding='utf-8') as f:
                existing_results = json.load(f)
            processed_indices = {result.get('index') for result in existing_results}
            logger.info(f"✅ Loaded {len(existing_results)} existing results")
        except Exception as e:
            logger.error(f"⚠️ Error loading existing results: {e}")
            existing_results = []
            processed_indices = set()

    # Filter problems to solve
    problems_to_solve = [p for p in problems if p.get('index') not in processed_indices]

    logger.info(f"🎯 Problems to solve: {len(problems_to_solve)}")
    logger.info(f"📊 Total: {len(problems)}, Done: {len(processed_indices)}, Remaining: {len(problems_to_solve)}")
    logger.info(f"⚡ Batch size: {batch_size}")

    if labeled_samples_path:
        logger.info(f"🔍 Using labeled examples from: {labeled_samples_path}")
    else:
        logger.info("📝 No labeled examples provided - using base prompts only")

    if len(problems_to_solve) == 0:
        logger.info("🎉 All problems already processed!")
        return existing_results

    # Initialize solver
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise ValueError("OPENAI_API_KEY environment variable not set")

    solver = AsyncFromScratchSolver(
        api_key=api_key,
        generator_model=generator_model,
        evaluator_model=evaluator_model,
        max_iterations=max_iterations,
        images_base_path=images_base_path,
        labeled_samples_path=labeled_samples_path,
        logger=logger,
        verbose=verbose
    )

    # Progress tracking
    new_results = []
    stats = {
        'accepted': 0,
        'rejected': 0,
        'total_iterations': 0,
        'total_generations': 0,
        'confidence_sum': 0,
        'quality_sum': 0,
        'by_subject': {},
        'by_category': {},
        'by_level': {}
    }

    # Create batches
    batches = [problems_to_solve[i:i + batch_size] for i in range(0, len(problems_to_solve), batch_size)]

    # Use tqdm for progress bar
    progress_bar = tqdm(batches, desc="Processing Batches", 
                       disable=(console_verbosity == "silent"))

    batch_number = 0
    for batch in progress_bar:
        batch_number += 1

        # Update progress bar description
        progress_bar.set_description(f"Batch {batch_number}/{len(batches)} (Size: {len(batch)})")

        logger.info(f"\n🚀 Processing batch {batch_number}/{len(batches)} with {len(batch)} problems")

        try:
            # Process batch asynchronously
            batch_results = await solve_batch_async(solver, batch)
            new_results.extend(batch_results)

            # Update statistics for this batch
            for result in batch_results:
                _record_result_stats(stats, result)

            # Update progress bar postfix
            if console_verbosity == "minimal":
                current_accept_rate = stats['accepted'] / (stats['accepted'] + stats['rejected']) * 100 if (stats['accepted'] + stats['rejected']) > 0 else 0
                avg_confidence = stats['confidence_sum'] / len(new_results) if len(new_results) > 0 else 0
                progress_bar.set_postfix({
                    'Accept Rate': f"{current_accept_rate:.1f}%",
                    'Avg Conf': f"{avg_confidence:.2f}",
                    'Completed': len(new_results)
                })

            logger.info(f"✅ Batch {batch_number} completed: {len(batch_results)} results")

            # Save progress after each batch
            combined_results = existing_results + new_results
            combined_results.sort(key=lambda x: x.get('index', 0))
            _save_results_atomically(output_file, combined_results)

            logger.info(f"💾 Progress saved: {len(combined_results)} total results")

            # Pause between batches to prevent rate limiting; pointless after the last one.
            if inter_batch_delay > 0 and batch_number < len(batches):
                await asyncio.sleep(inter_batch_delay)

        except asyncio.CancelledError:
            # Handle cancellation gracefully
            logger.warning(f"⚠️ Batch {batch_number} was cancelled")
            break

        except Exception as e:
            logger.error(f"❌ Batch {batch_number} failed: {e}")
            # Continue with next batch
            continue

    progress_bar.close()

    # Final save and analysis
    final_results = existing_results + new_results
    final_results.sort(key=lambda x: x.get('index', 0))
    _save_results_atomically(output_file, final_results)

    # Analytics
    total = len(new_results)
    if total > 0:
        accepted = stats['accepted']
        rejected = stats['rejected']
        avg_iterations = stats['total_iterations'] / total
        avg_generations = stats['total_generations'] / total
        avg_confidence = stats['confidence_sum'] / total
        avg_quality = stats['quality_sum'] / total

        logger.info(f"\n📊 Async Physics Solver Results:")
        logger.info(f"   Total Problems Processed: {total}")
        logger.info(f"   Accepted: {accepted} ({accepted/total*100:.1f}%)")
        logger.info(f"   Rejected (Max Iterations): {rejected} ({rejected/total*100:.1f}%)")
        logger.info(f"   Average Iterations: {avg_iterations:.1f}")
        logger.info(f"   Average Generations: {avg_generations:.1f}")
        logger.info(f"   Average Confidence: {avg_confidence:.2f}")
        logger.info(f"   Average Quality: {avg_quality:.2f}")

        # Subject breakdown
        logger.info(f"\n📋 Performance by Subject:")
        for subject, data in stats['by_subject'].items():
            rate = data['accepted'] / data['total'] * 100 if data['total'] > 0 else 0
            logger.info(f"   {subject}: {data['accepted']}/{data['total']} ({rate:.1f}%)")

        # Level breakdown
        logger.info(f"\n📈 Performance by Level:")
        for level in sorted(stats['by_level'].keys()):
            data = stats['by_level'][level]
            rate = data['accepted'] / data['total'] * 100 if data['total'] > 0 else 0
            logger.info(f"   Level {level}: {data['accepted']}/{data['total']} ({rate:.1f}%)")

    logger.info(f"\n✅ All results saved to: {output_file}")
    logger.info(f"📝 Detailed logs saved to: {log_file}")

    return final_results

# Async function for direct use in async contexts
async def run_async_solver_await(problems: List[Dict[str, Any]],
                                 output_file: str = "total_physics_results_async.json",
                                 images_base_path: str = "",
                                 labeled_samples_path: str = "",
                                 max_iterations: int = 3,
                                 batch_size: int = 5):
    """
    Awaitable entry point for the solver, for callers already inside a coroutine.

    Forwards its arguments to ``run_async_from_scratch_solving`` and returns
    that call's result; every other option keeps its default there.
    """
    solver_options = {
        "problems": problems,
        "images_base_path": images_base_path,
        "output_file": output_file,
        "labeled_samples_path": labeled_samples_path,
        "max_iterations": max_iterations,
        "batch_size": batch_size,
    }
    results = await run_async_from_scratch_solving(**solver_options)
    return results

# Blocking entry point: drives the async pipeline to completion from synchronous code
def run_async_solver(problems: List[Dict[str, Any]],
                    output_file: str = "total_physics_results_async.json",
                    images_base_path: str = "",
                    labeled_samples_path: str = "",
                    max_iterations: int = 3,
                    batch_size: int = 5):
    """
    Run the async solving pipeline synchronously and return its result.

    Builds the ``run_async_from_scratch_solving`` coroutine with every
    argument forwarded unchanged by keyword, then executes it with
    ``asyncio.run``. Intended for notebook use, where nest_asyncio is
    expected to have been applied so that ``asyncio.run`` can be called
    from a cell.

    Args:
        problems: List of problem dictionaries to pass to the solver.
        output_file: Forwarded as ``output_file``.
        images_base_path: Forwarded as ``images_base_path``.
        labeled_samples_path: Forwarded as ``labeled_samples_path``.
        max_iterations: Forwarded as ``max_iterations``.
        batch_size: Forwarded as ``batch_size``.

    Returns:
        Whatever ``run_async_from_scratch_solving`` returns.
    """
    # Create the coroutine first; nothing executes until asyncio.run drives it.
    pipeline_coro = run_async_from_scratch_solving(
        problems=problems,
        images_base_path=images_base_path,
        output_file=output_file,
        labeled_samples_path=labeled_samples_path,
        max_iterations=max_iterations,
        batch_size=batch_size,
    )
    return asyncio.run(pipeline_coro)

In [11]:
# Example usage: solve the full public problem set with the async solver.
if __name__ == "__main__":
    # Load the problems to solve
    problems = load_public_data()

    # Single source of truth for the starting-kit directory, used for both the
    # image base path and the labeled dev set. The trailing slash is kept so
    # the resulting strings match the paths that were previously spelled out.
    starting_kit_dir = "///mnt/c/Personal/Competitions/ICML_Track2/input/starting_kit_latest/"

    # Run the async solver; the returned value is kept in `results`
    results = run_async_solver(
        problems=problems,
        images_base_path=starting_kit_dir,
        output_file="total_physics_results_o3.json",
        labeled_samples_path=starting_kit_dir + "dev.json",
        max_iterations=3,
        batch_size=25,  # Process 25 problems at a time
    )

2025-06-13 08:11:36,045 - INFO - 📁 Found existing results: total_physics_results_o3.json
2025-06-13 08:11:36,078 - INFO - ✅ Loaded 440 existing results
2025-06-13 08:11:36,079 - INFO - 🎯 Problems to solve: 1560
2025-06-13 08:11:36,080 - INFO - 📊 Total: 2000, Done: 440, Remaining: 1560
2025-06-13 08:11:36,080 - INFO - ⚡ Batch size: 25
2025-06-13 08:11:36,081 - INFO - 🔍 Using labeled examples from: ///mnt/c/Personal/Competitions/ICML_Track2/input/starting_kit_latest/dev.json
2025-06-13 08:11:36,334 - INFO - ✅ Loaded 200 labeled examples


Processing Batches:   0%|          | 0/63 [00:00<?, ?it/s]

2025-06-13 08:11:36,372 - INFO - 
🚀 Processing batch 1/63 with 25 problems
2025-06-13 08:11:36,373 - INFO - 🆕 Initial generation attempt for problem 440
2025-06-13 08:11:36,373 - INFO - ✅ Injecting 3 examples for problem 440.
2025-06-13 08:11:36,373 - INFO -   -> Example 1 (EM/circuit_diagram): Question snippet: 'Fig. 7.1 shows the circuit diagram containing an operational amplifier (op-amp). Calculate the gain of the amplifier.'Answer snippet: '6.0'
2025-06-13 08:11:36,376 - INFO - 🆕 Initial generation attempt for problem 441
2025-06-13 08:11:36,376 - INFO - ✅ Injecting 3 examples for problem 441.
2025-06-13 08:11:36,377 - INFO -   -> Example 1 (CM/static_force_analysis): Question snippet: '将质量为 $m$ 的小球挂在倾角为 $\theta$ 的光滑斜面上，如图所示。当斜面以加速度 $a$ ，沿如图所示的方向运动时，求小球对斜面的正压力。'Answer snippet: '对斜面的正压力为 $F_{\mathrm{N}}=m(g \cos \theta-a \sin \theta)$'
2025-06-13 08:11:36,384 - INFO - 🆕 Initial generation attempt for problem 442
2025-06-13 08:11:36,384 - INFO - ✅ Injecting 3 examples for problem 44