In [None]:
# TIR-Enabled Ensemble with Self-Consistency
# Based on AIMO Progress Prize 1 & 2 winning approaches

# CONTROL: Set to True to test on reference.csv, False to submit
# When True, the final cell scores the solver on reference.csv and writes a
# placeholder submission.parquet; when False, it starts the inference server.
TEST_MODE = True  # Set to False before submission!

import os
import sys

# Environment switches are set before the third-party imports further down.
# NOTE(review): presumably these select the pure-Python protobuf backend and
# quiet TensorFlow's native logging - confirm against the libraries' docs.
os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import warnings
# Install a blanket "ignore" filter so Python warnings are not shown.
warnings.filterwarnings('ignore')

import pandas as pd
import polars as pl
import re
import subprocess
import tempfile
from typing import Optional, List, Tuple
from collections import Counter
import kaggle_evaluation.aimo_3_inference_server

In [None]:
import subprocess
import tempfile
import os

def execute_python_code(code: str, timeout: int = 10) -> tuple[Optional[int], str]:
    """Run a Python snippet in a child process and pull an integer answer from it.

    The snippet is written to a temporary ``.py`` file and executed with the
    interpreter running this notebook (falling back to ``python3``), so the
    child sees the same installed packages.

    Args:
        code: Python source to execute.
        timeout: Maximum run time in seconds before the child is killed.

    Returns:
        ``(answer, output)``. ``answer`` is the last 1-5 digit integer on the
        last stdout line that contains one; an integer-valued decimal such as
        ``42.0`` counts as ``42``. It is ``None`` when the child exits with a
        non-zero status, times out, cannot be started, or prints no usable
        number. ``output`` is stdout followed by stderr, or the exception text
        when the run could not be completed.
    """
    # Digits glued to letters, to other digits, or to a decimal point are not
    # standalone integers: "3.14" must not yield 14 and "42.0" must not yield 0.
    number_pattern = r'(?<![\w.])(\d{1,5})(?:\.0+)?(?![\w]|\.\d)'

    temp_file = None
    try:
        with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
            # Record the name before writing so a failed write is still cleaned up.
            temp_file = f.name
            f.write(code)

        result = subprocess.run(
            [sys.executable or 'python3', temp_file],
            capture_output=True,
            text=True,
            timeout=timeout,
        )
        output = result.stdout + result.stderr

        # A crashed snippet has no trustworthy answer; its traceback is full of
        # line numbers and source digits that must not be mistaken for one.
        if result.returncode != 0:
            return None, output

        # Only stdout is searched, from the last printed line backwards.
        for line in reversed(result.stdout.strip().split('\n')):
            numbers = re.findall(number_pattern, line)
            if numbers:
                return int(numbers[-1]), output

        return None, output
    except Exception as e:
        # Covers subprocess.TimeoutExpired as well as OS-level failures.
        return None, str(e)
    finally:
        if temp_file is not None:
            try:
                os.unlink(temp_file)
            except OSError:
                # Failing to delete the temp file must not discard the result.
                pass

def extract_code_from_response(text: str) -> Optional[str]:
    """Return the last fenced code block in a model response, if any.

    Fences tagged ``python`` are searched first; untagged fences are only
    considered when no tagged block exists. The returned code is stripped of
    surrounding whitespace. ``None`` means no fenced block was found.
    """
    for fence in (r'```python\n(.*?)```', r'```\n(.*?)```'):
        last_block = None
        # Walk every match and remember the final one.
        for found in re.finditer(fence, text, re.DOTALL):
            last_block = found.group(1)
        if last_block is not None:
            return last_block.strip()

    return None

def extract_answer(text: str) -> Optional[int]:
    """Extract a 0-99999 integer answer from free-form model output.

    Patterns are tried from most to least specific (``\\boxed{...}``,
    ``#### N``, "answer is N", "= N", "N is the answer", "therefore ... N").
    The last match of the first pattern that matches wins. If none match, the
    last standalone 1-5 digit number in the final 500 characters is used.

    Captures are anchored so they never return a slice of a longer number:
    "answer is 123456" yields no answer rather than 12345.

    Args:
        text: Decoded model response.

    Returns:
        The extracted integer, or ``None`` when nothing suitable is found.
    """
    patterns = [
        # Whitespace inside the braces is tolerated: \boxed{ 42 }.
        r'\\boxed\{\s*(\d{1,5})\s*\}',
        r'#### (\d{1,5})(?!\d)',
        r'(?:final answer|answer|result|solution)(?:\s+is)?:?\s*(\d{1,5})(?!\d)',
        r'=\s*(\d{1,5})(?:\s|$|\.|,)',
        r'(?<!\d)(\d{1,5})(?:\s+(?:is the|as the) answer)',
        r'therefore.*?(?<!\d)(\d{1,5})(?!\d)',
    ]

    for pattern in patterns:
        matches = re.findall(pattern, text, re.IGNORECASE | re.DOTALL)
        if matches:
            # A 1-5 digit capture is always a valid int within 0..99999.
            return int(matches[-1])

    # Last resort: any standalone 1-5 digit number near the end of the text.
    numbers = re.findall(r'\b(\d{1,5})\b', text[-500:])
    if numbers:
        return int(numbers[-1])

    return None

def validate_answer(answer: int) -> bool:
    """Tell whether ``answer`` is an ``int`` inside the range 0..99999."""
    if not isinstance(answer, int):
        return False
    return 0 <= answer <= 99999

In [None]:
# Removed MathTools and symbolic solver to comply with competition rules
# Competition requires model-based solutions only

In [None]:
import os
import glob

def find_model_path(base_path):
    """Locate the directory under ``base_path`` that holds a ``config.json``.

    ``base_path`` itself is returned when it contains the file. Otherwise the
    tree is walked and the first directory listing ``config.json`` among its
    files is returned. ``None`` is returned when ``base_path`` does not exist
    or no such directory is found.
    """
    marker = 'config.json'

    if os.path.exists(base_path):
        if os.path.exists(os.path.join(base_path, marker)):
            return base_path

        # First directory, in os.walk order, whose file list has the marker.
        return next(
            (folder for folder, _subdirs, names in os.walk(base_path) if marker in names),
            None,
        )

    return None


class ModelConfig:
    """Where one ensemble member's weights live and how it is sampled."""

    def __init__(self, name: str, kaggle_path: str, hf_path: str, temp: float = 0.7,
                 top_p: float = 0.9, top_k: int = 50, needs_trust: bool = False,
                 num_samples: int = 1):
        # Identity and weight locations (local dataset dir, hub fallback).
        self.name, self.kaggle_path, self.hf_path = name, kaggle_path, hf_path
        # Sampling knobs handed to generation.
        self.temp, self.top_p, self.top_k = temp, top_p, top_k
        # Loader flag and number of generations to draw.
        self.needs_trust, self.num_samples = needs_trust, num_samples

    def get_path(self):
        """Return the resolved local model directory, else ``hf_path``."""
        resolved = find_model_path(self.kaggle_path) if self.kaggle_path else None
        return resolved or self.hf_path


class EnsembleSolver:
    """TIR-enabled solver with multiple sampling and smart voting.

    Each configured model is loaded in turn, sampled ``num_samples`` times on
    the problem and unloaded again; the pooled integer answers are reduced to
    a single answer by ``smart_voting``.
    """

    def __init__(self):
        # 8 configs over 4 models: 3 + 6 + 6 + 3 = 18 samples per problem.
        # NOTE(review): configs sharing a checkpoint (the three Qwen-7B and the
        # three DeepSeek entries) are each loaded separately by
        # solve_with_model; caching the loaded model per path would avoid that.
        self.model_configs = [
            # Model 1: Qwen2.5-Math-1.5B (fast, 3 samples)
            ModelConfig(
                "Qwen-1.5B",
                '/kaggle/input/qwen2-5-math-1-5b-instruct',
                'Qwen/Qwen2.5-Math-1.5B-Instruct',
                temp=0.5, top_p=0.9, top_k=40, num_samples=3
            ),

            # Model 2: Qwen2.5-Math-7B (3 configs, 2 samples each = 6 samples)
            ModelConfig(
                "Qwen-7B-Low",
                '/kaggle/input/qwen2-5-math-7b-instruct',
                'Qwen/Qwen2.5-Math-7B-Instruct',
                temp=0.4, top_p=0.88, top_k=35, num_samples=2
            ),
            ModelConfig(
                "Qwen-7B-Mid",
                '/kaggle/input/qwen2-5-math-7b-instruct',
                'Qwen/Qwen2.5-Math-7B-Instruct',
                temp=0.6, top_p=0.9, top_k=45, num_samples=2
            ),
            ModelConfig(
                "Qwen-7B-High",
                '/kaggle/input/qwen2-5-math-7b-instruct',
                'Qwen/Qwen2.5-Math-7B-Instruct',
                temp=0.8, top_p=0.95, top_k=60, num_samples=2
            ),

            # Model 3: DeepSeek-Math-7B-RL (3 configs, 2 samples each = 6 samples)
            ModelConfig(
                "DeepSeek-Low",
                '/kaggle/input/deepseek-math-7b-rl',
                'deepseek-ai/deepseek-math-7b-rl',
                temp=0.3, top_p=0.85, top_k=30, num_samples=2,
                needs_trust=True
            ),
            ModelConfig(
                "DeepSeek-Mid",
                '/kaggle/input/deepseek-math-7b-rl',
                'deepseek-ai/deepseek-math-7b-rl',
                temp=0.6, top_p=0.9, top_k=45, num_samples=2,
                needs_trust=True
            ),
            ModelConfig(
                "DeepSeek-High",
                '/kaggle/input/deepseek-math-7b-rl',
                'deepseek-ai/deepseek-math-7b-rl',
                temp=0.8, top_p=0.95, top_k=55, num_samples=2,
                needs_trust=True
            ),

            # Model 4: MAmmoTH-7B (3 samples)
            ModelConfig(
                "MAmmoTH-7B",
                '/kaggle/input/mammoth-7b-mistral',
                'TIGER-Lab/MAmmoTH-7B-Mistral',
                temp=0.6, top_p=0.9, top_k=45, num_samples=3,
                needs_trust=True
            ),
        ]

    @staticmethod
    def _build_prompt(problem: str) -> str:
        """Return the Tool-Integrated Reasoning prompt for one problem."""
        return f"""Solve this International Mathematical Olympiad problem step by step.

Problem:
{problem}

Instructions:
- Think through the problem carefully
- You can write Python code in ```python blocks to help solve it
- Show your reasoning and calculations
- The final answer must be an integer between 0 and 99999
- Write your final answer as: \\boxed{{answer}}

Solution:"""

    @staticmethod
    def _answer_from_response(response: str) -> Optional[int]:
        """Turn one generated response into a validated answer, or None.

        Executing the response's code block is tried first; the answer
        written in the text is the fallback.
        """
        answer = None

        code = extract_code_from_response(response)
        if code:
            code_answer, _code_output = execute_python_code(code)
            if code_answer is not None:
                answer = code_answer

        # Fallback to text extraction
        if answer is None:
            answer = extract_answer(response)

        if answer is not None and validate_answer(answer):
            return answer
        return None

    def solve_with_model(self, problem: str, model_config: "ModelConfig") -> List[int]:
        """Load one model, draw ``num_samples`` generations, return their answers.

        Args:
            problem: Problem statement inserted into the prompt.
            model_config: Weight location and sampling settings.

        Returns:
            The valid answers obtained, possibly fewer than ``num_samples``.
            An empty list is returned when the model cannot be located or
            loaded, or when an unexpected error occurs; this method does not
            raise.
        """
        try:
            model_path = model_config.get_path()

            if not model_path:
                print(f"    [{model_config.name}] Not found")
                return []

            # Heavy dependencies are imported lazily so a missing install is
            # reported through the outer handler instead of at import time.
            import gc
            import torch
            from transformers import AutoTokenizer, AutoModelForCausalLM

            print(f"  [{model_config.name}] Loading ({model_config.num_samples} samples)...")

            use_cuda = torch.cuda.is_available()
            tokenizer = None
            model = None
            answers = []

            try:
                try:
                    tokenizer = AutoTokenizer.from_pretrained(
                        model_path,
                        use_fast=True,
                        trust_remote_code=model_config.needs_trust
                    )

                    model = AutoModelForCausalLM.from_pretrained(
                        model_path,
                        torch_dtype=torch.float16 if use_cuda else torch.float32,
                        device_map="auto" if use_cuda else None,
                        trust_remote_code=model_config.needs_trust,
                        low_cpu_mem_usage=True
                    )
                except Exception as e:
                    print(f"    Load failed: {str(e)[:80]}")
                    return []

                # Inference mode is wanted on CPU as much as on GPU.
                model.eval()

                # The prompt and its encoding are identical for every sample,
                # so they are prepared once, outside the sampling loop.
                prompt = self._build_prompt(problem)

                # Try to use chat template if available, otherwise use plain text
                if hasattr(tokenizer, 'chat_template') and tokenizer.chat_template is not None:
                    messages = [{"role": "user", "content": prompt}]
                    text = tokenizer.apply_chat_template(
                        messages,
                        tokenize=False,
                        add_generation_prompt=True
                    )
                else:
                    text = prompt

                inputs = tokenizer([text], return_tensors="pt")
                if hasattr(model, 'device'):
                    inputs = inputs.to(model.device)
                prompt_length = inputs['input_ids'].shape[1]

                # Generate multiple samples
                for sample_idx in range(model_config.num_samples):
                    try:
                        with torch.no_grad():
                            outputs = model.generate(
                                **inputs,
                                max_new_tokens=2048,  # More tokens for better reasoning
                                temperature=model_config.temp,
                                do_sample=True,
                                top_p=model_config.top_p,
                                top_k=model_config.top_k,
                                pad_token_id=tokenizer.eos_token_id,
                            )

                        # Decode only the newly generated tokens.
                        response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

                        answer = self._answer_from_response(response)
                        if answer is not None:
                            answers.append(answer)

                    except Exception as e:
                        # One bad sample must not cost the others, but it
                        # should leave a trace in the log.
                        print(f"    Sample {sample_idx + 1} failed: {str(e)[:80]}")
            finally:
                # Release the weights on every exit path. Collect garbage
                # first so the CUDA cache can actually be returned.
                model = None
                tokenizer = None
                gc.collect()
                if use_cuda:
                    torch.cuda.empty_cache()

            if answers:
                print(f"    → Got {len(answers)}/{model_config.num_samples} answers: {answers}")
            else:
                print(f"    → No valid answers")

            return answers

        except Exception as e:
            print(f"  Error: {str(e)[:80]}")
            return []

    def smart_voting(self, all_answers: List[int]) -> Optional[int]:
        """Reduce the pooled sample answers to a single answer.

        1. If the most common answer holds at least 40% of the votes, it wins.
        2. Otherwise weakly supported boundary values are discarded (99999
           with under 30% of the votes, 0 with fewer than two votes) and the
           most common remaining answer wins.

        Ties go to the answer that appears first in ``all_answers``. All
        votes weigh the same: the pooled list carries no per-model
        information, so model size plays no part.

        Returns:
            The chosen answer, or ``None`` for an empty input.
        """
        if not all_answers:
            return None

        vote_counts = Counter(all_answers)
        total_votes = len(all_answers)

        best_answer, best_count = vote_counts.most_common(1)[0]

        # Strong consensus (at least 40% agree) - just use it
        if best_count >= total_votes * 0.4:
            return best_answer

        # Weak consensus - drop boundary values that lack support
        filtered_answers = []
        for ans in all_answers:
            # Keep 99999 only if it has strong support
            if ans == 99999 and vote_counts[99999] < total_votes * 0.3:
                continue
            # Keep 0 only if it has some support
            if ans == 0 and vote_counts[0] < 2:
                continue
            filtered_answers.append(ans)

        if filtered_answers:
            filtered_counts = Counter(filtered_answers)
            return filtered_counts.most_common(1)[0][0]

        # Defensive fallback to the plain plurality winner
        return best_answer

    def solve_with_ensemble(self, problem: str) -> Optional[int]:
        """Run every configured model on ``problem`` and vote on the answers.

        Returns:
            The voted answer, or ``None`` if no model produced a valid answer.
        """
        all_answers = []

        print(f"  Running TIR ensemble with self-consistency...")

        for model_config in self.model_configs:
            answers = self.solve_with_model(problem, model_config)
            all_answers.extend(answers)

        if not all_answers:
            return None

        # Smart voting
        best_answer = self.smart_voting(all_answers)
        vote_counts = Counter(all_answers)

        print(f"  Total samples: {len(all_answers)}")
        print(f"  Vote distribution: {dict(vote_counts.most_common(5))}")
        print(f"  Smart voting result: {best_answer}")

        return best_answer

    def solve_problem(self, problem_id: str, problem_text: str) -> int:
        """Return an integer answer in 0..99999 for one problem.

        ``problem_id`` is accepted for the caller's convenience and is not
        used. When the ensemble yields nothing the answer defaults to 0; an
        out-of-range answer is folded into range with ``abs(answer) % 100000``.
        """
        answer = self.solve_with_ensemble(problem_text)

        if answer is None:
            answer = 0

        if not validate_answer(answer):
            answer = abs(answer) % 100000

        return answer

# Shared solver instance; the evaluation loop and `predict` in the final cell
# both call solver.solve_problem.
solver = EnsembleSolver()

In [None]:
# Banner describing the ensemble set-up.
print("=" * 70)
print("TIR-ENABLED ENSEMBLE with Smart Voting")
print("=" * 70)
print("Based on AIMO Prize 1 & 2 winning solutions:")
print("  ✓ Tool-Integrated Reasoning (Python code execution)")
print("  ✓ Multiple samples per model (self-consistency)")
print("  ✓ Smart voting (filters outliers, requires consensus)")
print("  ✓ ~20 total samples per problem")
print("")
print("Models (4 different models, 8 configs):")
print("  • Qwen2.5-Math-1.5B (3 samples)")
print("  • Qwen2.5-Math-7B (6 samples: 3 temps × 2 each)")
print("  • DeepSeek-Math-7B-RL (6 samples: 3 temps × 2 each)")
print("  • MAmmoTH-7B-Mistral (3 samples)")
print("=" * 70)

if TEST_MODE:
    # Testing mode - score the solver on reference.csv
    print("\n[TEST MODE] Running on reference.csv...")
    print("(Set TEST_MODE = False in cell 0 to submit)")
    reference_path = '/kaggle/input/ai-mathematical-olympiad-progress-prize-3/reference.csv'
    if not os.path.exists(reference_path):
        # Fall back to a checkout-relative copy when the first path is absent.
        reference_path = '../data/reference.csv'

    if os.path.exists(reference_path):
        try:
            ref_df = pd.read_csv(reference_path)
            correct = 0
            total = len(ref_df)

            # Count rows by position so the progress label does not depend on
            # the frame's index labels.
            for position, (_, row) in enumerate(ref_df.iterrows(), start=1):
                problem_id = row['id']
                problem_text = row['problem']
                expected_answer = int(row['answer'])

                print(f"\n{'='*70}")
                print(f"[{position}/{total}] Problem {problem_id}")
                print(f"Expected: {expected_answer}")

                predicted_answer = solver.solve_problem(problem_id, problem_text)
                print(f"Final: {predicted_answer}")

                if predicted_answer == expected_answer:
                    print("✓ CORRECT")
                    correct += 1
                else:
                    print("✗ INCORRECT")

            accuracy = (correct / total * 100) if total > 0 else 0
            print("\n" + "=" * 70)
            print(f"SCORE: {correct}/{total} ({accuracy:.1f}%)")
            print("=" * 70)
        except Exception as e:
            print(f"Error testing reference.csv: {e}")
    else:
        # Say so explicitly; otherwise a missing file looks like a finished run.
        print(f"reference.csv not found (last tried: {reference_path}); skipping evaluation")

    # Create dummy submission for Kaggle validation
    print("\nCreating dummy submission.parquet for Kaggle validation...")
    dummy_submission = pl.DataFrame({
        "id": ["dummy"],
        "answer": [0]
    })
    dummy_submission.write_parquet("submission.parquet")
    print("✓ submission.parquet created")

else:
    # Submission mode - create inference server and serve
    print("\n[SUBMISSION MODE] Starting inference server...")
    print("(Set TEST_MODE = True in cell 0 to test locally)")

    def predict(id_: pl.Series, problem: pl.Series) -> pl.DataFrame:
        """Answer one problem for the inference server.

        Takes the first element of ``id_`` and ``problem``, solves it with the
        shared solver and returns a one-row frame with ``id`` and ``answer``.
        If solving raises, the error is printed and the answer 0 is returned.
        """
        try:
            question_id = id_.item(0)
            question_text = problem.item(0)
            answer = solver.solve_problem(question_id, question_text)
            return pl.DataFrame({"id": [question_id], "answer": [answer]})
        except Exception as e:
            print(f"Prediction error: {e}")
            return pl.DataFrame({"id": [id_.item(0)], "answer": [0]})

    inference_server = kaggle_evaluation.aimo_3_inference_server.AIMO3InferenceServer(predict)
    inference_server.serve()