# ARC Prize 2025 Submission: AI Civilization (Offline Inference)

**Strategy:** Multi-agent evolution with local Code Gemma 7B (no internet access)

**Components:**
- **Analyst**: Analyzes ARC tasks to infer transformation patterns
- **Programmer**: Generates solver code based on analysis
- **SimplifiedEvolution**: Minimal evolution loop for time constraints (currently a single pass: sample candidate solvers, then rank them by fitness)

**Constraints:**
- No internet access (offline inference only)
- 12-hour runtime for 240 tasks (~3 min/task)
- pass@2 format (2 diverse attempts per test input)

**Requirements:**
- Python 3.10+ (uses modern type hints: `list[dict]`, `tuple[bool, ...]`, and `np.ndarray | None` unions, which are evaluated at definition time and require 3.10)
- Kaggle GPU: L4x4 (96GB VRAM) for Code Gemma 7B
- Code Gemma 7B model uploaded as Kaggle dataset (~16GB)

In [None]:
# Cell 1: Environment Setup
import json
import multiprocessing
import re
import time

import numpy as np

# Optional heavy dependencies: always *try* to import torch/transformers and
# record the outcome in HAS_TRANSFORMERS. When the import fails (e.g. local
# testing without these packages installed), later cells check this flag and
# fall back to a mock model response instead of real inference.
try:
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    HAS_TRANSFORMERS = True
except ImportError:
    print("Warning: transformers not available (local testing mode)")
    HAS_TRANSFORMERS = False

# Report what was detected so the notebook log shows which mode is active.
print("Environment setup complete!")
print(f"Transformers available: {HAS_TRANSFORMERS}")
print(f"NumPy version: {np.__version__}")

In [None]:
# Cell 2: Load Local Model (No Internet!)

# Location of the Code Gemma weights (attached to the notebook as a Kaggle dataset)
MODEL_PATH = "/kaggle/input/codegemma-7b/"  # On Kaggle
# MODEL_PATH = "./models/codegemma-7b/"  # For local testing

# Both handles stay None in local testing mode; the inference helper checks
# for that and returns a mock response instead of calling the model.
model = None
tokenizer = None

if not HAS_TRANSFORMERS:
    print("⚠️  WARNING: Running in LOCAL TESTING mode with mock responses")
    print("This is for validation only - Kaggle submission requires the real model")
else:
    print("Loading Code Gemma 7B model...")
    try:
        # float16 weights to reduce memory; device_map="auto" lets the
        # library place the model on the available devices.
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_PATH, device_map="auto", torch_dtype=torch.float16
        )
        tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
        print(f"✅ Model loaded successfully! Device: {model.device}")
    except Exception as load_error:
        print(f"❌ ERROR: Model loading failed: {load_error}")
        for hint in (
            "This will cause the notebook to fail. Please check:",
            "1. Model files exist in Kaggle dataset",
            "2. Sufficient disk space (~16GB)",
            "3. Dataset path is correct",
        ):
            print(hint)
        # Re-raise: silently continuing with the mock would produce a
        # worthless submission on Kaggle.
        raise


def generate_with_local_model(
    prompt: str, temperature: float = 0.3, max_tokens: int = 2048
) -> str:
    """
    Local inference - completely offline (no API calls).

    Only the newly generated continuation is returned. A causal LM's
    ``generate`` output starts with the prompt tokens; echoing them back would
    (a) make the analyst's "analysis" repeat the whole task prompt and
    (b) let the code extractor latch onto the ``def solve(`` signature line
    that the programmer prompt itself contains.

    Args:
        prompt: Input prompt for the model
        temperature: Sampling temperature (higher = more creative).
            A value <= 0 switches to greedy decoding.
        max_tokens: Maximum tokens to generate

    Returns:
        Generated text from the model (without the prompt), or a fixed
        identity ``solve()`` stub when no model is loaded (local testing).
    """
    if not HAS_TRANSFORMERS or model is None:
        # Mock response for local testing
        return "def solve(task_grid: np.ndarray) -> np.ndarray:\n    return task_grid"

    inputs = tokenizer(
        prompt, return_tensors="pt", truncation=True, max_length=4096
    ).to(model.device)
    # Number of prompt tokens actually fed to the model (after truncation).
    prompt_length = inputs["input_ids"].shape[-1]

    # Sampling requires a strictly positive temperature; fall back to greedy
    # decoding instead of letting generate() reject the arguments.
    if temperature > 0:
        sampling_kwargs = {"do_sample": True, "temperature": temperature}
    else:
        sampling_kwargs = {"do_sample": False}

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        pad_token_id=tokenizer.eos_token_id,
        **sampling_kwargs,
    )

    # Drop the echoed prompt tokens; decode only the continuation.
    completion_ids = outputs[0][prompt_length:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True)


print("Model inference function ready!")

In [None]:
# Cell 3: Core Helper Functions (Duplicated from src/ for offline compatibility)


def format_grid(grid: list[list[int]]) -> str:
    """
    Render a grid as plain text for an LLM prompt.

    Cells within a row are separated by single spaces and rows by newlines,
    e.g. [[0, 1], [2, 3]] becomes the two lines "0 1" and "2 3".
    """
    rendered_rows = []
    for row in grid:
        rendered_rows.append(" ".join(map(str, row)))
    return "\n".join(rendered_rows)


def format_examples(examples: list[dict]) -> str:
    """
    Build the "training examples" section of an LLM prompt.

    Each example contributes a numbered heading (starting at 1), its input
    grid, its output grid and a blank separator line.

    Args:
        examples: List of {"input": grid, "output": grid} dicts

    Returns:
        All examples joined into one newline-separated string
        (empty string when there are no examples)
    """
    lines = []
    for number, example in enumerate(examples, start=1):
        lines.extend(
            (
                f"Example {number}:",
                f"Input:\n{format_grid(example['input'])}",
                f"Output:\n{format_grid(example['output'])}",
                "",
            )
        )
    return "\n".join(lines)


def extract_solve_function(llm_response: str) -> str:
    """
    Extract solve() function from LLM output.
    Handles markdown code blocks and plain code.

    Resolution order:
    1. Fenced blocks tagged ``python``; otherwise untagged fenced blocks.
       Within the chosen kind, the first block that contains ``def solve(``
       wins (models often emit a usage example or helper snippet first);
       if none does, the first block is used.
    2. No complete fenced block: everything from ``def solve(`` onward, cut at
       a stray closing fence so trailing prose is not executed as code.
    3. Otherwise the stripped response as-is (might be pure code).

    Args:
        llm_response: Raw LLM output

    Returns:
        Extracted Python code
    """
    # Tagged blocks take priority over untagged ones.
    fence_patterns = (r"```python\n(.+?)```", r"```\n(.+?)```")
    for pattern in fence_patterns:
        blocks = re.findall(pattern, llm_response, re.DOTALL)
        if blocks:
            chosen = next((b for b in blocks if "def solve(" in b), blocks[0])
            return chosen.strip()

    # No complete code block: fall back to the function definition itself.
    start_idx = llm_response.find("def solve(")
    if start_idx != -1:
        tail = llm_response[start_idx:]
        # A lone fence here means the model closed a block it never opened
        # (or whose opening was cut off); nothing after it is code.
        fence_idx = tail.find("```")
        if fence_idx != -1:
            tail = tail[:fence_idx]
        return tail.strip()

    # Return as-is (might be pure code)
    return llm_response.strip()


def execute_solver_safe(
    code: str, input_grid: np.ndarray, timeout: int = 5
) -> tuple[bool, np.ndarray | None, dict | None]:
    """
    Execute solver code with timeout and error handling.
    Uses multiprocessing for isolation.

    SECURITY NOTE: This sandbox provides:
    - Process isolation (timeout, crash protection)
    - Restricted builtins (no eval, exec, compile, open, __import__)

    LIMITATIONS (acceptable for Kaggle's isolated environment):
    - Cannot prevent filesystem access via numpy/ctypes
    - Cannot prevent network access if libraries are available
    - Kaggle provides container-level isolation for these

    For production use outside Kaggle, use Docker sandbox instead.

    Args:
        code: Solver code string
        input_grid: Input grid to transform
        timeout: Timeout in seconds

    Returns:
        (success, result_grid, error_detail)
    """

    def _run_solver(code_str, task_grid, result_queue):
        """Worker function for multiprocess execution"""
        try:
            # Create restricted builtins (remove dangerous functions)
            # Per CLAUDE.md: "Restricted builtins: eval, exec, compile, open removed"
            safe_builtins = {
                k: v
                for k, v in __builtins__.items()
                if k not in ["eval", "exec", "compile", "open", "__import__"]
            }

            # Create restricted namespace
            namespace = {
                "__builtins__": safe_builtins,
                "np": np,
                "task_grid": task_grid,
            }

            # Execute code with restricted namespace
            exec(code_str, namespace)  # noqa: S102

            # Call solve function
            if "solve" not in namespace:
                result_queue.put(
                    (
                        False,
                        None,
                        {
                            "error_type": "missing_function",
                            "error_message": "No solve() function found",
                        },
                    )
                )
                return

            result = namespace["solve"](task_grid)

            # Validate result
            if not isinstance(result, np.ndarray):
                result_queue.put(
                    (
                        False,
                        None,
                        {
                            "error_type": "invalid_return",
                            "error_message": f"Expected np.ndarray, got {type(result)}",
                        },
                    )
                )
                return

            result_queue.put((True, result, None))

        except Exception as e:
            result_queue.put(
                (False, None, {"error_type": type(e).__name__, "error_message": str(e)})
            )

    # Run in separate process with resource cleanup
    result_queue = multiprocessing.Queue()
    process = multiprocessing.Process(
        target=_run_solver, args=(code, input_grid, result_queue)
    )

    try:
        process.start()
        process.join(timeout=timeout)

        if process.is_alive():
            # Timeout - terminate process
            process.terminate()
            process.join()
            return (
                False,
                None,
                {
                    "error_type": "timeout",
                    "error_message": f"Execution exceeded {timeout}s",
                },
            )

        if result_queue.empty():
            return (
                False,
                None,
                {
                    "error_type": "unknown",
                    "error_message": "Process terminated without result",
                },
            )

        return result_queue.get()

    finally:
        # Clean up resources
        result_queue.close()
        result_queue.join_thread()


def calculate_fitness(code: str, task: dict) -> dict:
    """
    Score solver code against a task's examples.

    Every solved training example is worth 1 point and every solved test
    example (only those that carry an "output") is worth 10 points.

    Args:
        code: Solver code string
        task: Task dict with "train" and "test" examples

    Returns:
        {"fitness": score, "train_correct": count, "test_correct": count}
    """

    def _solves(example: dict) -> bool:
        """Run the solver on one example and compare with its expected output."""
        grid_in = np.array(example["input"], dtype=np.int64)
        grid_expected = np.array(example["output"], dtype=np.int64)
        ok, produced, _ = execute_solver_safe(code, grid_in)
        return bool(ok and np.array_equal(produced, grid_expected))

    train_hits = sum(_solves(example) for example in task.get("train", []))

    # Test examples without ground truth cannot be scored and are skipped.
    test_hits = sum(
        _solves(example) for example in task.get("test", []) if "output" in example
    )

    return {
        "fitness": train_hits + 10 * test_hits,
        "train_correct": train_hits,
        "test_correct": test_hits,
    }


print("Helper functions ready!")

In [None]:
# Cell 4: Simplified AI Agents (Offline versions)


class OfflineAnalyst:
    """
    Offline Analyst agent: asks the local model to describe the transformation
    rule behind an ARC task (no API calls).
    """

    # Prompt skeleton; "{examples}" is replaced by the formatted training pairs.
    _PROMPT_TEMPLATE = """Analyze this ARC puzzle and describe the transformation pattern.

Training examples:
{examples}

Provide a concise analysis:
1. PATTERN: One sentence describing the transformation rule
2. OBSERVATIONS: Key features (colors, shapes, positions)
3. APPROACH: High-level implementation strategy

Be specific and actionable."""

    def analyze_task(self, task_json: dict) -> str:
        """
        Produce a natural-language analysis of the task.

        Args:
            task_json: Task dict; its "train" examples are shown to the model

        Returns:
            The model's response to the analysis prompt
        """
        examples_text = format_examples(task_json["train"])
        prompt = self._PROMPT_TEMPLATE.format(examples=examples_text)
        return generate_with_local_model(prompt, temperature=0.3)


class OfflineProgrammer:
    """
    Offline Programmer agent: asks the local model for a solve() function,
    given the task's training examples and an optional analyst write-up.
    """

    # Prompt skeleton; "{analysis_section}" is either empty or the analyst's
    # notes, "{examples}" is the formatted training pairs.
    _PROMPT_TEMPLATE = """Generate a Python function to solve this ARC puzzle.{analysis_section}

Training examples:
{examples}

Requirements:
- Function signature: def solve(task_grid: np.ndarray) -> np.ndarray:
- Use only numpy for array operations (imported as np)
- Return the transformed grid as np.ndarray
- Grid values are integers 0-9 (colors)

Generate ONLY the solve() function code (no explanations):"""

    def generate_solver(self, task_json: dict, analyst_spec: str = None) -> str:
        """
        Ask the model for solver code and extract it from the response.

        Args:
            task_json: Task dict; its "train" examples are shown to the model
            analyst_spec: Optional analysis from Analyst; omitted from the
                prompt when empty or None

        Returns:
            Python code string extracted from the model response
        """
        if analyst_spec:
            analysis_section = f"\n\nAnalyst's observations:\n{analyst_spec}"
        else:
            analysis_section = ""

        examples_text = format_examples(task_json["train"])
        prompt = self._PROMPT_TEMPLATE.format(
            analysis_section=analysis_section, examples=examples_text
        )

        raw_response = generate_with_local_model(
            prompt, temperature=0.4, max_tokens=1024
        )
        return extract_solve_function(raw_response)


class SimplifiedEvolution:
    """
    Minimal evolution loop for Kaggle time constraints.
    Simplified version of PopulationEvolution for offline inference.

    One pass only: analyze the task once, sample ``population_size`` candidate
    solvers, score each distinct candidate and rank them by fitness.
    """

    def __init__(self, population_size: int = 5, max_generations: int = 3):
        """
        Args:
            population_size: Number of solvers to generate
            max_generations: Number of evolution iterations (unused in simplified version)
        """
        self.population_size = population_size
        self.max_generations = max_generations
        self.analyst = OfflineAnalyst()
        self.programmer = OfflineProgrammer()

    def evolve(self, task: dict) -> list[dict]:
        """
        Evolve population of solvers for the task.

        Candidates whose code is identical to an earlier candidate are dropped
        before scoring. Scoring a duplicate would spend sandbox time to learn
        nothing, and keeping it could put the same program in both of the top
        two slots, which callers use as their two (supposedly diverse) attempts.

        Args:
            task: Task dict with "train" and "test" examples

        Returns:
            List of distinct solver dicts sorted by fitness (best first); each
            has "code", "fitness", "train_correct" and "test_correct". May be
            shorter than ``population_size`` (or empty) when generations fail
            or repeat.
        """
        # 1. Analyst analyzes task
        analysis = self.analyst.analyze_task(task)

        # 2. Generate initial population
        population = []
        seen_code = set()
        for i in range(self.population_size):
            try:
                code = self.programmer.generate_solver(task, analysis)
                if code in seen_code:
                    # Same program as an earlier sample: nothing new to score.
                    continue
                seen_code.add(code)

                fitness_result = calculate_fitness(code, task)
                population.append(
                    {
                        "code": code,
                        "fitness": fitness_result["fitness"],
                        "train_correct": fitness_result["train_correct"],
                        "test_correct": fitness_result["test_correct"],
                    }
                )
            except Exception as e:
                # Best effort: one failed candidate must not abort the task.
                print(f"Warning: Failed to generate solver {i}: {e}")
                continue

        # 3. Sort by fitness (best first); the sort is stable, so ties keep
        # generation order.
        population.sort(key=lambda x: x["fitness"], reverse=True)

        return population


print("AI agents ready!")

In [None]:
# Cell 5: Load Test Data and Run Inference

# Path to test data (provided by Kaggle)
TEST_DATA_PATH = "/kaggle/input/arc-prize-2025/arc-agi_test_challenges.json"
# TEST_DATA_PATH = "../data/arc-prize-2025/arc-agi_evaluation_challenges_merged.json"  # For local testing

# Identity solver used when evolution produced no candidate at all.
IDENTITY_SOLVER_CODE = (
    "def solve(task_grid: np.ndarray) -> np.ndarray:\n    return task_grid"
)


def _to_submission_grid(success, prediction, fallback_grid):
    """
    Turn a solver result into a JSON-serializable grid (list of int lists).

    The prediction is used only if the solver succeeded and returned a
    non-empty 2-D array whose values are whole numbers (float/bool arrays with
    integral values are converted to ints). Anything else - wrong rank, empty,
    NaN/fractional values, non-numeric objects - falls back to
    ``fallback_grid`` so the submission always contains a well-formed grid.
    """
    if success and isinstance(prediction, np.ndarray):
        if prediction.ndim == 2 and prediction.size > 0:
            try:
                as_int = prediction.astype(np.int64)
            except (TypeError, ValueError, OverflowError):
                as_int = None
            if as_int is not None and np.array_equal(as_int, prediction):
                return as_int.tolist()
    return fallback_grid.tolist()


print(f"Loading test tasks from: {TEST_DATA_PATH}")

try:
    with open(TEST_DATA_PATH) as f:
        test_tasks = json.load(f)
    print(f"Loaded {len(test_tasks)} test tasks")
except FileNotFoundError:
    print("Warning: Test data not found (using mock for local testing)")
    test_tasks = {}

# Initialize evolution engine
evolution_engine = SimplifiedEvolution(
    population_size=5,  # Small population for time constraints
    max_generations=3,  # Few generations for speed
)

# Track progress
submission = {}
start_time = time.time()

print(f"Starting inference on {len(test_tasks)} tasks...")
print(
    f"Estimated time: {len(test_tasks) * 3} minutes ({len(test_tasks) * 3 / 60:.1f} hours)"
)

for idx, (task_id, task) in enumerate(test_tasks.items()):
    task_start = time.time()

    print(f"\nProcessing {idx + 1}/{len(test_tasks)}: {task_id}")

    try:
        # Evolve population
        population = evolution_engine.evolve(task)

        # Select best 2 for pass@2
        if not population:
            # Fallback: use dummy solver for both attempts
            best = second = {"code": IDENTITY_SOLVER_CODE}
        elif len(population) == 1:
            best = second = population[0]
        else:
            best, second = population[:2]

        # Generate predictions for all test inputs
        predictions = []
        for test_input in task.get("test", []):
            input_grid = np.array(test_input["input"], dtype=np.int64)

            # Attempt 1 (best solver) - input_grid is the fallback so the
            # entry is always a valid grid
            success1, pred1, _ = execute_solver_safe(
                best["code"], input_grid, timeout=3
            )
            attempt_1 = _to_submission_grid(success1, pred1, input_grid)

            # Attempt 2 (second-best solver) - same fallback rule
            success2, pred2, _ = execute_solver_safe(
                second["code"], input_grid, timeout=3
            )
            attempt_2 = _to_submission_grid(success2, pred2, input_grid)

            predictions.append({"attempt_1": attempt_1, "attempt_2": attempt_2})

        submission[task_id] = predictions

        task_time = time.time() - task_start
        print(
            f"  Completed in {task_time:.1f}s (best fitness: {best.get('fitness', 0)})"
        )

    except Exception as e:
        # Top-level boundary: a single broken task must not lose the whole run.
        print(f"  ERROR: {e}")
        # Fallback: use input grids to ensure correct dimensions
        predictions = [
            {
                "attempt_1": test_input["input"],
                "attempt_2": test_input["input"],
            }
            for test_input in task.get("test", [])
        ]
        submission[task_id] = predictions

    # Progress update every 10 tasks
    if (idx + 1) % 10 == 0:
        elapsed = time.time() - start_time
        avg_time = elapsed / (idx + 1)
        remaining = avg_time * (len(test_tasks) - idx - 1)
        print(f"\nProgress: {idx + 1}/{len(test_tasks)} tasks")
        print(
            f"Elapsed: {elapsed / 60:.1f} min | Avg: {avg_time:.1f}s/task | ETA: {remaining / 60:.1f} min"
        )

total_time = time.time() - start_time
print("\nInference complete!")
print(f"Total time: {total_time / 60:.1f} minutes ({total_time / 3600:.2f} hours)")
# With no tasks (e.g. test data missing in local testing) there is no average
# to report; dividing would raise ZeroDivisionError.
if test_tasks:
    print(f"Average: {total_time / len(test_tasks):.1f} seconds/task")

In [None]:
# Cell 6: Save Submission

OUTPUT_PATH = "submission.json"

print(f"Saving submission to: {OUTPUT_PATH}")

# The file is written before validation on purpose: a submission with a
# reported problem is still more useful than no file at all.
with open(OUTPUT_PATH, "w") as f:
    json.dump(submission, f, indent=2)

print("Submission saved successfully!")
print(f"Tasks: {len(submission)}")
print("Format: pass@2 (2 attempts per test input)")

# Validate submission format: task_id -> list of
# {"attempt_1": grid, "attempt_2": grid}, where a grid is a list of rows.
# Only the first problem per task is reported.
valid = True
for task_id, predictions in submission.items():
    if not isinstance(predictions, list):
        print(f"ERROR: {task_id} has invalid predictions type")
        valid = False
        continue

    for pred in predictions:
        if not isinstance(pred, dict):
            print(f"ERROR: {task_id} has invalid prediction dict")
            valid = False
            break

        if "attempt_1" not in pred or "attempt_2" not in pred:
            print(f"ERROR: {task_id} missing attempt_1 or attempt_2")
            valid = False
            break

        attempts = (pred["attempt_1"], pred["attempt_2"])
        if not all(
            isinstance(grid, list) and all(isinstance(row, list) for row in grid)
            for grid in attempts
        ):
            print(f"ERROR: {task_id} has an attempt that is not a grid (list of rows)")
            valid = False
            break

if valid:
    print("\n✅ Submission format validated successfully!")
    # Only claim readiness when validation actually passed.
    print("\nReady for Kaggle submission!")
else:
    print("\n❌ Submission format validation FAILED!")