In [1]:
"""
Main pipeline for ARC task solving using execution-based similarity.

Pipeline: Program Similarity ‚Üí Pattern Discovery ‚Üí Code Generation
"""

import re
import json
from typing import Dict, List, Tuple, Optional, Any
from dataclasses import dataclass
from pathlib import Path
import os
import sys
from src.llm_client import LLMArguments, LLMClient
# print("Script starting...", flush=True)
# sys.stdout.flush()

from src.vlm_prompter import VLMPrompter
from src.vlm_client import VLMConfig, create_client, BaseVLMClient
from src.utils.library import ProgramLibrary, calculate_grid_similarity
from src.utils.dsl import *
from src.utils.constants import *
# from utils.render_legacy import grid_to_base64_png_oai_content



In [2]:
llm_args = LLMArguments()
llm_args.temperature = 0.7
llm_args.top_k = 20
llm_args.top_p = 0.8
llm_client = LLMClient(llm_args)

python -m sglang.launch_server --model-path /home/flowers/work/hf/Qwen3-4B-Instruct-2507 --tp 1 --port 30014 --mem-fraction-static 0.9 --random-seed 42 --host 0.0.0.0 --log-level info --trust-remote-code --quantization fp8 --context-length 50000 
check server run 0
[2025-11-07 16:08:35] INFO trace.py:52: opentelemetry package is not installed, tracing disabled
[2025-11-07 16:08:35] Using default HuggingFace chat template with detected content format: string
[2025-11-07 16:08:39] INFO trace.py:52: opentelemetry package is not installed, tracing disabled
[2025-11-07 16:08:39] INFO trace.py:52: opentelemetry package is not installed, tracing disabled
[2025-11-07 16:08:39] Init torch distributed begin.
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is co

In [4]:
llm_client.generate(["Hello, world!"])  # Warm up the LLM client

send 1 / 1 messages
[2025-11-07 16:10:07] Prefill batch, #new-seq: 1, #new-token: 12, #cached-token: 0, token usage: 0.00, #running-req: 0, #queue-req: 0, 
[2025-11-07 16:10:07] INFO:     127.0.0.1:50668 - "POST /v1/chat/completions HTTP/1.1" 200 OK


[["Hello! üëã It's great to meet you. How can I assist you today? üòä"]]

In [16]:
@dataclass
class TaskResult:
    """Result of attempting to solve a task"""
    task_id: str
    success: bool
    score: float
    program: Optional[str] = None
    phase1_output: Optional[str] = None
    phase2_output: Optional[str] = None
    error: Optional[str] = None


def test_program(program_code: str, task: Dict) -> Tuple[float, List[Tuple[Any, Any, bool]]]:
    """
    Test a program against task training examples.
    
    Returns:
        - Average score across examples
        - List of (expected_output, actual_output, passed) tuples
    """
    namespace = globals().copy()
    
    try:
        exec(program_code, namespace)
        
        if 'solve' not in namespace:
            return 0.0, []
        
        solve_fn = namespace['solve']
        scores = []
        results = []
        
        for example in task['test']:
            inp = example['input']
            expected = example['output']
            if isinstance(inp, list):
                inp = tuple(tuple(row) for row in inp)
            try:
                actual = solve_fn(inp)
                score = calculate_grid_similarity(actual, expected)
                scores.append(score)
                results.append((expected, actual, score == 1.0))
            except Exception as e:
                scores.append(0.0)
                results.append((expected, None, False))
        
        avg_score = sum(scores) / len(scores) if scores else 0.0
        return avg_score, results
        
    except Exception as e:
        return 0.0, []


def extract_code_from_response(response: str) -> Optional[str]:
    """Extract Python code from LLM response."""
    try:
        python_blocks = re.findall(r'```python\n(.*?)```', response, re.DOTALL)
        
        if python_blocks:
            for block in python_blocks:
                if 'def solve' in block:
                    return block.strip()
            return python_blocks[0].strip()
        
        match = re.search(r'(def solve\(I\):.*?)(?=\n\ndef|\n\nif __name__|$)', response, re.DOTALL)
        if match:
            return match.group(1).strip()
        
        return None
    except Exception as e:
        return None

def get_similar_programs(library: ProgramLibrary,
    task: Dict,
    n_workers: int = None,
    timeout: int = 2
) -> List[Dict]:
        
    similar_programs = library.find_similar(
        train_examples=task['train'],
        top_k=5,
        min_similarity=0.1,
        n_workers=n_workers,
        timeout=timeout
    )
    return similar_programs
    
def get_prompt_1(
    task: Dict,
    task_id: str,
    prompter: VLMPrompter,
    similar_programs: List[Dict] = None,
    verbose: bool = True,
    log_dir: str = "logs"
) -> str:
    
    Path(log_dir).mkdir(parents=True, exist_ok=True)
    
    if verbose:
        print(f"\n{'='*80}", flush=True)
        print(f"Solving Task: {task_id}", flush=True)
        print(f"{'='*80}", flush=True)
    
    # ====================================================================
    # STEP 1: Find Similar Programs by Execution (PARALLELIZED)
    # ====================================================================
    if verbose:
        print("\nüîç Finding similar programs by execution...", flush=True)
    

    if verbose:
        if similar_programs:
            print(f"   Found {len(similar_programs)} similar programs:", flush=True)
            for i, prog in enumerate(similar_programs[:3], 1):
                print(f"   {i}. Task {prog['task_id']}: {prog['similarity']:.2f}", flush=True)
        else:
            print("   No similar programs found", flush=True)
    
    # ====================================================================
    # TEST LIBRARY PROGRAMS: Try existing solutions first
    # ====================================================================
    best_library_score = 0.0
    best_library_program = None
    
    if similar_programs:
        best_match = similar_programs[0]
        best_library_score = best_match['similarity']
        best_library_program = best_match['program']
        if verbose:
            print(f"\n‚úì Best library match: Task {best_match['task_id']} ({best_library_score:.2f})", flush=True)
        
        if best_library_score == 1.0:
            if verbose:
                print(f"   ‚Üí Perfect match found! Using library solution.", flush=True)
            return TaskResult(
                task_id=task_id,
                success=True,
                score=1.0,
                program=best_library_program
            )
            
    # ====================================================================
    # PHASE 1: Pattern Discovery (Natural Language)
    # ====================================================================
    if verbose:
        print("\nüìù Phase 1: Pattern Discovery...", flush=True)
    
    # Pass the raw similar_programs list - prompter will format it
    phase1_prompt = prompter.build_phase1_prompt(task, similar_programs)
    return phase1_prompt


def get_prompt_2(
    phase1_output: str,
    similar_programs: List[Dict],
    task: Dict,
    task_id: str,
    prompter: VLMPrompter,
    verbose: bool = True,
    log_dir: str = "logs",
    
) -> str:
        
        # LOG PHASE 1 OUTPUT
        phase1_log_path = os.path.join(log_dir, f"{task_id}_phase1_output.txt")
        with open(phase1_log_path, 'w', encoding='utf-8') as f:
            f.write(f"Task ID: {task_id}\n")
            f.write("="*80 + "\n")
            f.write("PHASE 1: PATTERN DISCOVERY OUTPUT\n")
            f.write("="*80 + "\n\n")
            f.write(phase1_output)
        
        if verbose:
            print(f"   ‚úì Phase 1 complete ({len(phase1_output)} chars)", flush=True)
            print(f"   üìÑ Logged to: {phase1_log_path}", flush=True)
        
        # ====================================================================
        # PHASE 2: Code Generation
        # ====================================================================
        if verbose:
            print("\n‚öôÔ∏è  Phase 2: Code Generation...", flush=True)
        
        # Pass the raw similar_programs list - prompter will format it
        phase2_prompt = prompter.build_phase2_prompt(task, 
            phase1_output,
            similar_programs
        )
        
        return phase2_prompt
def get_prompt_str(phase_prompt):
    prompt_str = ""
    for i in phase_prompt:
        prompt_str += i["text"]
    return prompt_str

def solve_task(
    task: Dict,
    task_id: str,
    vlm_client_phase1: BaseVLMClient,
    vlm_client_phase2: BaseVLMClient,
    prompter: VLMPrompter,
    library: ProgramLibrary,
    verbose: bool = True,
    n_workers: int = None,
    timeout: int = 2,
    log_dir: str = "logs"
) -> TaskResult:
    """
    Solve a single ARC task using execution-based pipeline.
    
    Pipeline:
    1. Find similar programs by execution (parallelized)
    2. Phase 1: Pattern discovery (natural language) with similar programs
    3. Phase 2: Code generation with pattern + similar programs
    
    Args:
        task: Task dictionary with 'train' examples
        task_id: Unique task identifier
        vlm_client: VLM client for queries
        prompter: Prompt builder
        library: Program library
        verbose: Print progress
        n_workers: Number of parallel workers (None = auto)
        timeout: Timeout per program execution in seconds
        log_dir: Directory to save logs (default: "logs")
    """
    # Create log directory if it doesn't exist
    Path(log_dir).mkdir(parents=True, exist_ok=True)
    
    if verbose:
        print(f"\n{'='*80}", flush=True)
        print(f"Solving Task: {task_id}", flush=True)
        print(f"{'='*80}", flush=True)
    
    try:
        # ====================================================================
        # STEP 1: Find Similar Programs by Execution (PARALLELIZED)
        # ====================================================================
        if verbose:
            print("\nüîç Finding similar programs by execution...", flush=True)
        
        similar_programs = library.find_similar(
            train_examples=task['train'],
            top_k=5,
            min_similarity=0.1,
            n_workers=n_workers,
            timeout=timeout
        )
        
        if verbose:
            if similar_programs:
                print(f"   Found {len(similar_programs)} similar programs:", flush=True)
                for i, prog in enumerate(similar_programs[:3], 1):
                    print(f"   {i}. Task {prog['task_id']}: {prog['similarity']:.2f}", flush=True)
            else:
                print("   No similar programs found", flush=True)
        
        # ====================================================================
        # TEST LIBRARY PROGRAMS: Try existing solutions first
        # ====================================================================
        best_library_score = 0.0
        best_library_program = None
        
        if similar_programs:
            best_match = similar_programs[0]
            best_library_score = best_match['similarity']
            best_library_program = best_match['program']
            if verbose:
                print(f"\n‚úì Best library match: Task {best_match['task_id']} ({best_library_score:.2f})", flush=True)
            
            if best_library_score == 1.0:
                if verbose:
                    print(f"   ‚Üí Perfect match found! Using library solution.", flush=True)
                return TaskResult(
                    task_id=task_id,
                    success=True,
                    score=1.0,
                    program=best_library_program
                )
                
        # ====================================================================
        # PHASE 1: Pattern Discovery (Natural Language)
        # ====================================================================
        if verbose:
            print("\nüìù Phase 1: Pattern Discovery...", flush=True)
        
        # Pass the raw similar_programs list - prompter will format it
        phase1_prompt = prompter.build_phase1_prompt(task, similar_programs)
        
        phase1_output = vlm_client_phase1.query(
            phase1_prompt,
            system_prompt=""""You are an expert at analyzing ARC puzzles and discovering transformation patterns.

Remember: Your first hypothesis is sticky and excessively convincing to you.
Combat this by evolving your hypothesis and actively seeking evidence against your initial guess to avoid halluciantion.
"""
        )
        
        # LOG PHASE 1 OUTPUT
        phase1_log_path = os.path.join(log_dir, f"{task_id}_phase1_output.txt")
        with open(phase1_log_path, 'w', encoding='utf-8') as f:
            f.write(f"Task ID: {task_id}\n")
            f.write("="*80 + "\n")
            f.write("PHASE 1: PATTERN DISCOVERY OUTPUT\n")
            f.write("="*80 + "\n\n")
            f.write(phase1_output)
        
        if verbose:
            print(f"   ‚úì Phase 1 complete ({len(phase1_output)} chars)", flush=True)
            print(f"   üìÑ Logged to: {phase1_log_path}", flush=True)
        
        # ====================================================================
        # PHASE 2: Code Generation
        # ====================================================================
        if verbose:
            print("\n‚öôÔ∏è  Phase 2: Code Generation...", flush=True)
        
        # Pass the raw similar_programs list - prompter will format it
        phase2_prompt = prompter.build_phase2_prompt(task, 
            phase1_output,
            similar_programs
        )
        
        phase2_output = vlm_client_phase2.query(
            phase2_prompt,
            system_prompt="You are an expert at generating code using the given DSL primitives to solve ARC puzzles. You are provided with a natural language description of the pattern to implement, as well as training examples and some similar programs you might find useful as reference. Generate a Python function `def solve(I):` that implements the described transformation using ONLY the provided DSL primitives. Ensure your code is syntactically correct and follows best practices."
        )
        
        if verbose:
            print(f"   ‚úì Phase 2 complete ({len(phase2_output)} chars)", flush=True)
        
        # ====================================================================
        # EXTRACT AND TEST GENERATED CODE
        # ====================================================================
        if verbose:
            print("\nüß™ Testing generated program...", flush=True)
        
        generated_code = extract_code_from_response(phase2_output)
        
        if not generated_code:
            if verbose:
                print("   ‚úó Failed to extract code", flush=True)
            
            if best_library_program and best_library_score > 0.5:
                if verbose:
                    print(f"   ‚Üí Falling back to library (score: {best_library_score:.2f})", flush=True)
                return TaskResult(
                    task_id=task_id,
                    success=False,
                    score=best_library_score,
                    program=best_library_program,
                    phase1_output=phase1_output,
                    error="Code extraction failed, using library fallback"
                )
            
            return TaskResult(
                task_id=task_id,
                success=False,
                score=0.0,
                phase1_output=phase1_output,
                error="Failed to extract code from response"
            )
        
        score, results = test_program(generated_code, task)
        
        if verbose:
            print(f"   Generated score: {score:.2f}", flush=True)
        
        # LOG PHASE 2 OUTPUT WITH TEST RESULTS
        phase2_log_path = os.path.join(log_dir, f"{task_id}_phase2_results.txt")
        with open(phase2_log_path, 'w', encoding='utf-8') as f:
            f.write(f"Task ID: {task_id}\n")
            f.write("="*80 + "\n")
            f.write("PHASE 2: CODE GENERATION & TEST RESULTS\n")
            f.write("="*80 + "\n\n")
            
            f.write("GENERATED CODE:\n")
            f.write("-"*80 + "\n")
            f.write(generated_code + "\n")
            f.write("-"*80 + "\n\n")
            
            f.write(f"SCORE: {score:.2f}\n\n")
            
            f.write("TEST RESULTS:\n")
            f.write("-"*80 + "\n")
            for i, (expected, actual, passed) in enumerate(results, 1):
                f.write(f"\nExample {i}: {'‚úì PASS' if passed else '‚úó FAIL'}\n")
                f.write(f"Expected Output:\n")
                if expected:
                    f.write(f"{json.dumps([list(row) for row in expected], indent=2)}\n")
                else:
                    f.write("None\n")
                f.write(f"Actual Output:\n")
                if actual:
                    f.write(f"{json.dumps([list(row) for row in actual], indent=2)}\n")
                else:
                    f.write("None (execution failed)\n")
                f.write("-"*40 + "\n")
        
        if verbose:
            print(f"   üìÑ Logged to: {phase2_log_path}", flush=True)
        
        # ====================================================================
        # DECIDE FINAL PROGRAM
        # ====================================================================
        success = score == 1.0
        final_program = generated_code
        final_score = score
        
        if not success and best_library_score > score:
            if verbose:
                print(f"   ‚Üí Library program better ({best_library_score:.2f} > {score:.2f})", flush=True)
            final_program = best_library_program
            final_score = best_library_score
        
        # ====================================================================
        # SAVE TO LIBRARY IF SUCCESSFUL
        # ====================================================================
        if success:
            namespace = globals().copy()
            exec(final_program, namespace)
            if 'solve' in namespace:
                library.add(task_id, final_program)
                if verbose:
                    print(f"   ‚úì Added to library", flush=True)
        
        return TaskResult(
            task_id=task_id,
            success=success,
            score=final_score,
            program=final_program,
            phase1_output=phase1_output
        )
        
    except Exception as e:
        if verbose:
            print(f"   ‚úó Error: {e}", flush=True)
        
        return TaskResult(
            task_id=task_id,
            success=False,
            score=0.0,
            error=str(e)
        )


def process_directory(
    data_dir: str,
    llm_client: LLMClient,
    prompter: VLMPrompter,
    library: ProgramLibrary,
    verbose: bool = True,
    n_workers: int = None,
    timeout: int = 2,
    evaluate_on_n_pb: int = -1,
) -> List[TaskResult]:
    """
    Process all task files in a directory.
    
    Args:
        data_dir: Directory containing task JSON files
        vlm_client: VLM client for queries
        prompter: Prompt builder
        library: Program library
        verbose: Print progress
        n_workers: Number of parallel workers for library search (None = auto)
        timeout: Timeout per program execution in seconds
    """
    data_path = Path(data_dir)
    
    if not data_path.exists():
        print(f"Error: Directory not found: {data_dir}", flush=True)
        return []
    
    json_files = sorted(data_path.glob('*.json'))
    
    if not json_files:
        print(f"No JSON files found in {data_dir}", flush=True)
        return []
    
    print(f"\nFound {len(json_files)} tasks in {data_dir}\n", flush=True)
    if evaluate_on_n_pb != -1:
        json_files = json_files[:evaluate_on_n_pb]
        print (f"Evaluating on first {evaluate_on_n_pb} tasks\n", flush=True)
    results = []
    successful = 0
    total_score = 0.0
    list_idx = []
    list_task_id = []
    list_task = []
    list_prompt_1 = []
    
    list_similar_programs = []
    for i, task_file in enumerate(json_files, 1):
        task_id = task_file.stem
        
        try:
            with open(task_file, 'r') as f:
                task = json.load(f)

            similar_programs = get_similar_programs(
                library,
                task,
                n_workers=n_workers,
                timeout=timeout
            )
            list_similar_programs.append(similar_programs)
            list_idx.append(i)
            list_task_id.append(task_id)
            list_task.append(task)

            list_prompt_1.append(get_prompt_str(get_prompt_1(
                task,
                task_id,
                prompter,
                n_workers=n_workers,
                timeout=timeout,
                verbose=verbose
            )))
            
        except json.JSONDecodeError as e:
            print(f"‚úó [{i}/{len(json_files)}] {task_id}: Invalid JSON - {e}", flush=True)
        except Exception as e:
            print(f"‚úó [{i}/{len(json_files)}] {task_id}: {e}", flush=True)
    

    out_1 = llm_client.generate(list_prompt_1)
    out_1 = [resp[0] for resp in out_1]
    list_prompt_2 = []
    for i, task_id, task, phase1_output, similar_programs in zip(list_idx, list_task_id, list_task, out_1, list_similar_programs):
        phase2_prompt = get_prompt_str(get_prompt_2(
            phase1_output,
            similar_programs,
            task,
            task_id,
            prompter,
            verbose=verbose
        ))
        list_prompt_2.append(phase2_prompt)

    out_2 = llm_client.generate(list_prompt_2)
    out_2 = [resp[0] for resp in out_2]
    list_generated_code = [extract_code_from_response(resp) for resp in out_2]
    list_results = []
    for task_id, task, generated_code, phase1_output, phase2_output in zip(list_task_id, list_task, list_generated_code, out_1, out_2):
        if not generated_code:
            return TaskResult(
                task_id=task_id,
                success=False,
                score=0.0,
                phase1_output=phase1_output,
                phase2_output=phase2_output,
                error="Failed to extract code from response"
            )
        try:
            score, results = test_program(generated_code, task)

            success = score == 1.0
            result = TaskResult(
                task_id=task_id,
                success=success,
                score=score,
                program=generated_code,
                phase1_output=phase1_output,
                phase2_output=phase2_output
            )
        except Exception as e:
            result = TaskResult(
                task_id=task_id,
                success=False,
                score=0.0,
                phase1_output=phase1_output,
                phase2_output=phase2_output,
                error=str(e)
            )
        results.append(result)
        if result.success:
            successful += 1
        
        total_score += result.score
        
        status = "‚úì" if result.success else "‚úó"
        print(f"{status} [{i}/{len(json_files)}] {task_id}: {result.score:.2f}", flush=True)

    # Summary
    print(f"\n{'='*80}", flush=True)
    print(f"SUMMARY", flush=True)
    print(f"{'='*80}", flush=True)
    print(f"Total tasks: {len(json_files)}", flush=True)
    print(f"Successful: {successful}/{len(json_files)} ({100*successful/len(json_files):.1f}%)", flush=True)
    print(f"Average score: {total_score/len(json_files):.2f}", flush=True)
    print(f"{'='*80}\n", flush=True)
    
    return results


def save_results(results: List[TaskResult], output_dir: str = 'results') -> None:
    """Save results to JSON and CSV files."""
    import csv
    
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)
    
    # JSON
    json_file = output_path / 'results.json'
    with open(json_file, 'w') as f:
        json_data = [
            {
                'task_id': r.task_id,
                'success': r.success,
                'score': r.score,
                'error': r.error,
                'program': r.program,
            }
            for r in results
        ]
        json.dump(json_data, f, indent=2)
    print(f"Saved detailed results to {json_file}", flush=True)
    
    # CSV
    csv_file = output_path / 'summary.csv'
    with open(csv_file, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['task_id', 'success', 'score', 'error'])
        for r in results:
            writer.writerow([r.task_id, r.success, f'{r.score:.2f}', r.error or ''])
    print(f"Saved summary to {csv_file}", flush=True)


def main():
    from dotenv import load_dotenv
    """Main entry point"""
    # print("Initializing components...", flush=True)
    load_dotenv()
    PROVIDER = "grok"
    if PROVIDER == "grok":
        api_key = os.getenv('GROK_API_KEY')
        api_base = "https://api.x.ai/v1"
        model = "grok-4-fast"
    elif PROVIDER == "qwen":
        api_key = None
        api_base = "http://localhost:8000/v1"
        model = "Qwen/Qwen2.5-7B-Instruct"
    elif PROVIDER == "gemini":
        api_key = os.getenv('GEMINI_API_KEY')
        api_base = "https://generativelanguage.googleapis.com/v1beta/models/"
        model = "gemini-2.5-pro"
        
    vlm_config_phase1 = VLMConfig(
        api_key=api_key,
        model=model,
        api_base=api_base,
        max_tokens=16384,  # Longer for analysis
        save_prompts=False,
        prompt_log_dir="prompts_testing"
    )
    vlm_config_phase2 = VLMConfig(
        api_key=api_key,
        model=model,
        api_base=api_base,
        max_tokens=8192   # Shorter for code gen
    )
    
    vlm_client_phase1 = create_client(PROVIDER, config=vlm_config_phase1)
    # print("VLM client created", flush=True)
    
    vlm_client_phase2 = create_client(PROVIDER, config=vlm_config_phase2)
    prompter = VLMPrompter()
    # print("Prompter created", flush=True)
    
    library = ProgramLibrary()  # Auto-loads from solvers.py
    # print("Loaded library...", flush=True)
    #sanity check
    print(f"Loaded {len(library)} programs from library", flush=True)
    if len(library) > 0:
        print(f"First program: {library.programs[0]['task_id']}", flush=True)
    
    # Configure parallelization
    results = process_directory(
        data_dir='data_v1/eval_size_10',
        vlm_client_phase1=vlm_client_phase1,
        vlm_client_phase2=vlm_client_phase2,
        prompter=prompter,
        library=library,
        verbose=True,
        n_workers=None,  # Auto-detect CPUs (recommended)
        timeout=2        # 2 second timeout per program
    )
    
    save_results(results, output_dir='results/images')#TODO change output dir


# if __name__ == "__main__":
#     # print("Starting main...", flush=True)
#     sys.stdout.flush()
#     main()

In [6]:
from dotenv import load_dotenv
load_dotenv()
api_key = None
api_base = "http://localhost:8000/v1"
model = "Qwen/Qwen2.5-7B-Instruct"
PROVIDER = "qwen"
vlm_config_phase1 = VLMConfig(
    api_key=api_key,
    model=model,
    api_base=api_base,
    max_tokens=16384,  # Longer for analysis
    save_prompts=False,
    prompt_log_dir="prompts_testing"
)
vlm_config_phase2 = VLMConfig(
    api_key=api_key,
    model=model,
    api_base=api_base,
    max_tokens=8192   # Shorter for code gen
)
vlm_client_phase1 = create_client(PROVIDER, config=vlm_config_phase1)
# print("VLM client created", flush=True)

vlm_client_phase2 = create_client(PROVIDER, config=vlm_config_phase2)
prompter = VLMPrompter(use_vision=False)

library = ProgramLibrary()  # Auto-loads from solvers.py



In [9]:
print(prompter._get_phase2_dsl_section())


## DSL Primitives

**Type Definitions:**
Grid: Tuple[Tuple[int, ...], ...] - Immutable 2D array (tuple of tuples)
Object: FrozenSet[(int, (int, int))] - Set of (color, location) pairs
Patch: FrozenSet[(int, int)] or Object - Set of indices or colored object
Indices: FrozenSet[(int, int)] - Set of (row, col) positions
Objects: FrozenSet[Object] - Set of objects
Container: Tuple or FrozenSet - Generic container type
IntegerTuple: (int, int) - Tuple of integers (usually coordinates or dimensions)
Callable: Function type

**Functional Programming:**
```python
compose(outer, inner) -> Callable                    # outer(inner(x))
chain(h, g, f) -> Callable                           # h(g(f(x)))
combine_two_function_results(outer, f1, f2) -> Callable  # outer(f1(x), f2(x))
transform(f, container) -> Container                 # map
transform_and_flatten(f, container) -> FrozenSet     # map + flatten
transform_both(f, a, b) -> Tuple                     # pairwise map over tuples
transform_bot

In [None]:
prompt_dsl = prompter._get_phase2_dsl_section()
"""You are an expert at solving ARC puzzles using program synthesis given a DSL.
Your goal is given a task, generate a Python `solve(I)` function using ONLY the DSL primitives below.\n\n
{prompt_dsl}

Here are some examples of ARC tasks and their corresponding `solve(I)` functions:

"""

# content_blocks.extend(self._format_training_examples(task['train']))
# content_blocks.extend(self._format_test_examples(task['test']))

In [5]:
print(f"Loaded {len(library)} programs from library", flush=True)
if len(library) > 0:
    print(f"First program: {library.programs[0]['task_id']}", flush=True)


Loaded 400 programs from library
First program: 007bbfb7


In [6]:
data_path = "/home/flowers/work/llms_ftw/tasks/evaluation/"
data_path = Path(data_path)
json_files = sorted(data_path.glob('*.json'))
json_files[:10]

[PosixPath('/home/flowers/work/llms_ftw/tasks/evaluation/0934a4d8.json'),
 PosixPath('/home/flowers/work/llms_ftw/tasks/evaluation/135a2760.json'),
 PosixPath('/home/flowers/work/llms_ftw/tasks/evaluation/136b0064.json'),
 PosixPath('/home/flowers/work/llms_ftw/tasks/evaluation/13e47133.json'),
 PosixPath('/home/flowers/work/llms_ftw/tasks/evaluation/142ca369.json'),
 PosixPath('/home/flowers/work/llms_ftw/tasks/evaluation/16b78196.json'),
 PosixPath('/home/flowers/work/llms_ftw/tasks/evaluation/16de56c4.json'),
 PosixPath('/home/flowers/work/llms_ftw/tasks/evaluation/1818057f.json'),
 PosixPath('/home/flowers/work/llms_ftw/tasks/evaluation/195c6913.json'),
 PosixPath('/home/flowers/work/llms_ftw/tasks/evaluation/1ae2feb7.json')]

In [7]:
data_dir = "/home/flowers/work/llms_ftw/tasks/evaluation/"
verbose = True
n_workers = None
timeout = 2
evaluate_on_n_pb = 1


data_path = Path(data_dir)


json_files = sorted(data_path.glob('*.json'))


print(f"\nFound {len(json_files)} tasks in {data_dir}\n", flush=True)
if evaluate_on_n_pb != -1:
    json_files = json_files[:evaluate_on_n_pb]
    print (f"Evaluating on first {evaluate_on_n_pb} tasks\n", flush=True)
results = []
successful = 0
total_score = 0.0

# for loop here
i=1
task_file = json_files[0]
task_id = task_file.stem
with open(task_file, 'r') as f:
    task = json.load(f)

task=task
task_id=task_id
# vlm_client_phase1=vlm_client_phase1
# vlm_client_phase2=vlm_client_phase2
# prompter=prompter
# library=library
verbose=verbose
n_workers=n_workers
timeout=timeout
log_dir="logs"#TODO change log dir

# result = solve_task(
# task=task,
# task_id=task_id,
# vlm_client_phase1=vlm_client_phase1,
# vlm_client_phase2=vlm_client_phase2,
# prompter=prompter,
# library=library,
# verbose=verbose,
# n_workers=n_workers,
# timeout=timeout,
# log_dir="logs_images"#TODO change log dir
# )



Found 120 tasks in /home/flowers/work/llms_ftw/tasks/evaluation/

Evaluating on first 1 tasks



In [8]:
similar_programs = library.find_similar(
    train_examples=task['train'],
    top_k=5,
    min_similarity=0.1,
    n_workers=n_workers,
    timeout=timeout
)



=== DEBUG find_similar ===
Number of train examples: 4
Number of programs in library: 400
min_similarity threshold: 0.1
top_k: 5
Using 28 parallel workers (timeout: 2s per program)
Progress: 50/400 programs (443.9/s, 0.1s elapsed)
Progress: 100/400 programs (398.8/s, 0.3s elapsed)
Progress: 150/400 programs (490.4/s, 0.3s elapsed)
Progress: 200/400 programs (554.7/s, 0.4s elapsed)
Progress: 250/400 programs (591.1/s, 0.4s elapsed)
Progress: 300/400 programs (628.6/s, 0.5s elapsed)
Progress: 350/400 programs (599.6/s, 0.6s elapsed)
Progress: 400/400 programs (46.8/s, 8.5s elapsed)

Evaluation complete: 400/400 programs in 8.6s

=== Execution Statistics ===
Programs with errors: 29/400
Max similarity achieved: 0.110
Average similarity: 0.007

Sample errors from first 3 programs:
  228f6490: StopIteration: 
  272f95fa: StopIteration: 
  3de23699: StopIteration: 

Top 5 programs by similarity:
  1cf80156: 0.110 (errors: 0/4)
  ce4f8723: 0.103 (errors: 0/4)
  f25fbde4: 0.101 (errors: 0/4)


In [10]:
best_library_score = 0.0
best_library_program = None

if similar_programs:
    best_match = similar_programs[0]
    best_library_score = best_match['similarity']
    best_library_program = best_match['program']
    if verbose:
        print(f"\n‚úì Best library match: Task {best_match['task_id']} ({best_library_score:.2f})", flush=True)
    



‚úì Best library match: Task 1cf80156 (0.11)


In [11]:
phase1_prompt = prompter.build_phase1_prompt(task, similar_programs)
# phase1_output = vlm_client_phase1.query(
#     phase1_prompt,
#     system_prompt=""""You are an expert at analyzing ARC puzzles and discovering transformation patterns.

# Remember: Your first hypothesis is sticky and excessively convincing to you.
# Combat this by evolving your hypothesis and actively seeking evidence against your initial guess to avoid halluciantion.
# """
# )


In [12]:
prompt_str = ""
for i in phase1_prompt:
    prompt_str += i["text"]
print(prompt_str)


## Training Examples
Below are 4 training examples follwed by the test example(s) you have to generalize to:
 for each example, the input grid is shown first, followed by the output grid. 
.
Example 1:
Input:

ASCII representation:
3|5|3|3|6|6|5|4|1|4|9|9|4|3|9|9|9|9|3|4|9|9|4|1|4|5|6|6|3|3
5|3|3|3|6|6|4|5|4|1|9|9|3|4|9|1|1|9|4|3|9|9|1|4|5|4|6|6|3|3
1|1|3|5|5|4|6|6|9|1|1|4|9|9|4|5|5|4|9|9|4|1|1|9|6|6|4|5|5|3
1|1|5|3|4|5|6|6|1|9|4|1|9|1|4|4|4|4|1|9|1|4|9|1|6|6|5|4|3|5
6|9|9|9|3|5|3|3|4|3|9|9|9|2|6|9|9|6|2|9|9|9|3|4|3|3|5|3|9|9
9|6|9|9|5|3|3|3|3|4|9|1|9|9|9|6|6|9|9|9|1|9|4|3|3|3|3|5|9|9
9|9|6|9|1|1|3|5|9|9|4|4|6|9|9|2|2|9|9|6|4|4|9|9|5|3|1|1|9|6
9|9|9|6|1|1|5|3|9|1|5|4|9|6|9|9|9|9|6|9|4|5|1|9|3|5|1|1|6|9
1|4|9|1|4|3|9|9|5|5|7|2|4|3|2|4|4|2|3|4|2|7|5|5|9|9|3|4|1|9
4|1|1|9|3|4|9|1|4|5|2|7|3|4|4|2|2|4|4|3|7|2|5|4|1|9|4|3|9|1
9|9|1|4|9|9|4|5|6|4|5|5|2|4|4|3|3|4|4|2|5|5|4|6|5|4|9|9|4|1
9|9|4|1|9|1|4|4|4|5|4|5|4|2|3|4|4|3|2|4|5|4|5|4|4|4|1|9|1|4
4|3|9|9|9|9|6|9|5|9|7|7|5|5|7|2|2|7|5|5|7|7|9|5|

In [None]:
path_train= "/home/flowers/work/llms_ftw/tasks/training/"
list_task_train_id = ["29c11459",
"4612dd53",
"b782dc8a"]

list_task_train = []
for task_id in list_task_train_id:
    with open(path_train + task_id + ".json", "r") as f:
        task_data = json.load(f)
        list_task_train.append(task_data)



In [22]:
print(get_prompt_str(prompter._format_training_examples(list_task_train[2]['train'])))
print(get_prompt_str(prompter._format_test_examples(list_task_train[2]['test'])))


Below are 2 training examples follwed by the test example(s) you have to generalize to:
 for each example, the input grid is shown first, followed by the output grid. 
.
Example 1:
Input:

ASCII representation:
8|0|0|0|0|0|8|8|8|8|8|8|0|8|8|8|0|8|8|0|8|8|8|0
0|0|8|8|8|0|0|0|0|0|0|8|0|0|0|8|0|8|0|0|8|0|8|0
8|8|8|0|8|0|8|8|8|8|0|8|8|8|0|8|0|8|8|8|8|0|8|0
8|0|0|0|8|0|8|0|0|8|0|0|0|8|0|8|0|0|0|0|0|0|8|0
8|0|8|8|8|0|8|8|0|8|0|8|8|8|0|8|8|0|8|8|8|8|8|0
8|0|8|0|0|0|0|8|0|8|0|8|0|0|0|0|8|0|8|0|0|0|0|0
8|0|8|8|8|8|8|8|0|8|0|8|8|8|8|8|8|3|8|8|8|8|8|0
8|0|0|0|0|0|0|0|0|8|0|0|0|0|0|0|3|2|3|0|0|0|8|0
8|8|0|8|8|8|0|8|8|8|0|8|8|8|8|8|8|3|8|8|8|0|8|0
0|8|0|8|0|8|0|8|0|0|0|8|0|0|0|0|8|0|8|0|8|0|8|0
0|8|8|8|0|8|8|8|0|8|8|8|0|8|8|0|8|8|8|0|8|8|8|0

Output:

ASCII representation:
8|3|2|3|2|3|8|8|8|8|8|8|0|8|8|8|2|8|8|0|8|8|8|0
3|2|8|8|8|2|3|2|3|2|3|8|0|0|0|8|3|8|0|0|8|2|8|0
8|8|8|0|8|3|8|8|8|8|2|8|8|8|0|8|2|8|8|8|8|3|8|0
8|0|0|0|8|2|8|0|0|8|3|2|3|8|0|8|3|2|3|2|3|2|8|0
8|0|8|8|8|3|8|8|0|8|2|8|8|8|0|8|8|3|8

In [None]:
fs_ex="""
---------------------------
# Task 1:

Below are 2 training examples follwed by the test example(s) you have to generalize to, for each example, the input grid is shown first, followed by the output grid. 

Example 1:
Input:

ASCII representation:
0|0|0|0|0|0|0|0|0|0|0
1|0|0|0|0|0|0|0|0|0|2
0|0|0|0|0|0|0|0|0|0|0
0|0|0|0|0|0|0|0|0|0|0
0|0|0|0|0|0|0|0|0|0|0

Output:

ASCII representation:
0|0|0|0|0|0|0|0|0|0|0
1|1|1|1|1|5|2|2|2|2|2
0|0|0|0|0|0|0|0|0|0|0
0|0|0|0|0|0|0|0|0|0|0
0|0|0|0|0|0|0|0|0|0|0

Example 2:
Input:

ASCII representation:
0|0|0|0|0|0|0|0|0|0|0
0|0|0|0|0|0|0|0|0|0|0
0|0|0|0|0|0|0|0|0|0|0
3|0|0|0|0|0|0|0|0|0|7
0|0|0|0|0|0|0|0|0|0|0

Output:

ASCII representation:
0|0|0|0|0|0|0|0|0|0|0
0|0|0|0|0|0|0|0|0|0|0
0|0|0|0|0|0|0|0|0|0|0
3|3|3|3|3|5|7|7|7|7|7
0|0|0|0|0|0|0|0|0|0|0


============================================================
TEST EXAMPLES (to solve)
============================================================
Below are 1 test example(s) you need to solve:
For each test example, only the input grid is provided. You must determine the output.

Test Example 1:
Input:

ASCII representation:
0|0|0|0|0|0|0|0|0|0|0
4|0|0|0|0|0|0|0|0|0|8
0|0|0|0|0|0|0|0|0|0|0
0|0|0|0|0|0|0|0|0|0|0
6|0|0|0|0|0|0|0|0|0|9

Output: [TO BE DETERMINED]

Solution:
```python
def solve(I):
    x1 = left_half(I)
    x2 = right_half(I)
    x3 = as_objects(x2, True, False, True)
    x4 = as_objects(x1, True, False, True)
    x5 = compose(horizontal_line, center)
    x6 = combine_two_function_results(recolor, get_color, x5)
    x7 = transform_and_flatten(x6, x4)
    x8 = paint_onto_grid(x1, x7)
    x9 = transform_and_flatten(x6, x3)
    x10 = paint_onto_grid(I, x9)
    x11 = as_objects(x8, True, False, True)
    x12 = transform(upper_right_corner, x11)
    x13 = shift_by_vector(x12, RIGHT)
    x14 = flatten(x11)
    x15 = paint_onto_grid(x10, x14)
    O = fill(x15, COLOR_FIVE, x13)
    return O
```

---------------------------
# Task 2:

Below are 3 training examples follwed by the test example(s) you have to generalize to, for each example, the input grid is shown first, followed by the output grid. 

Example 1:
Input:

ASCII representation:
0|0|0|0|0|0|0|0|0|0|0|0|0
0|0|1|0|1|0|0|1|1|0|1|0|0
0|0|1|0|0|0|0|0|0|0|0|0|0
0|0|0|0|0|0|0|0|0|0|1|0|0
0|0|0|0|0|0|0|0|0|0|0|0|0
0|0|0|0|0|0|0|0|0|0|1|0|0
0|0|1|0|0|0|0|0|0|0|1|0|0
0|0|1|1|0|0|1|1|0|1|1|0|0
0|0|0|0|0|0|0|0|0|0|0|0|0

Output:

ASCII representation:
0|0|0|0|0|0|0|0|0|0|0|0|0
0|0|1|2|1|2|2|1|1|2|1|0|0
0|0|1|0|0|0|0|0|0|0|2|0|0
0|0|2|0|0|0|0|0|0|0|1|0|0
0|0|2|0|0|0|0|0|0|0|2|0|0
0|0|2|0|0|0|0|0|0|0|1|0|0
0|0|1|0|0|0|0|0|0|0|1|0|0
0|0|1|1|2|2|1|1|2|1|1|0|0
0|0|0|0|0|0|0|0|0|0|0|0|0

Example 2:
Input:

ASCII representation:
0|0|0|0|0|0|0|0|0|0|0|0|0
0|0|0|0|0|0|0|0|0|0|0|0|0
0|0|1|1|1|0|0|1|1|0|0|0|0
0|0|1|0|0|0|0|0|1|0|0|0|0
0|0|0|0|1|0|0|0|0|0|0|0|0
0|0|1|0|1|0|0|0|1|0|0|0|0
0|0|1|0|0|0|0|0|1|0|0|0|0
0|0|0|0|1|0|0|0|1|0|0|0|0
0|0|1|1|1|1|0|1|0|0|0|0|0
0|0|0|0|0|0|0|0|0|0|0|0|0
0|0|0|0|0|0|0|0|0|0|0|0|0

Output:

ASCII representation:
0|0|0|0|0|0|0|0|0|0|0|0|0
0|0|0|0|0|0|0|0|0|0|0|0|0
0|0|1|1|1|2|2|1|1|0|0|0|0
0|0|1|0|2|0|0|0|1|0|0|0|0
0|0|2|0|1|0|0|0|2|0|0|0|0
0|0|1|0|1|0|0|0|1|0|0|0|0
0|0|1|0|2|0|0|0|1|0|0|0|0
0|0|2|0|1|0|0|0|1|0|0|0|0
0|0|1|1|1|1|2|1|2|0|0|0|0
0|0|0|0|0|0|0|0|0|0|0|0|0
0|0|0|0|0|0|0|0|0|0|0|0|0

Example 3:
Input:

ASCII representation:
0|0|0|0|0|0|0|0|0|0|0|0|0
0|0|0|0|0|0|0|0|0|0|0|0|0
0|0|0|0|0|0|0|0|0|0|0|0|0
0|0|1|1|0|1|1|0|1|1|1|0|0
0|0|1|0|0|0|0|0|0|0|1|0|0
0|0|0|0|0|0|0|0|0|0|0|0|0
0|0|1|0|0|0|0|0|0|0|1|0|0
0|0|1|1|0|1|0|1|1|0|0|0|0
0|0|1|0|0|0|0|0|0|0|1|0|0
0|0|0|0|0|0|0|0|0|0|1|0|0
0|0|1|1|0|1|1|0|0|1|1|0|0
0|0|0|0|0|0|0|0|0|0|0|0|0
0|0|0|0|0|0|0|0|0|0|0|0|0

Output:

ASCII representation:
0|0|0|0|0|0|0|0|0|0|0|0|0
0|0|0|0|0|0|0|0|0|0|0|0|0
0|0|0|0|0|0|0|0|0|0|0|0|0
0|0|1|1|2|1|1|2|1|1|1|0|0
0|0|1|0|0|0|0|0|0|0|1|0|0
0|0|2|0|0|0|0|0|0|0|2|0|0
0|0|1|0|0|0|0|0|0|0|1|0|0
0|0|1|1|2|1|2|1|1|2|2|0|0
0|0|1|0|0|0|0|0|0|0|1|0|0
0|0|2|0|0|0|0|0|0|0|1|0|0
0|0|1|1|2|1|1|2|2|1|1|0|0
0|0|0|0|0|0|0|0|0|0|0|0|0
0|0|0|0|0|0|0|0|0|0|0|0|0


============================================================
TEST EXAMPLES (to solve)
============================================================
Below are 1 test example(s) you need to solve:
For each test example, only the input grid is provided. You must determine the output.

Test Example 1:
Input:

ASCII representation:
0|0|0|0|0|0|0|0|0|0|0|0|0
0|0|0|0|0|0|0|0|0|0|0|0|0
0|0|1|0|1|1|0|1|0|1|1|0|0
0|0|1|0|0|0|0|0|0|0|1|0|0
0|0|0|0|0|0|0|0|0|0|0|0|0
0|0|0|0|0|0|0|0|0|0|0|0|0
0|0|1|0|0|0|0|0|0|0|1|0|0
0|0|1|0|1|0|1|0|0|1|1|0|0
0|0|0|0|0|0|0|0|0|0|0|0|0
0|0|1|0|0|0|0|0|0|0|1|0|0
0|0|1|0|1|1|0|1|0|1|1|0|0
0|0|0|0|0|0|0|0|0|0|0|0|0
0|0|0|0|0|0|0|0|0|0|0|0|0

Output: [TO BE DETERMINED]

Solution:
```python
def solve(I):
    x1 = of_color(I, COLOR_ONE)
    x2 = box(x1)
    x3 = fill(I, COLOR_TWO, x2)
    x4 = smallest_subgrid_containing(x1, x3)
    x5 = of_color(x4, COLOR_ONE)
    x6 = transform_and_flatten(vertical_line, x5)
    x7 = transform_and_flatten(horizontal_line, x5)
    x8 = size(x6)
    x9 = size(x7)
    x10 = greater_than(x8, x9)
    x11 = condition_if_else(x10, x7, x6)
    x12 = fill(x4, COLOR_TWO, x11)
    x13 = of_color(x12, COLOR_TWO)
    x14 = upper_left_corner(x1)
    x15 = shift_by_vector(x13, x14)
    O = fill_background(I, COLOR_TWO, x15)
    return O
```

---------------------------
# Task 3:

Below are 2 training examples follwed by the test example(s) you have to generalize to:
 for each example, the input grid is shown first, followed by the output grid. 
.
Example 1:
Input:

ASCII representation:
8|0|0|0|0|0|8|8|8|8|8|8|0|8|8|8|0|8|8|0|8|8|8|0
0|0|8|8|8|0|0|0|0|0|0|8|0|0|0|8|0|8|0|0|8|0|8|0
8|8|8|0|8|0|8|8|8|8|0|8|8|8|0|8|0|8|8|8|8|0|8|0
8|0|0|0|8|0|8|0|0|8|0|0|0|8|0|8|0|0|0|0|0|0|8|0
8|0|8|8|8|0|8|8|0|8|0|8|8|8|0|8|8|0|8|8|8|8|8|0
8|0|8|0|0|0|0|8|0|8|0|8|0|0|0|0|8|0|8|0|0|0|0|0
8|0|8|8|8|8|8|8|0|8|0|8|8|8|8|8|8|3|8|8|8|8|8|0
8|0|0|0|0|0|0|0|0|8|0|0|0|0|0|0|3|2|3|0|0|0|8|0
8|8|0|8|8|8|0|8|8|8|0|8|8|8|8|8|8|3|8|8|8|0|8|0
0|8|0|8|0|8|0|8|0|0|0|8|0|0|0|0|8|0|8|0|8|0|8|0
0|8|8|8|0|8|8|8|0|8|8|8|0|8|8|0|8|8|8|0|8|8|8|0

Output:

ASCII representation:
8|3|2|3|2|3|8|8|8|8|8|8|0|8|8|8|2|8|8|0|8|8|8|0
3|2|8|8|8|2|3|2|3|2|3|8|0|0|0|8|3|8|0|0|8|2|8|0
8|8|8|0|8|3|8|8|8|8|2|8|8|8|0|8|2|8|8|8|8|3|8|0
8|0|0|0|8|2|8|0|0|8|3|2|3|8|0|8|3|2|3|2|3|2|8|0
8|0|8|8|8|3|8|8|0|8|2|8|8|8|0|8|8|3|8|8|8|8|8|0
8|0|8|2|3|2|3|8|0|8|3|8|0|0|0|0|8|2|8|0|0|0|0|0
8|0|8|8|8|8|8|8|0|8|2|8|8|8|8|8|8|3|8|8|8|8|8|0
8|0|0|0|0|0|0|0|0|8|3|2|3|2|3|2|3|2|3|2|3|2|8|0
8|8|0|8|8|8|0|8|8|8|2|8|8|8|8|8|8|3|8|8|8|3|8|0
0|8|0|8|0|8|0|8|3|2|3|8|0|0|0|0|8|2|8|0|8|2|8|0
0|8|8|8|0|8|8|8|2|8|8|8|0|8|8|0|8|8|8|0|8|8|8|0

Example 2:
Input:

ASCII representation:
0|0|0|8|0|0|0|8|0|0|0|0|0|8
8|8|0|8|8|8|0|8|0|8|8|8|0|8
0|8|0|0|0|8|0|8|0|8|0|8|8|8
0|8|8|8|8|8|0|8|0|8|0|0|0|0
0|0|0|0|0|0|0|8|0|8|8|8|0|8
8|8|8|8|8|8|0|8|0|0|0|8|0|8
8|0|0|0|0|8|0|8|8|8|0|8|0|8
8|8|8|8|0|8|0|0|0|8|0|8|0|0
0|0|0|8|1|8|8|8|8|8|0|8|8|0
8|8|0|8|4|1|0|0|0|0|0|0|8|0
0|8|0|8|1|8|8|8|8|8|8|8|8|0
0|8|8|8|0|8|0|0|0|0|0|0|0|0
0|0|0|0|0|8|0|8|8|8|8|8|8|8

Output:

ASCII representation:
0|0|0|8|0|0|0|8|1|4|1|4|1|8
8|8|0|8|8|8|0|8|4|8|8|8|4|8
0|8|0|0|0|8|0|8|1|8|0|8|8|8
0|8|8|8|8|8|0|8|4|8|0|0|0|0
0|0|0|0|0|0|0|8|1|8|8|8|0|8
8|8|8|8|8|8|0|8|4|1|4|8|0|8
8|4|1|4|1|8|0|8|8|8|1|8|0|8
8|8|8|8|4|8|0|0|0|8|4|8|0|0
0|0|0|8|1|8|8|8|8|8|1|8|8|0
8|8|0|8|4|1|4|1|4|1|4|1|8|0
1|8|0|8|1|8|8|8|8|8|8|8|8|0
4|8|8|8|4|8|0|0|0|0|0|0|0|0
1|4|1|4|1|8|0|8|8|8|8|8|8|8


============================================================
TEST EXAMPLES (to solve)
============================================================
Below are 1 test example(s) you need to solve:
For each test example, only the input grid is provided. You must determine the output.

Test Example 1:
Input:

ASCII representation:
8|8|0|8|0|0|8|0|0|0|0|0|0|0|0
0|8|0|8|8|8|8|4|8|8|8|8|8|8|8
0|8|0|0|0|0|4|3|8|0|0|0|0|0|8
0|8|8|8|8|8|8|4|8|8|8|0|8|8|8
0|0|0|0|0|0|8|0|0|0|8|0|8|0|0
8|8|8|8|8|0|8|8|8|0|8|0|8|0|8
0|0|0|0|8|0|0|0|8|0|8|0|8|0|8
8|8|8|0|8|8|8|0|8|0|8|0|8|8|8
0|0|8|0|0|0|8|0|8|0|8|0|0|0|0
8|0|8|8|8|0|8|8|8|0|8|8|8|0|8
8|0|0|0|8|0|0|0|0|0|0|0|8|0|8
8|8|8|0|8|0|8|8|8|8|8|8|8|0|8
0|0|8|0|8|0|8|0|0|0|0|0|0|0|8
8|0|8|8|8|0|8|0|8|8|8|8|8|8|8
8|0|0|0|0|0|8|0|8|0|0|0|0|0|0

Output: [TO BE DETERMINED]

Solution:
```python
def solve_b782dc8a(I):
    x1 = least_common_color(I)
    x2 = as_objects(I, True, False, False)
    x3 = of_color(I, x1)
    x4 = get_first(x3)
    x5 = direct_neighbors(x4)
    x6 = to_object(x5, I)
    x7 = most_common_color(x6)
    x8 = of_color(I, x7)
    x9 = color_filter(x2, COLOR_ZERO)
    x10 = fix_last_argument(adjacent, x8)
    x11 = keep_if_condition_and_flatten(x9, x10)
    x12 = to_indices(x11)
    x13 = fix_last_argument(manhattan_distance, x3)
    x14 = chain(is_even, x13, initset)
    x15 = keep_if_condition(x12, x14)
    x16 = difference(x12, x15)
    x17 = fill(I, x1, x15)
    O = fill(x17, x7, x16)
    return O
```

---------------------------
Now you need to solve the following Task:
"""

In [None]:
def solve_29c11459(I):
    x1 = left_half(I)
    x2 = right_half(I)
    x3 = as_objects(x2, True, False, True)
    x4 = as_objects(x1, True, False, True)
    x5 = compose(horizontal_line, center)
    x6 = combine_two_function_results(recolor, get_color, x5)
    x7 = transform_and_flatten(x6, x4)
    x8 = paint_onto_grid(x1, x7)
    x9 = transform_and_flatten(x6, x3)
    x10 = paint_onto_grid(I, x9)
    x11 = as_objects(x8, True, False, True)
    x12 = transform(upper_right_corner, x11)
    x13 = shift_by_vector(x12, RIGHT)
    x14 = flatten(x11)
    x15 = paint_onto_grid(x10, x14)
    O = fill(x15, COLOR_FIVE, x13)
    return O


def solve_4612dd53(I):
    x1 = of_color(I, COLOR_ONE)
    x2 = box(x1)
    x3 = fill(I, COLOR_TWO, x2)
    x4 = smallest_subgrid_containing(x1, x3)
    x5 = of_color(x4, COLOR_ONE)
    x6 = transform_and_flatten(vertical_line, x5)
    x7 = transform_and_flatten(horizontal_line, x5)
    x8 = size(x6)
    x9 = size(x7)
    x10 = greater_than(x8, x9)
    x11 = condition_if_else(x10, x7, x6)
    x12 = fill(x4, COLOR_TWO, x11)
    x13 = of_color(x12, COLOR_TWO)
    x14 = upper_left_corner(x1)
    x15 = shift_by_vector(x13, x14)
    O = fill_background(I, COLOR_TWO, x15)
    return O


def solve_b782dc8a(I):
    x1 = least_common_color(I)
    x2 = as_objects(I, True, False, False)
    x3 = of_color(I, x1)
    x4 = get_first(x3)
    x5 = direct_neighbors(x4)
    x6 = to_object(x5, I)
    x7 = most_common_color(x6)
    x8 = of_color(I, x7)
    x9 = color_filter(x2, COLOR_ZERO)
    x10 = fix_last_argument(adjacent, x8)
    x11 = keep_if_condition_and_flatten(x9, x10)
    x12 = to_indices(x11)
    x13 = fix_last_argument(manhattan_distance, x3)
    x14 = chain(is_even, x13, initset)
    x15 = keep_if_condition(x12, x14)
    x16 = difference(x12, x15)
    x17 = fill(I, x1, x15)
    O = fill(x17, x7, x16)
    return O



In [13]:
out = llm_client.generate([prompt_str])

send 1 / 1 messages
[2025-11-07 15:45:35] Prefill batch, #new-seq: 1, #new-token: 2048, #cached-token: 0, token usage: 0.00, #running-req: 0, #queue-req: 0, 
[2025-11-07 15:45:35] Prefill batch, #new-seq: 1, #new-token: 2048, #cached-token: 0, token usage: 0.03, #running-req: 0, #queue-req: 0, 
[2025-11-07 15:45:35] Prefill batch, #new-seq: 1, #new-token: 2048, #cached-token: 0, token usage: 0.06, #running-req: 0, #queue-req: 0, 
[2025-11-07 15:45:35] Prefill batch, #new-seq: 1, #new-token: 2048, #cached-token: 0, token usage: 0.09, #running-req: 0, #queue-req: 0, 
[2025-11-07 15:45:36] Prefill batch, #new-seq: 1, #new-token: 2048, #cached-token: 0, token usage: 0.12, #running-req: 0, #queue-req: 0, 
[2025-11-07 15:45:36] Prefill batch, #new-seq: 1, #new-token: 64, #cached-token: 0, token usage: 0.15, #running-req: 0, #queue-req: 0, 
[2025-11-07 15:45:37] Decode batch, #running-req: 1, #token: 10337, token usage: 0.16, cuda graph: True, gen throughput (token/s): 0.36, #queue-req: 0, 
[

  print(' /!\ Server is running /!\ ')


KeyboardInterrupt: 

nanobind: leaked 1 instances!
 - leaked instance 0x7f38b31ad4b8 of type "xgrammar.xgrammar_bindings.GrammarCompiler"
nanobind: leaked 1 types!
 - leaked type "xgrammar.xgrammar_bindings.GrammarCompiler"
nanobind: leaked 9 functions!
 - leaked function "compile_regex"
 - leaked function ""
 - leaked function "compile_builtin_json_grammar"
 - leaked function "compile_grammar"
 - leaked function "compile_json_schema"
 - leaked function "clear_cache"
 - leaked function "get_cache_size_bytes"
 - leaked function "__init__"
 - leaked function "compile_structural_tag"
nanobind: this is likely caused by a reference counting issue in the binding code.


In [11]:
phase1_prompt

[{'type': 'text', 'text': '## Training Examples\n'},
 {'type': 'text',
  'text': 'Below are 4 training examples:\n for each example, the input grid is shown first, followed by the output grid.\n.'},
 {'type': 'text', 'text': '\nExample 1:\n'},
 {'type': 'text', 'text': 'Input:\n'},
 {'type': 'text',
  'text': '\nASCII representation:\n3|5|3|3|6|6|5|4|1|4|9|9|4|3|9|9|9|9|3|4|9|9|4|1|4|5|6|6|3|3\n5|3|3|3|6|6|4|5|4|1|9|9|3|4|9|1|1|9|4|3|9|9|1|4|5|4|6|6|3|3\n1|1|3|5|5|4|6|6|9|1|1|4|9|9|4|5|5|4|9|9|4|1|1|9|6|6|4|5|5|3\n1|1|5|3|4|5|6|6|1|9|4|1|9|1|4|4|4|4|1|9|1|4|9|1|6|6|5|4|3|5\n6|9|9|9|3|5|3|3|4|3|9|9|9|2|6|9|9|6|2|9|9|9|3|4|3|3|5|3|9|9\n9|6|9|9|5|3|3|3|3|4|9|1|9|9|9|6|6|9|9|9|1|9|4|3|3|3|3|5|9|9\n9|9|6|9|1|1|3|5|9|9|4|4|6|9|9|2|2|9|9|6|4|4|9|9|5|3|1|1|9|6\n9|9|9|6|1|1|5|3|9|1|5|4|9|6|9|9|9|9|6|9|4|5|1|9|3|5|1|1|6|9\n1|4|9|1|4|3|9|9|5|5|7|2|4|3|2|4|4|2|3|4|2|7|5|5|9|9|3|4|1|9\n4|1|1|9|3|4|9|1|4|5|2|7|3|4|4|2|2|4|4|3|7|2|5|4|1|9|4|3|9|1\n9|9|1|4|9|9|4|5|6|4|5|5|2|4|4|3|3|4|4|2|5|5|4|6|5|4|9

In [10]:
phase2_prompt = prompter.build_phase2_prompt(task, 
    phase1_output,
    similar_programs
)

phase2_output = vlm_client_phase2.query(
    phase2_prompt,
    system_prompt="You are an expert at generating code using the given DSL primitives to solve ARC puzzles. You are provided with a natural language description of the pattern to implement, as well as training examples and some similar programs you might find useful as reference. Generate a Python function `def solve(I):` that implements the described transformation using ONLY the provided DSL primitives. Ensure your code is syntactically correct and follows best practices."
)

if verbose:
    print(f"   ‚úì Phase 2 complete ({len(phase2_output)} chars)", flush=True)


NameError: name 'phase1_output' is not defined

In [19]:
generated_code = extract_code_from_response(phase2_output)
score, results = test_program(generated_code, task)
print(f"   Generated score: {score:.2f}", flush=True)


   Generated score: 0.13


In [None]:
python -m sglang.launch_server --model-path /home/flowers/work/hf/Qwen3-4B-Instruct-2507 --tp 1 --port 8000 --mem-fraction-static 0.9 --random-seed 42 --host 0.0.0.0 --log-level info --trust-remote-code --quantization fp8 --context-length 32000 


In [None]:
results = process_directory(
    data_dir='data_v1/eval_size_10',
    vlm_client_phase1=vlm_client_phase1,
    vlm_client_phase2=vlm_client_phase2,
    prompter=prompter,
    library=library,
    verbose=True,
    n_workers=None,  # Auto-detect CPUs (recommended)
    timeout=2        # 2 second timeout per program
)

# save_results(results, output_dir='results/images')#TODO change output dir
