In [None]:
import os
# Set every thread-count variable to "1" in one go. This sits ahead of all
# other imports in the notebook -- presumably so numeric libraries loaded
# later pick the limit up at import time (TODO confirm).
os.environ.update({
    "OMP_NUM_THREADS": "1",
    "MKL_NUM_THREADS": "1",
    "OPENBLAS_NUM_THREADS": "1",
    "NUMEXPR_NUM_THREADS": "1",
})

import json
import sys
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed
# Kaggle paths
# SOLVER_DIR: solver code location; put on sys.path and made the working directory below.
# EVAL_FILE:  challenges JSON that the task-loading cell opens.
# OUTPUT_DIR: directory the submission file is written into.
SOLVER_DIR = "/kaggle/input/arc-ttt/arc_ttt"
EVAL_FILE = "/kaggle/input/arc-prize-2025/arc-agi_evaluation_challenges.json"
OUTPUT_DIR = "/kaggle/working"

# Load API key from Kaggle secrets
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
# The key is exported through the process environment rather than passed as an
# argument; presumably the solver modules read OPENROUTER_API_KEY from
# os.environ -- TODO confirm.
os.environ["OPENROUTER_API_KEY"] = user_secrets.get_secret("OPENROUTER_API_KEY")

# Give SOLVER_DIR top priority on the import path and make it the current
# directory. NOTE(review): the `orchestrator` / `config` imports that follow
# presumably resolve through this entry -- keep this ahead of them.
sys.path.insert(0, SOLVER_DIR)
os.chdir(SOLVER_DIR)

from orchestrator import solve_with_judge
from config import reset_usage_stats, get_usage_stats

# Startup banner: the run mode followed by the three paths this run uses.
_rule = "=" * 60
print(
    _rule,
    "ARC TTT SOLVER - Evaluation Mode",
    _rule,
    f"Ensemble Dir: {SOLVER_DIR}",
    f"Eval File: {EVAL_FILE}",
    f"Output Dir: {OUTPUT_DIR}",
    sep="\n",
)

In [None]:
# Configuration
MAX_CONCURRENT = 60  # max_workers handed to the thread pool in the run cell

# Destination of the submission JSON, inside OUTPUT_DIR.
SUBMISSION_FILE = os.path.join(OUTPUT_DIR, "submission.json")
print("Submission file:", SUBMISSION_FILE)

# Grid written in place of any prediction that is missing.
FALLBACK = [[0, 0], [0, 0]]

# In-memory submission, keyed by task id; filled in as tasks complete.
submission_data = dict()

In [None]:
# Read every task out of the single challenges JSON (task id -> task dict).
with open(EVAL_FILE) as challenges_fh:
    all_tasks = json.load(challenges_fh)

# Task ids, in the order the JSON lists them.
TASKS = [*all_tasks]
print("Loaded", len(TASKS), "tasks from evaluation challenges")

In [None]:
def _is_grid(value) -> bool:
    """Return True when ``value`` is a non-empty list whose items are all lists (rows)."""
    return (
        isinstance(value, list)
        and len(value) > 0
        and all(isinstance(row, list) for row in value)
    )


def update_submission_file(task_id: str, result: dict, task_data: dict):
    """Record one task's predictions and persist the whole submission to disk.

    Args:
        task_id: Key under which the predictions are stored in ``submission_data``.
        result: Solver output. Only ``attempt_1`` and ``attempt_2`` are read
            (via ``.get``, so either may be absent).
            * Single-test task: a non-``None`` attempt is used as the grid as-is.
            * Multi-test task: the attempt is indexed per test, and the element
              is used only if it is itself a grid (a non-empty list of row
              lists). This rejects the case where the solver handed back one
              bare grid, whose elements are rows rather than grids.
            Anything else is replaced by ``FALLBACK``.
        task_data: Task definition; only the length of its ``'test'`` list is
            used (a missing key means zero tests).

    Returns:
        The number of tasks currently held in ``submission_data``.

    Raises:
        Whatever ``json.dump`` / file I/O raises. In that case the previously
        saved ``SUBMISSION_FILE`` is left untouched, although
        ``submission_data[task_id]`` has already been set in memory.
    """
    n_tests = len(task_data.get('test', []))
    attempts = {key: result.get(key) for key in ("attempt_1", "attempt_2")}

    def pick(attempt, index):
        # Single-test tasks carry the grid directly rather than a per-test list.
        if n_tests == 1:
            return attempt if attempt is not None else FALLBACK
        if isinstance(attempt, list) and index < len(attempt) and _is_grid(attempt[index]):
            return attempt[index]
        return FALLBACK

    submission_data[task_id] = [
        {key: pick(attempt, i) for key, attempt in attempts.items()}
        for i in range(n_tests)
    ]

    # Save incrementally. Write to a sibling temp file and swap it in, so a
    # failed or interrupted dump can never leave a truncated submission behind.
    tmp_path = f"{SUBMISSION_FILE}.tmp"
    try:
        with open(tmp_path, 'w') as f:
            json.dump(submission_data, f)
        os.replace(tmp_path, SUBMISSION_FILE)
    finally:
        # Only present if the dump or the swap failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    return len(submission_data)

## Run Tasks

In [None]:
import gc

# Fresh state for this run. reset_usage_stats() comes from the solver's config
# module -- presumably it clears the figures get_usage_stats() reports (TODO confirm).
reset_usage_stats()
# Per attempt slot: source label -> number of tasks reporting that source.
source_counts = {slot: {} for slot in ("attempt_1", "attempt_2")}
# Number of tasks whose future has finished.
completed = 0

def process_task(task_id: str) -> dict:
    """Solve one task and return whatever the solver hands back.

    The task is looked up in ``all_tasks`` by id. Evaluation tasks come
    without ground truth, so ``ground_truths=None`` is passed, together with
    ``verbose=False``.
    """
    return solve_with_judge(
        all_tasks[task_id],
        task_id,
        ground_truths=None,
        verbose=False,
    )

# Submit every task to the thread pool, then fold each result into the
# submission as its future finishes. All bookkeeping and file writes happen
# here in the submitting thread; workers only run process_task.
with ThreadPoolExecutor(max_workers=MAX_CONCURRENT) as executor:
    futures = {executor.submit(process_task, tid): tid for tid in TASKS}

    for future in as_completed(futures):
        task_id = futures[future]
        task_data = all_tasks[task_id]

        # Counted exactly once per finished future, whichever branch runs
        # below. Previously a failure while saving was counted twice.
        completed += 1

        try:
            result = future.result()
            result["task_id"] = task_id

            a1_src = result.get("attempt_1_source", "none")
            a2_src = result.get("attempt_2_source", "none")

            # Update submission
            n_submitted = update_submission_file(task_id, result, task_data)

        except Exception as e:
            import traceback
            print(f"[{completed}/{len(TASKS)}] {task_id}: ERROR - {e}")
            traceback.print_exc()

            # Still add fallback to submission: an empty result makes
            # update_submission_file fill every test with FALLBACK and save.
            update_submission_file(task_id, {}, task_data)
            continue

        # Track sources only once the predictions are actually saved, so the
        # tallies never include a task that ended up as fallback.
        source_counts["attempt_1"][a1_src] = source_counts["attempt_1"].get(a1_src, 0) + 1
        source_counts["attempt_2"][a2_src] = source_counts["attempt_2"].get(a2_src, 0) + 1

        print(f"[{completed}/{len(TASKS)}] {task_id}: ✓ | a1={a1_src} a2={a2_src} | {n_submitted} tasks saved")

        # Memory cleanup: `futures` keeps every result alive until the pool
        # exits, so drop the 'all_outputs' payload as soon as it is not needed.
        result.pop('all_outputs', None)
        gc.collect()

# Final report: totals, then for each attempt slot its sources ordered by
# descending task count, then the solver's usage statistics.
_rule = "=" * 60
print(f"\n{_rule}")
print("COMPLETE")
print(_rule)
print(f"  Tasks processed: {completed}/{len(TASKS)}")
print(f"  Submission file: {SUBMISSION_FILE}")
for heading, slot in (("Attempt 1", "attempt_1"), ("Attempt 2", "attempt_2")):
    print(f"\n  {heading} Sources:")
    ranked = sorted(source_counts[slot].items(), key=lambda pair: -pair[1])
    for src, cnt in ranked:
        print(f"    {src}: {cnt}")
print(f"\n{_rule}")
print(get_usage_stats())

In [None]:
# Verify submission format
with open(SUBMISSION_FILE) as f:
    sub = json.load(f)

print(f"Submission contains {len(sub)} tasks")

# Report problems instead of raising: the file is already on disk, and a
# failing final cell would only obscure what is wrong with it.
missing = [tid for tid in TASKS if tid not in sub]
if missing:
    print(f"WARNING: {len(missing)} tasks missing from submission, e.g. {missing[:5]}")

# Each entry must be a list with one {"attempt_1", "attempt_2"} dict per test input.
malformed = [
    tid for tid, preds in sub.items()
    if not (
        isinstance(preds, list)
        and all(isinstance(p, dict) and {"attempt_1", "attempt_2"} <= p.keys() for p in preds)
        and (tid not in all_tasks or len(preds) == len(all_tasks[tid].get('test', [])))
    )
]
if malformed:
    print(f"WARNING: {len(malformed)} malformed entries, e.g. {malformed[:5]}")

# next(iter(...)) instead of list(...)[0] so an empty submission does not raise.
sample_id = next(iter(sub), None)
if sample_id is None:
    print("\nSubmission is empty - no sample entry to show")
else:
    print(f"\nSample entry:")
    print(f"  {sample_id}: {sub[sample_id]}")