In [1]:
import os
from pathlib import Path

# ============================================================================
# Model Configuration
# ============================================================================

# # Model for initial inference (path for Kaggle, slug for local/RunPod)
INITIAL_MODEL_HF = "Trelis/Soar-qwen-14b-FP8-Dynamic"  # For local/RunPod
INITIAL_MODEL_KAGGLE = "arc-1-fake-ttt-blended-c802-dataset"  # Kaggle dataset name

# # Model for Refinement
REFINEMENT_MODEL_HF = "Trelis/Soar-qwen-14b-FP8-Dynamic"  # For local/RunPod
REFINEMENT_MODEL_KAGGLE = "arc-1-fake-ttt-blended-c802-dataset"  # Kaggle dataset name

# ============================================================================
# Inference Configuration
# ============================================================================

# Test Attempts (used for first and second/refinement)
TEST_ATTEMPTS = 4
KAGGLE_TIMEOUT = 3600*11 # Allow only 11 hours for inference, one hour buffer. NOT USED AT PRESENT!

# First inference settings
FIRST_ATTEMPTS = 128     # Number of attempts for first inference
FIRST_WORKERS = 64       # Number of workers for first inference

# Second inference settings
SECOND_ATTEMPTS = 64     # Number of attempts for second inference
SECOND_WORKERS = 64      # Number of workers for second inference

# ============================================================================
# Other Configuration
# ============================================================================

DATASET = "arc-prize-2025"

# ---- Config flags (single source of truth) ----
START_SERVER = True
TEST_INFERENCE = True
SCORE = True                   # default; overridden below, depending on flags

# Refinement mode?
ENABLE_REFINEMENT=True

# Env-backed flags
IS_KAGGLE = bool(os.getenv("KAGGLE_KERNEL_RUN_TYPE"))
IS_RERUN  = IS_KAGGLE and os.getenv("KAGGLE_IS_COMPETITION_RERUN", "").lower() == "true"

# String env flag for external tools
os.environ["SUBMIT"] = "true"

# ---- Paths ----
if IS_KAGGLE:
    ARC_DATA_ROOT   = Path("/kaggle/input")
    MODEL_SAVE_DIR = Path("/kaggle/working")
    SUBMIT_DIR      = Path("/kaggle/working")
    ARC_PROGRAMS_PARQUET = SUBMIT_DIR

    print("üîç Searching for models in Kaggle environment...")

    # Auto-find initial model path in Kaggle's dataset structure
    model_dataset_path = ARC_DATA_ROOT / INITIAL_MODEL_KAGGLE
    print(f"   Looking for initial model dataset: {model_dataset_path}")

    if model_dataset_path.exists() and model_dataset_path.is_dir():
        # Kaggle datasets have version folders, find the first subdirectory
        subdirs = [d for d in model_dataset_path.iterdir() if d.is_dir()]
        if subdirs:
            MODEL_PATH = subdirs[0]  # Use the first (usually only) version folder
            print(f"   ‚úÖ Found initial model at: {MODEL_PATH}")
            # List what's inside to confirm it's right
            model_contents = list(MODEL_PATH.iterdir())[:5]  # Show first 5 items
            print(f"      Contents: {[f.name for f in model_contents]}")
        else:
            # Fallback if no subdirectory found
            MODEL_PATH = model_dataset_path
            print(f"   ‚ö†Ô∏è No version folder found for initial model, using: {MODEL_PATH}")
    else:
        MODEL_PATH = model_dataset_path
        print(f"   ‚ùå Initial model dataset not found at: {MODEL_PATH}")
        print(f"      Available datasets: {[d.name for d in ARC_DATA_ROOT.iterdir() if d.is_dir()][:10]}")

    # Auto-find refinement base model path
    refinement_dataset_path = ARC_DATA_ROOT / REFINEMENT_MODEL_KAGGLE
    print(f"   Looking for refinement base dataset: {refinement_dataset_path}")

    if refinement_dataset_path.exists() and refinement_dataset_path.is_dir():
        subdirs = [d for d in refinement_dataset_path.iterdir() if d.is_dir()]
        if subdirs:
            REFINEMENT_MODEL_PATH = subdirs[0]
            print(f"   ‚úÖ Found refinement base at: {REFINEMENT_MODEL_PATH}")
            # List what's inside to confirm it's right
            finetune_contents = list(REFINEMENT_MODEL_PATH.iterdir())[:5]  # Show first 5 items
            print(f"      Contents: {[f.name for f in finetune_contents]}")
        else:
            REFINEMENT_MODEL_PATH = refinement_dataset_path
            print(f"   ‚ö†Ô∏è No version folder found for refinement base, using: {REFINEMENT_MODEL_PATH}")
    else:
        REFINEMENT_MODEL_PATH = refinement_dataset_path
        print(f"   ‚ùå Refinement base dataset not found at: {REFINEMENT_MODEL_PATH}")

    print(f"\nüì¶ Final model paths:")
    print(f"   Initial model: {MODEL_PATH}")
    print(f"   Refinement base: {REFINEMENT_MODEL_PATH}")

else:
    ARC_DATA_ROOT   = Path("/workspace/arc-agi-2025/data")
    MODEL_SAVE_DIR = Path("/workspace/arc-agi-2025/llm_python/fine-tuning")
    SUBMIT_DIR      = Path("/workspace/arc-agi-2025/llm_python/submissions")
    ARC_PROGRAMS_PARQUET = Path("/workspace/arc-agi-2025/llm_python/datasets/inference")

    # Use local/RunPod model paths
    MODEL_PATH = INITIAL_MODEL_HF
    REFINEMENT_MODEL_PATH = REFINEMENT_MODEL_HF

    print(f"üì¶ Local/RunPod model paths:")
    print(f"   Initial model: {MODEL_PATH}")
    print(f"   Refinement base: {REFINEMENT_MODEL_PATH}")

# Set up paths - parquet files are saved by task runner in different locations
if IS_KAGGLE:
    # On Kaggle, parquet files are saved directly in /kaggle/working by task runner
    inference_dir = "/kaggle/working"
else:
    # On RunPod/local, parquet files are saved in llm_python/datasets/inference
    inference_dir = "llm_python/datasets/inference"

# Export envs for downstream processes
os.environ["ARC_DATA_ROOT"]   = str(ARC_DATA_ROOT)
os.environ["MODEL_SAVE_DIR"] = str(MODEL_SAVE_DIR)
os.environ["SUBMIT_DIR"]      = str(SUBMIT_DIR)
os.environ["ARC_PROGRAMS_PARQUET"] = str(ARC_PROGRAMS_PARQUET)
os.environ["MODEL_PATH"] = str(MODEL_PATH)

# Export config flags for subprocess use
os.environ["IS_KAGGLE"] = str(IS_KAGGLE).lower()
os.environ["IS_RERUN"] = str(IS_RERUN).lower()
os.environ["DATASET"] = DATASET

# Ensure directories exist
for p in (MODEL_SAVE_DIR, SUBMIT_DIR):
    p.mkdir(parents=True, exist_ok=True)

if IS_RERUN:
    # Kaggle competition rerun
    timeout_seconds = KAGGLE_TIMEOUT
    print(f"üèÜ Competition rerun detected ‚Äî setting FULL {timeout_seconds}s timeout for all inference")
    SCORE = False
    os.environ["SUBMIT"] = "true"

elif not IS_KAGGLE:
    # Runpod / local long run
    timeout_seconds = None
    print(f"üñ•Ô∏è Runpod/local long run ‚Äî setting no timeout for inference")
    if os.getenv("SUBMIT", "false").lower() == "true":
        SCORE = True  # if we're generating a submission, do scoring

else:
    # Kaggle dev/testing
    timeout_seconds = int(KAGGLE_TIMEOUT * TEST_ATTEMPTS / (2 * (FIRST_ATTEMPTS + SECOND_ATTEMPTS)))   # 1 minute
    print(f"üîß Development run ‚Äî setting short {timeout_seconds}s timeout for testing")
    # Safer default: don't auto-submit in dev
    os.environ["SUBMIT"] = "true"
    FIRST_ATTEMPTS = TEST_ATTEMPTS
    SECOND_ATTEMPTS = TEST_ATTEMPTS

# ENABLE_REFINEMENT Mode configuration
if ENABLE_REFINEMENT:
    print("üß™ ENABLE_REFINEMENT ENABLED")
    print("   ‚Üí Will run: First inference ‚Üí Second inference")
    print(f"   ‚Üí First inference: {FIRST_ATTEMPTS} attempts, {FIRST_WORKERS} workers")
    print(f"   ‚Üí Second inference: {SECOND_ATTEMPTS} attempts, {SECOND_WORKERS} workers")
else:
    print("üîÑ Standard mode (ENABLE_REFINEMENT disabled)")
    print(f"   ‚Üí Will run: First inference only ({FIRST_ATTEMPTS} attempts, {FIRST_WORKERS} workers)")

# Optional: quick summary (helps avoid accidental submits)
print(
    "Mode summary ‚Üí "
    f"IS_KAGGLE={IS_KAGGLE} | IS_RERUN={IS_RERUN} | ENABLE_REFINEMENT={ENABLE_REFINEMENT} |\n"
    f"TEST_INFERENCE={TEST_INFERENCE} | SCORE={SCORE} | SUBMIT={os.environ['SUBMIT']} | INITIAL_MODEL={MODEL_PATH} | REFINEMENT_MODEL={REFINEMENT_MODEL_PATH}"
)

üîç Searching for models in Kaggle environment...
   Looking for initial model dataset: /kaggle/input/arc-1-fake-ttt-blended-c802-dataset
   ‚úÖ Found initial model at: /kaggle/input/arc-1-fake-ttt-blended-c802-dataset/Soar-qwen-14b-FP8-Dynamic
      Contents: ['model.safetensors.index.json', 'model-00003-of-00004.safetensors', 'config.json', 'merges.txt', '.cache']
   Looking for refinement base dataset: /kaggle/input/arc-1-fake-ttt-blended-c802-dataset
   ‚úÖ Found refinement base at: /kaggle/input/arc-1-fake-ttt-blended-c802-dataset/Soar-qwen-14b-FP8-Dynamic
      Contents: ['model.safetensors.index.json', 'model-00003-of-00004.safetensors', 'config.json', 'merges.txt', '.cache']

üì¶ Final model paths:
   Initial model: /kaggle/input/arc-1-fake-ttt-blended-c802-dataset/Soar-qwen-14b-FP8-Dynamic
   Refinement base: /kaggle/input/arc-1-fake-ttt-blended-c802-dataset/Soar-qwen-14b-FP8-Dynamic
üîß Development run ‚Äî setting short 412s timeout for testing
üß™ ENABLE_REFINEMENT ENABL

In [2]:
import sys
import torch
import numpy as np

print(f"Python version: {sys.version}")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA version (PyTorch): {torch.version.cuda}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"NumPy version: {np.__version__}")
if torch.cuda.is_available():
   print(f"GPU count: {torch.cuda.device_count()}")
   print(f"GPU name: {torch.cuda.get_device_name(0)}")

Python version: 3.11.13 (main, Jun  4 2025, 08:57:29) [GCC 11.4.0]
PyTorch version: 2.8.0+cu128
CUDA version (PyTorch): 12.8
CUDA available: True
NumPy version: 1.26.4
GPU count: 4
GPU name: NVIDIA L4


In [3]:
import sglang
print("SGLang version:", sglang.__version__)

try:
    import flashinfer
    print("FlashInfer version:", flashinfer.__version__)
except ImportError:
    print("FlashInfer not installed")

SGLang version: 0.5.2


W0914 19:17:45.830000 19 arc_agi_2025_aux_rewrite_refine/torch/utils/cpp_extension.py:2425] TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. 
W0914 19:17:45.830000 19 arc_agi_2025_aux_rewrite_refine/torch/utils/cpp_extension.py:2425] If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'] to specific architectures.


FlashInfer version: 0.3.1


In [4]:
if IS_KAGGLE:
    import os, shutil, subprocess, stat
    
    # 1) Where to place binaries + cache
    WRK_BIN = "/kaggle/working/bin"
    TRITON_CACHE = "/kaggle/working/.triton"
    os.makedirs(WRK_BIN, exist_ok=True)
    os.makedirs(TRITON_CACHE, exist_ok=True)
    
    # 2) Preferred source for ptxas/cuobjdump/nvdisasm
    SYSTEM_CUDA_BIN = "/usr/local/cuda/bin"
    FALLBACK_VENDORED = "/kaggle/usr/lib/sglang_utility/triton/backends/nvidia/bin"  # if you have it
    
    def copy_tool(name: str):
        for src_dir in (SYSTEM_CUDA_BIN, FALLBACK_VENDORED):
            src = os.path.join(src_dir, name)
            if os.path.exists(src):
                dst = os.path.join(WRK_BIN, name)
                shutil.copy2(src, dst)
                # ensure executable bit
                os.chmod(dst, os.stat(dst).st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
                return dst
        raise FileNotFoundError(f"Could not find {name} in {SYSTEM_CUDA_BIN} or {FALLBACK_VENDORED}")
    
    ptxas_path = copy_tool("ptxas")
    try:
        cuobjdump_path = copy_tool("cuobjdump")
    except FileNotFoundError:
        cuobjdump_path = None  # optional
    try:
        nvdisasm_path = copy_tool("nvdisasm")
    except FileNotFoundError:
        nvdisasm_path = None  # optional
    
    # 3) Environment for Triton/JIT
    os.environ["TRITON_PTXAS_PATH"] = ptxas_path
    os.environ["PATH"] = f"{WRK_BIN}:{os.environ.get('PATH','')}"
    os.environ["TRITON_CACHE_DIR"] = TRITON_CACHE
    os.environ["CUDA_HOME"] = "/usr/local/cuda"
    os.environ["CUDA_PATH"] = "/usr/local/cuda"
    
    # Helpful fallbacks if you still hit capture issues:
    # os.environ["SGLANG_DISABLE_CUDA_GRAPH"] = "1"      # skip CUDA graphs (degrades perf but avoids capture)
    # os.environ["TRITON_CODEGEN_FATBIN"] = "0"          # can reduce Triton fatbin steps on some setups
    
    # 4) Smoke test: ensure ptxas runs from the new location
    print("ptxas ->", subprocess.check_output([ptxas_path, "--version"]).decode().strip())
    
    # Now it's safe to import heavy libs that trigger Triton
    import torch

ptxas -> ptxas: NVIDIA (R) Ptx optimizing assembler
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:14:54_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [5]:
if START_SERVER:
  # Background server launcher for Kaggle with SGLang
  import os, sys, time, subprocess, json, socket, requests

  # ---------- 1) Check for existing server and cleanup ----------
  PORT = 8080
  HEALTH_URL = f"http://127.0.0.1:{PORT}/v1/models"

  # Check if server already running
  try:
      r = requests.get(HEALTH_URL, timeout=3)
      if r.status_code == 200:
          print(f"Server already running on port {PORT}. Stopping it first...")
          # Kill existing sglang processes
          subprocess.run(["pkill", "-f", "sglang.launch_server"], capture_output=True)
          time.sleep(3)  # Wait for cleanup
  except:
      pass  # No server running

  # Clear CUDA memory before starting
  try:
      import torch
      if torch.cuda.is_available():
          torch.cuda.empty_cache()
          torch.cuda.synchronize()
          print("CUDA memory cleared.")
      num_gpus = torch.cuda.device_count()
  except Exception:
      num_gpus = 0
      
  model_path_to_use = str(MODEL_PATH)
  print(f"üîß Using model from {model_path_to_use}")

  LOG = f"{SUBMIT_DIR}/sglang_server.log"
  print(f"LOG file path: {LOG}")

  SERVER_CMD = [
      sys.executable, "-m", "sglang.launch_server",
      "--host", "0.0.0.0",
      "--port", str(PORT),
      "--model-path", model_path_to_use,
      "--dp", str(max(1, min(num_gpus, 4))),
      "--kv-cache-dtype", "fp8_e4m3",
      "--enable-metrics",
  ]

  # ---------- 2) Launch in background ----------
  log_f = open(LOG, "w")
  env = os.environ.copy()
  proc = subprocess.Popen(SERVER_CMD, stdout=log_f, stderr=subprocess.STDOUT, env=env, cwd=SUBMIT_DIR)
  print(f"Started sglang server PID={proc.pid} | logging to {LOG}")
  print("Command:", " ".join(SERVER_CMD))

  # ---------- 3) Wait for readiness ----------
  def wait_ready(url, timeout_s=180):
      t0 = time.time()
      while time.time() - t0 < timeout_s:
          try:
              r = requests.get(url, timeout=3)
              if r.status_code == 200:
                  return True
          except Exception:
              pass
          time.sleep(2)
      return False

  ready = wait_ready(HEALTH_URL)
  log_f.flush()

  if ready:
      print(f"sglang is READY on port {PORT}.")
      print(f"- Tail logs: !tail -n 50 {LOG}")
      print(f"- List models: !curl -s http://127.0.0.1:{PORT}/v1/models | jq .")
  else:
      print(f"sglang not ready after timeout. Showing last 60 log lines:")
      log_f.close()
      !tail -n 60 {LOG}

  # ---------- 4) Cleanup functions ----------
  def stop_server(p=proc):
      try:
          p.terminate()
          p.wait(timeout=10)
      except Exception:
          p.kill()
      print("Server stopped.")

  def full_cleanup(p=proc):
      # Stop server
      try:
          p.terminate()
          p.wait(timeout=10)
      except Exception:
          p.kill()

      # Also kill any lingering sglang processes
      subprocess.run(["pkill", "-f", "sglang.launch_server"], capture_output=True)

      # Clear CUDA memory
      try:
          import torch
          if torch.cuda.is_available():
              torch.cuda.empty_cache()
              torch.cuda.synchronize()
      except:
          pass

      print("Server stopped and CUDA memory cleared.")

  print("Call stop_server() or full_cleanup() to shut it down gracefully.")

CUDA memory cleared.
üîß Using model from /kaggle/input/arc-1-fake-ttt-blended-c802-dataset/Soar-qwen-14b-FP8-Dynamic
LOG file path: /kaggle/working/sglang_server.log
Started sglang server PID=42 | logging to /kaggle/working/sglang_server.log
Command: /usr/bin/python3 -m sglang.launch_server --host 0.0.0.0 --port 8080 --model-path /kaggle/input/arc-1-fake-ttt-blended-c802-dataset/Soar-qwen-14b-FP8-Dynamic --dp 4 --kv-cache-dtype fp8_e4m3 --enable-metrics
sglang not ready after timeout. Showing last 60 log lines:
2025-09-14 19:18:20.041544: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757877500.300974      42 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757877500.379904      42 cuda_blas.cc:1418] Unable to register cuBLAS factory: At

In [6]:
if START_SERVER:
    import requests
    import time
    
    def check_models():
        url = "http://127.0.0.1:8080/v1/models"
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            result = response.json()
    
            print("‚úÖ Server is responding!")
            print("Available models:")
            for model in result['data']:
                print(f"  - {model['id']}")
    
            return result['data'][0]['id'] if result['data'] else None
    
        except requests.exceptions.ConnectionError:
            print("‚ùå Connection failed - server may not be ready yet")
            return None
        except Exception as e:
            print(f"‚ùå Error: {e}")
            return None
    
    # Poll every 30 seconds until we get a model
    model_name = None
    while not model_name:
        model_name = check_models()
        if not model_name:
            print("‚è≥ Waiting 30 seconds before retrying...")
            time.sleep(30)
    
    print(f"\n‚úÖ Found model: {model_name}")

‚ùå Connection failed - server may not be ready yet
‚è≥ Waiting 30 seconds before retrying...
‚ùå Connection failed - server may not be ready yet
‚è≥ Waiting 30 seconds before retrying...
‚ùå Connection failed - server may not be ready yet
‚è≥ Waiting 30 seconds before retrying...
‚ùå Connection failed - server may not be ready yet
‚è≥ Waiting 30 seconds before retrying...
‚ùå Connection failed - server may not be ready yet
‚è≥ Waiting 30 seconds before retrying...
‚ùå Connection failed - server may not be ready yet
‚è≥ Waiting 30 seconds before retrying...
‚ùå Connection failed - server may not be ready yet
‚è≥ Waiting 30 seconds before retrying...
‚ùå Connection failed - server may not be ready yet
‚è≥ Waiting 30 seconds before retrying...
‚ùå Connection failed - server may not be ready yet
‚è≥ Waiting 30 seconds before retrying...
‚ùå Connection failed - server may not be ready yet
‚è≥ Waiting 30 seconds before retrying...
‚ùå Connection failed - server may not be ready yet
‚è≥ Wait

In [7]:
if TEST_INFERENCE:
    import time
    import requests
    
    url = "http://127.0.0.1:8080/v1/chat/completions"
    
    headers = {
        "Content-Type": "application/json"
    }
    
    messages = [
        {"role" : "system", "content" : "You are an expert at solving abstract reasoning puzzles. Write clean, efficient Python code."},
        {"role" : "user", "content" : "You are solving an ARC (Abstraction and Reasoning Corpus) task. \nI will show you training examples with input and output grids, plus a test input grid. Your task is to:\n\n1. **Analyze the training examples** to discover patterns that map input grids to output grids\n2. **Write a Python program** that implements your best understanding of the transformation  \n3. **DO NOT predict or generate the test output** - your job is only to write the transformation program\n4. **Attempt a solution** - even if the pattern isn't completely clear, provide your best hypothesis\n5. **Do not repeat the same transformation** - if you have already tried a transformation, do not repeat it.\n\n**IMPORTANT: Your transformation must always produce a 10\u00d710 output grid.**\n\nThe test input is shown for context so you understand what type of grid your program will eventually process. Focus on learning patterns from training examples and writing code that captures your understanding.\n\nTraining Examples:\n\nExample 1:\nInput:\n5 0 0 5 0 0 0 5 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\nOutput:\n5 0 0 5 0 0 0 5 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n2 0 0 2 0 0 0 2 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n2 0 0 2 0 0 0 2 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n\nExample 2:\nInput:\n0 5 0 5 5 0 0 5 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\nOutput:\n0 5 0 5 5 0 0 5 0 0\n0 0 0 0 0 0 0 0 0 0\n0 2 0 2 2 0 0 2 0 5\n0 0 0 0 0 0 0 0 0 0\n0 2 0 2 2 0 0 2 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 2 0 2 2 0 0 2 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n\nExample 3:\nInput:\n0 0 5 5 0 5 0 5 5 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\nOutput:\n0 0 5 5 0 5 0 5 5 0\n0 0 0 0 0 0 0 0 0 0\n0 0 2 2 0 2 0 2 2 5\n0 0 2 2 0 2 0 2 2 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 2 2 0 2 0 2 2 5\n0 0 0 0 0 0 0 0 0 0\n0 0 2 2 0 2 0 2 2 5\n0 0 0 0 0 0 0 0 0 0\n\nTest Input:\n5 0 5 5 0 0 5 0 5 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n\nAnalyze the patterns in the training examples and write a Python function that performs this transformation.\n\n**Approach Guidelines:**\n- Look for patterns in shapes, colors, positions, sizes, rotations, reflections, etc.\n- Even if you can't solve all training examples perfectly, implement what patterns you do observe\n- A partial solution that captures some aspects is better than returning the input unchanged\n- If the pattern is unclear, make your best educated guess based on what you can see\n\nRequirements:\n- The function takes a 2D list (grid) where grid[row][col] gives the value at that position\n- Values are integers from 0-9\n- Return a new grid (2D list) with the transformation applied\n- You can use numpy if needed - just add 'import numpy as np' at the start of your function\n- Aim to handle the training examples as well as possible, even if not perfectly\n- Your function should attempt some meaningful transformation based on the patterns you observe\n\nYou MUST end your response with the following exact format:\n\nFinal answer:\n```python\ndef transform(grid):\n    # Your transformation logic here (implement your best understanding)\n    return transformed_grid\n```\n"}
    ]
    
    payload = {
        "model": model_name,  # from your polling loop
        "messages": messages,
        # "max_tokens": 1000
        "max_tokens": 10
    }
    
    start_time = time.time()
    response = requests.post(url, headers=headers, json=payload, timeout=600)
    end_time = time.time()
    
    response.raise_for_status()
    result = response.json()
    output_text = result["choices"][0]["message"]["content"]
    
    # Estimate token count (4 chars/token assumption)
    estimated_tokens = len(output_text) / 4
    elapsed_time = end_time - start_time
    tokens_per_second = estimated_tokens / elapsed_time
    
    print("‚úÖ Response received:")
    print(output_text)
    print(f"\n‚è± Elapsed time: {elapsed_time:.2f} seconds")
    print(f"üî¢ Estimated tokens: {estimated_tokens:.1f}")
    print(f"‚ö° Output tokens/sec: {tokens_per_second:.2f}")

‚úÖ Response received:
```python
def transform(grid):
    transformed_grid

‚è± Elapsed time: 10.09 seconds
üî¢ Estimated tokens: 12.8
‚ö° Output tokens/sec: 1.26


In [8]:
if not IS_KAGGLE:
    %cd /workspace/arc-agi-2025

# Use FIRST_ATTEMPTS and FIRST_WORKERS for initial inference
MAX_ATTEMPTS = FIRST_ATTEMPTS
MAX_WORKERS  = FIRST_WORKERS

# SUBSET = "test" # defaulting to test to ensure there are no loading issues.

# can use this instead if testing evaluation during a pre-run
SUBSET = "test" if IS_RERUN else "evaluation"

# Common env for your runner
os.environ["OPENAI_API_KEY"] = "EMPTY"

print(f"First Inference ‚Üí {'competition' if IS_RERUN else 'dev'} | attempts={MAX_ATTEMPTS} | workers={MAX_WORKERS} | subset={SUBSET}")

# Build the command
cmd_args = [
    "uv", "run", "python", "-u", "-m", "llm_python.run_arc_tasks_soar",
    "--dataset", DATASET,
    "--subset", SUBSET,
    "--max_workers", str(MAX_WORKERS),
    "--max_attempts", str(MAX_ATTEMPTS),
    "--model", model_name,
    "--base-url", "http://127.0.0.1:8080/v1",
    "--unsafe-executor",
    "--max-tokens", "2000",
    "--qwen-no-think"
]


# Add parquet output directory if set
if os.getenv("ARC_PROGRAMS_PARQUET"):
  cmd_args.extend(["--parquet-output-dir", os.getenv("ARC_PROGRAMS_PARQUET")])

print(f"Running command: {' '.join(cmd_args)}")

# Handle output redirection properly
if IS_RERUN or not IS_KAGGLE:
    # For quiet mode, redirect to file using subprocess
    import subprocess
    log_file_path = f"{SUBMIT_DIR}/run.log"
    print(f"üìù Logging output to: {log_file_path}")
    
    with open(log_file_path, "w") as log_file:
        process = subprocess.Popen(
            cmd_args,
            stdout=log_file,
            stderr=subprocess.STDOUT,
            text=True,
            cwd=os.getcwd()
        )
        
        # Wait for completion
        print("‚è≥ Running tasks (output being written to log file)...")
        return_code = process.wait()
        
    if return_code == 0:
        print(f"‚úÖ Task runner completed successfully. Check {log_file_path} for details.")
    else:
        print(f"‚ùå Task runner failed with return code {return_code}")
        print(f"üìù Check {log_file_path} for error details")
        # Show last few lines of log
        !tail -n 20 {log_file_path}
else:
    # For interactive mode, show output directly
    cmd = " ".join(cmd_args)
    print(f"Running: {cmd}\n")
    !{cmd}

First Inference ‚Üí dev | attempts=4 | workers=64 | subset=evaluation
Running command: uv run python -u -m llm_python.run_arc_tasks_soar --dataset arc-prize-2025 --subset evaluation --max_workers 64 --max_attempts 4 --model /kaggle/input/arc-1-fake-ttt-blended-c802-dataset/Soar-qwen-14b-FP8-Dynamic --base-url http://127.0.0.1:8080/v1 --unsafe-executor --max-tokens 2000 --qwen-no-think --parquet-output-dir /kaggle/working
Running: uv run python -u -m llm_python.run_arc_tasks_soar --dataset arc-prize-2025 --subset evaluation --max_workers 64 --max_attempts 4 --model /kaggle/input/arc-1-fake-ttt-blended-c802-dataset/Soar-qwen-14b-FP8-Dynamic --base-url http://127.0.0.1:8080/v1 --unsafe-executor --max-tokens 2000 --qwen-no-think --parquet-output-dir /kaggle/working

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
‚è∞ API timeout: 600s per request (enforced by OpenAI client)
üóÑÔ∏è Sampled programs will be logged to /kaggle/working/20250914_192

In [9]:
print(f"MODEL_PATH type: {type(MODEL_PATH)}, value: {MODEL_PATH}")

MODEL_PATH type: <class 'pathlib.PosixPath'>, value: /kaggle/input/arc-1-fake-ttt-blended-c802-dataset/Soar-qwen-14b-FP8-Dynamic


In [10]:
if ENABLE_REFINEMENT:
  # Restart the server with the (potentially) new model
  if START_SERVER:
      print("üîÑ Restarting inference server with updated model...")

      # Gracefully stop existing server if it exists
      if 'proc' in locals() and proc.poll() is None:  # Check if process is still running
          print("üõë Gracefully stopping existing server...")
          try:
              proc.terminate()  # Send SIGTERM first
              proc.wait(timeout=30)  # Wait up to 30 seconds for graceful shutdown
              print("‚úÖ Server stopped gracefully")
          except subprocess.TimeoutExpired:
              print("‚ö†Ô∏è  Server didn't stop gracefully, force killing...")
              proc.kill()
              proc.wait()
          except Exception as e:
              print(f"‚ö†Ô∏è  Error stopping server: {e}")

      # Wait a bit longer after graceful shutdown
      time.sleep(5)

      # Clear CUDA memory
      try:
          import torch
          if torch.cuda.is_available():
              torch.cuda.empty_cache()
              torch.cuda.synchronize()
              print("‚úÖ CUDA memory cleared")
      except Exception:
          pass

      # Get GPU count
      try:
          import torch
          num_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 1
      except:
          num_gpus = 1

      # Choose which model to use: refinement if available, otherwise original
      model_to_use = REFINEMENT_MODEL_PATH if REFINEMENT_MODEL_PATH else MODEL_PATH
      print(f"üéØ Using model: {model_to_use}")
      print(f"   ‚Üí {'Refinement' if REFINEMENT_MODEL_PATH else 'Original'} model")

      # Restart server with appropriate model
      PORT = 8080
      LOG = f"{SUBMIT_DIR}/sglang_server.log"
      SERVER_CMD = [
          sys.executable, "-m", "sglang.launch_server",
          "--host", "0.0.0.0",
          "--port", str(PORT),
          "--model-path", str(model_to_use),
          "--dp", str(max(1, min(num_gpus, 4))),
          "--kv-cache-dtype", "fp8_e4m3",
          "--enable-metrics",
      ]

      print(f"üöÄ Starting server: {' '.join(SERVER_CMD)}")

      log_f = open(LOG, "a")
      proc = subprocess.Popen(SERVER_CMD, stdout=log_f, stderr=subprocess.STDOUT,
                             env=os.environ.copy(), cwd=SUBMIT_DIR)

      print(f"‚úÖ Server started with PID={proc.pid}")

      # Wait for readiness with better error handling
      def wait_ready(url, timeout_s=600):
          t0 = time.time()
          while time.time() - t0 < timeout_s:
              try:
                  r = requests.get(url, timeout=5)
                  if r.status_code == 200:
                      return True
              except Exception:
                  pass
              time.sleep(3)  # Check less frequently
          return False

      HEALTH_URL = f"http://127.0.0.1:{PORT}/v1/models"
      if wait_ready(HEALTH_URL):
          print("‚úÖ Server ready!")

          # Update model_name
          try:
              response = requests.get(HEALTH_URL)
              if response.status_code == 200:
                  models = response.json()['data']
                  if models:
                      model_name = models[0]['id']
                      print(f"üéØ Model: {model_name}")
          except Exception as e:
              print(f"‚ö†Ô∏è  Could not get model name: {e}")
      else:
          print("‚ùå Server failed to start properly")

üîÑ Restarting inference server with updated model...
üõë Gracefully stopping existing server...
‚úÖ Server stopped gracefully
‚úÖ CUDA memory cleared
üéØ Using model: /kaggle/input/arc-1-fake-ttt-blended-c802-dataset/Soar-qwen-14b-FP8-Dynamic
   ‚Üí Refinement model
üöÄ Starting server: /usr/bin/python3 -m sglang.launch_server --host 0.0.0.0 --port 8080 --model-path /kaggle/input/arc-1-fake-ttt-blended-c802-dataset/Soar-qwen-14b-FP8-Dynamic --dp 4 --kv-cache-dtype fp8_e4m3 --enable-metrics
‚úÖ Server started with PID=7341
‚úÖ Server ready!
üéØ Model: /kaggle/input/arc-1-fake-ttt-blended-c802-dataset/Soar-qwen-14b-FP8-Dynamic


In [11]:
# Second Inference Run - ENABLE_REFINEMENT
# Only runs when ENABLE_REFINEMENT=true

import glob

if ENABLE_REFINEMENT:
    print("üîÑ Running SECOND inference (ENABLE_REFINEMENT mode)")
    
    if not IS_KAGGLE:
        %cd /workspace/arc-agi-2025

    # Use SECOND_ATTEMPTS and SECOND_WORKERS for second inference
    MAX_ATTEMPTS = SECOND_ATTEMPTS
    MAX_WORKERS  = SECOND_WORKERS

    SUBSET = "test" if IS_RERUN else "evaluation"

    os.environ["OPENAI_API_KEY"] = "EMPTY"

    print(f"ENABLE_REFINEMENT Second Run ‚Üí {'competition' if IS_RERUN else 'dev'} | attempts={MAX_ATTEMPTS} | workers={MAX_WORKERS} | subset={SUBSET}")

    # üîë Find the latest parquet file in inference_dir
    parquet_files = glob.glob(os.path.join(inference_dir, "*.parquet"))
    if not parquet_files:
        raise FileNotFoundError(f"No parquet files found in {inference_dir}")
    latest_parquet = max(parquet_files, key=os.path.getctime)
    print(f"üìÇ Using latest parquet file for refinement: {latest_parquet}")

    # Build the command
    cmd_args = [
        "uv", "run", "python", "-u", "-m", "llm_python.run_arc_tasks_soar",
        "--dataset", DATASET,
        "--subset", SUBSET,
        "--max_workers", str(MAX_WORKERS),
        "--max_attempts", str(MAX_ATTEMPTS),
        "--model", model_name,
        "--base-url", "http://127.0.0.1:8080/v1",
        "--unsafe-executor",
        "--max-tokens", "2000",
        "--qwen-no-think",
        "--refinement-ds", latest_parquet,   # üëà add parquet path here
        "--include-outputs"
    ]

    # Add parquet output directory if set
    if os.getenv("ARC_PROGRAMS_PARQUET"):
      cmd_args.extend(["--parquet-output-dir", os.getenv("ARC_PROGRAMS_PARQUET")])

    print(f"Running ENABLE_REFINEMENT second inference: {' '.join(cmd_args)}")

    # Handle output redirection properly
    if IS_RERUN or not IS_KAGGLE:
        # For quiet mode, redirect to file using subprocess
        import subprocess
        log_file_path = f"{SUBMIT_DIR}/run_second.log"
        print(f"üìù Logging ENABLE_REFINEMENT second run output to: {log_file_path}")
        
        with open(log_file_path, "w") as log_file:
            process = subprocess.Popen(
                cmd_args,
                stdout=log_file,
                stderr=subprocess.STDOUT,
                text=True,
                cwd=os.getcwd()
            )
            
            # Wait for completion
            print("‚è≥ Running ENABLE_REFINEMENT second inference (output being written to log file)...")
            return_code = process.wait()
            
        if return_code == 0:
            print(f"‚úÖ ENABLE_REFINEMENT second inference completed successfully. Check {log_file_path} for details.")
        else:
            print(f"‚ùå ENABLE_REFINEMENT second inference failed with return code {return_code}")
            print(f"üìù Check {log_file_path} for error details")
            # Show last few lines of log
            !tail -n 20 {log_file_path}
    else:
        # For interactive mode, show output directly
        cmd = " ".join(cmd_args)
        print(f"Running ENABLE_REFINEMENT second inference: {cmd}\n")
        !{cmd}

else:
    print("üîÑ Skipping second inference (ENABLE_REFINEMENT=false)")
    print("   ‚Üí Standard mode runs first inference only")

üîÑ Running SECOND inference (ENABLE_REFINEMENT mode)
ENABLE_REFINEMENT Second Run ‚Üí dev | attempts=4 | workers=64 | subset=evaluation
üìÇ Using latest parquet file for refinement: /kaggle/working/20250914_192704__kaggle_input_arc-1-fake-ttt-blended-c802-dataset_Soar-qwen-14b-FP8-Dynamic_arc-prize-2025_evaluation.parquet
Running ENABLE_REFINEMENT second inference: uv run python -u -m llm_python.run_arc_tasks_soar --dataset arc-prize-2025 --subset evaluation --max_workers 64 --max_attempts 4 --model /kaggle/input/arc-1-fake-ttt-blended-c802-dataset/Soar-qwen-14b-FP8-Dynamic --base-url http://127.0.0.1:8080/v1 --unsafe-executor --max-tokens 2000 --qwen-no-think --refinement-ds /kaggle/working/20250914_192704__kaggle_input_arc-1-fake-ttt-blended-c802-dataset_Soar-qwen-14b-FP8-Dynamic_arc-prize-2025_evaluation.parquet --include-outputs --parquet-output-dir /kaggle/working
Running ENABLE_REFINEMENT second inference: uv run python -u -m llm_python.run_arc_tasks_soar --dataset arc-prize-2

In [12]:
# Generate submission using the two most recent parquet files
if os.environ.get("SUBMIT", "false").lower() == "true":
    print("üéØ Generating submission from the two most recent parquet files...")
    
    import subprocess
    
    output_dir = str(SUBMIT_DIR)
    
    # Command to generate submission using the two most recent parquet files
    submission_cmd = [
        "uv", "run", "python", "-m", "llm_python.generate_submission",
        "--parquet-path", inference_dir,
        "--n-files", "2",
        "--dataset", DATASET,
        "--subset", SUBSET,
        "--output-dir", output_dir,
        "--debug"
    ]
    
    print(f"Running submission generation: {' '.join(submission_cmd)}")
    print(f"üìÇ Looking for parquet files in: {inference_dir}")
    
    try:
        result = subprocess.run(
            submission_cmd,
            capture_output=True,
            text=True,
            timeout=300,  # 5 minute timeout
            cwd=os.getcwd()
        )
        
        if result.returncode == 0:
            print("‚úÖ Submission generation completed successfully!")
            print(result.stdout)
            
            # Update submit_dir to point to the generated file
            submit_dir = f"{output_dir}/submission.json"
            print(f"üìÅ Submission file: {submit_dir}")
        else:
            print(f"‚ùå Submission generation failed with return code {result.returncode}")
            print(f"STDOUT: {result.stdout}")
            print(f"STDERR: {result.stderr}")
            # Fallback to default submission path
            submit_dir = f"{SUBMIT_DIR}/submission.json"
            
    except subprocess.TimeoutExpired:
        print("‚è±Ô∏è Submission generation timed out")
        submit_dir = f"{SUBMIT_DIR}/submission.json"
    except Exception as e:
        print(f"‚ùå Submission generation error: {e}")
        submit_dir = f"{SUBMIT_DIR}/submission.json"
else:
    print("üìù Skipping submission generation (SUBMIT=false)")
    submit_dir = f"{SUBMIT_DIR}/submission.json"

üéØ Generating submission from the two most recent parquet files...
Running submission generation: uv run python -m llm_python.generate_submission --parquet-path /kaggle/working --n-files 2 --dataset arc-prize-2025 --subset evaluation --output-dir /kaggle/working --debug
üìÇ Looking for parquet files in: /kaggle/working
‚úÖ Submission generation completed successfully!
üîç Selected 2 most recent parquet files from /kaggle/working:
  ‚Ä¢ 20250914_193858__kaggle_input_arc-1-fake-ttt-blended-c802-dataset_Soar-qwen-14b-FP8-Dynamic_arc-prize-2025_evaluation.parquet (modified: 2025-09-14 19:48:09)
  ‚Ä¢ 20250914_192704__kaggle_input_arc-1-fake-ttt-blended-c802-dataset_Soar-qwen-14b-FP8-Dynamic_arc-prize-2025_evaluation.parquet (modified: 2025-09-14 19:35:42)
‚úÖ Loaded 450 rows from /kaggle/working/20250914_193858__kaggle_input_arc-1-fake-ttt-blended-c802-dataset_Soar-qwen-14b-FP8-Dynamic_arc-prize-2025_evaluation.parquet
‚úÖ Loaded 443 rows from /kaggle/working/20250914_192704__kaggle_in

In [13]:
# Only score in dev/commit runs
if SCORE and not IS_RERUN:
    !uv run python -m llm_python.score_submission --submission {submit_dir} --dataset {DATASET} --subset {SUBSET}
else:
    print("Skipping local scoring (competition rerun or SCORE=False).")

üîç Validating submission file: /kaggle/working/submission.json

üîç VALIDATING SUBMISSION: /kaggle/working/submission.json
üìä Validation Results:
  Total tasks: 120
  Total predictions: 172
  Empty predictions ([[0,0],[0,0]]): 0
‚úÖ VALIDATION PASSED - No structural errors found
üéØ Submission file is ready for competition!
üìÇ Loading submission: /kaggle/working/submission.json
üîç Scoring against arc-prize-2025/evaluation
SUBMISSION SCORING RESULTS
Dataset: arc-prize-2025
Subset: evaluation
Reference tasks: 120
Tasks scored: 120
Total predictions: 344

üìä PREDICTION-LEVEL METRICS:
  Pass@1 (first attempt): 1/344 (0.3%)
  Pass@2 (either attempt): 1/344 (0.3%)

üìä TASK-LEVEL METRICS:
  Tasks Pass@1 (all outputs correct on first attempt): 1/120 (0.8%)
  Tasks Pass@2 (all outputs correct on either attempt): 1/120 (0.8%)


In [14]:
# Final cleanup - stop server and free resources
if START_SERVER and 'full_cleanup' in globals():
    print("üßπ Cleaning up server and resources...")
    full_cleanup()
else:
    print("üîç No server cleanup needed (START_SERVER=False or cleanup function not available)")

üßπ Cleaning up server and resources...
Server stopped and CUDA memory cleared.
