In [1]:
import os
from pathlib import Path

## Notes for OpenRouter
# You can run with OpenRouter but must first - it's easiest to set these in Runpod on startup (before kernel startup):
# export CUSTOM_ENDPOINT="https://openrouter.ai"
# export DEV_RUN="true"
# export OPENAI_API_KEY="key-here"

# ============================================================================
# Model Configuration
# ============================================================================

# Single model for both initial inference and refinement
MODEL_HF = "Trelis/Soar-qwen-14b-FP8-Dynamic"  # For local/RunPod
MODEL_KAGGLE = "arc-1-fake-ttt-blended-c802-dataset"  # Kaggle dataset name

# ============================================================================
# Inference Configuration
# ============================================================================

# Sampling attempts (initial inference)
SAMPLING_ATTEMPTS = 256     # Number of attempts for sampling phase
# Refinement attempts (second inference)
REFINEMENT_ATTEMPTS = 256    # Number of attempts for refinement phase

# Global workers setting
MAX_WORKERS = 256            # Number of workers for all inference phases

# ============================================================================
# Other Configuration
# ============================================================================

DATASET = "arc-prize-2025"

# ---- Config flags (single source of truth) ----
# Check for custom endpoint first - if set, disable server startup and use provided endpoint
CUSTOM_ENDPOINT = os.getenv("CUSTOM_ENDPOINT")
START_SERVER = not bool(CUSTOM_ENDPOINT)  # Don't start server if custom endpoint provided

TEST_INFERENCE = True
SCORE = True                   # default; overridden below, depending on flags

# DEV_RUN can be explicitly set via environment variable, or auto-detected for Kaggle non-rerun
DEV_RUN = os.getenv("DEV_RUN", "").lower() == "true"

PRODUCTION_TIMEOUT = 5 * 3600
DEV_TIMEOUT = 120

# Refinement mode?
ENABLE_REFINEMENT=True

# Env-backed flags
IS_KAGGLE = bool(os.getenv("KAGGLE_KERNEL_RUN_TYPE"))
IS_RERUN  = IS_KAGGLE and os.getenv("KAGGLE_IS_COMPETITION_RERUN", "").lower() == "true"

# Auto-detect dev mode for Kaggle non-rerun environments (unless explicitly overridden)
if IS_KAGGLE and not IS_RERUN and not os.getenv("DEV_RUN"):
    DEV_RUN = True
    print("üîß Auto-detected Kaggle development environment (non-rerun)")

# # String env flag for external tools
# os.environ["SUBMIT"] = "true"

# Ensure client key is set
os.environ.setdefault("OPENAI_API_KEY", "EMPTY")

# ---- Paths ----
if IS_KAGGLE:
    ARC_DATA_ROOT   = Path("/kaggle/input")
    MODEL_SAVE_DIR = Path("/kaggle/working")
    SUBMIT_DIR      = Path("/kaggle/working")
    ARC_PROGRAMS_PARQUET = SUBMIT_DIR

    print("üîç Searching for models in Kaggle environment...")

    # Auto-find model path in Kaggle's dataset structure
    model_dataset_path = ARC_DATA_ROOT / MODEL_KAGGLE
    print(f"   Looking for model dataset: {model_dataset_path}")

    if model_dataset_path.exists() and model_dataset_path.is_dir():
        # Kaggle datasets have version folders, find the first subdirectory
        subdirs = [d for d in model_dataset_path.iterdir() if d.is_dir()]
        if subdirs:
            MODEL_PATH = subdirs[0]  # Use the first (usually only) version folder
            print(f"   ‚úÖ Found model at: {MODEL_PATH}")
            # List what's inside to confirm it's right
            model_contents = list(MODEL_PATH.iterdir())[:5]  # Show first 5 items
            print(f"      Contents: {[f.name for f in model_contents]}")
        else:
            # Fallback if no subdirectory found
            MODEL_PATH = model_dataset_path
            print(f"   ‚ö†Ô∏è No version folder found for model, using: {MODEL_PATH}")
    else:
        MODEL_PATH = model_dataset_path
        print(f"   ‚ùå Model dataset not found at: {MODEL_PATH}")
        print(f"      Available datasets: {[d.name for d in ARC_DATA_ROOT.iterdir() if d.is_dir()][:10]}")

    print(f"\nüì¶ Final model path:")
    print(f"   Model: {MODEL_PATH}")

else:
    ARC_DATA_ROOT   = Path("/workspace/arc-agi-2025/data")
    MODEL_SAVE_DIR = Path("/workspace/arc-agi-2025/llm_python/fine-tuning")
    SUBMIT_DIR      = Path("/workspace/arc-agi-2025/llm_python/submissions")
    ARC_PROGRAMS_PARQUET = Path("/workspace/arc-agi-2025/llm_python/datasets/inference")

    # Use local/RunPod model path
    MODEL_PATH = MODEL_HF

    print(f"üì¶ Local/RunPod model path:")
    print(f"   Model: {MODEL_PATH}")

# Set up paths - parquet files are saved by task runner in different locations
if IS_KAGGLE:
    # On Kaggle, parquet files are saved directly in /kaggle/working by task runner
    inference_dir = "/kaggle/working"
else:
    # On RunPod/local, parquet files are saved in llm_python/datasets/inference
    inference_dir = "llm_python/datasets/inference"

# Export envs for downstream processes
os.environ["ARC_DATA_ROOT"]   = str(ARC_DATA_ROOT)
os.environ["MODEL_SAVE_DIR"] = str(MODEL_SAVE_DIR)
os.environ["SUBMIT_DIR"]      = str(SUBMIT_DIR)
os.environ["ARC_PROGRAMS_PARQUET"] = str(ARC_PROGRAMS_PARQUET)
os.environ["MODEL_PATH"] = str(MODEL_PATH)

# Export config flags for subprocess use
os.environ["IS_KAGGLE"] = str(IS_KAGGLE).lower()
os.environ["IS_RERUN"] = str(IS_RERUN).lower()
os.environ["DATASET"] = DATASET

# Ensure directories exist
for p in (MODEL_SAVE_DIR, SUBMIT_DIR):
    p.mkdir(parents=True, exist_ok=True)

# Configure based on environment
if IS_RERUN:
    # Kaggle competition rerun - use full configuration
    print(f"üèÜ Competition rerun detected")
    SCORE = False
    os.environ["SUBMIT"] = "true" # means we don't have test information available for task-wise scoring
    # Full timeouts for competition
    SAMPLING_TIMEOUT = PRODUCTION_TIMEOUT
    REFINEMENT_TIMEOUT = PRODUCTION_TIMEOUT
elif DEV_RUN:
    # Development mode - use reduced timeouts for faster testing
    print(f"üîß Development mode enabled - applying shorter timeouts")
    SCORE = True  # if we're generating a submission, do scoring
    os.environ["SUBMIT"] = "true" # means we don't have test information available for task-wise scoring. Mimic Kaggle environment.
    # Short timeouts for dev testing
    SAMPLING_TIMEOUT = DEV_TIMEOUT
    REFINEMENT_TIMEOUT = DEV_TIMEOUT
else:
    # Production mode (RunPod/local long run) - use full configuration
    print(f"üñ•Ô∏è Production mode (RunPod/local environment)")
    SCORE = True  # if we're generating a submission, do scoring
    os.environ["SUBMIT"] = "false" # means we will use task test outputs to score row by row
    # Full timeouts for production runs
    SAMPLING_TIMEOUT = PRODUCTION_TIMEOUT
    REFINEMENT_TIMEOUT = PRODUCTION_TIMEOUT

# ENABLE_REFINEMENT Mode configuration
if ENABLE_REFINEMENT:
    print("üß™ ENABLE_REFINEMENT ENABLED")
    print("   ‚Üí Will run: Sampling ‚Üí Refinement")
    print(f"   ‚Üí Sampling: {SAMPLING_ATTEMPTS} attempts, {MAX_WORKERS} workers, {SAMPLING_TIMEOUT}s timeout")
    print(f"   ‚Üí Refinement: {REFINEMENT_ATTEMPTS} attempts, {MAX_WORKERS} workers, {REFINEMENT_TIMEOUT}s timeout")
else:
    print("üîÑ Standard mode (ENABLE_REFINEMENT disabled)")
    print(f"   ‚Üí Will run: Sampling only ({SAMPLING_ATTEMPTS} attempts, {MAX_WORKERS} workers, {SAMPLING_TIMEOUT}s timeout)")

# Custom endpoint configuration
if CUSTOM_ENDPOINT:
    model_name = MODEL_HF
    print(f"üîó Using custom endpoint: {CUSTOM_ENDPOINT}")
    print("   ‚Üí Server startup disabled")
else:
    print("üöÄ Will start local server (no custom endpoint specified)")

# Optional: quick summary (helps avoid accidental submits)
print(
    "Mode summary ‚Üí "
    f"IS_KAGGLE={IS_KAGGLE} | IS_RERUN={IS_RERUN} | DEV_RUN={DEV_RUN} | ENABLE_REFINEMENT={ENABLE_REFINEMENT} |\n"
    f"TEST_INFERENCE={TEST_INFERENCE} | SCORE={SCORE} | SUBMIT={os.environ['SUBMIT']} | MODEL={MODEL_PATH}"
)

üì¶ Local/RunPod model path:
   Model: Trelis/Soar-qwen-14b-FP8-Dynamic
üñ•Ô∏è Production mode (RunPod/local environment)
üß™ ENABLE_REFINEMENT ENABLED
   ‚Üí Will run: Sampling ‚Üí Refinement
   ‚Üí Sampling: 256 attempts, 256 workers, 18000s timeout
   ‚Üí Refinement: 256 attempts, 256 workers, 18000s timeout
üöÄ Will start local server (no custom endpoint specified)
Mode summary ‚Üí IS_KAGGLE=False | IS_RERUN=False | DEV_RUN=False | ENABLE_REFINEMENT=True |
TEST_INFERENCE=True | SCORE=True | SUBMIT=false | MODEL=Trelis/Soar-qwen-14b-FP8-Dynamic


In [2]:
import sys
import torch
import numpy as np

print(f"Python version: {sys.version}")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA version (PyTorch): {torch.version.cuda}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"NumPy version: {np.__version__}")
if torch.cuda.is_available():
   print(f"GPU count: {torch.cuda.device_count()}")
   print(f"GPU name: {torch.cuda.get_device_name(0)}")

Python version: 3.11.11 (main, Dec  4 2024, 08:55:07) [GCC 11.4.0]
PyTorch version: 2.8.0+cu128
CUDA version (PyTorch): 12.8
CUDA available: True
NumPy version: 2.2.0
GPU count: 1
GPU name: NVIDIA H200


In [3]:
import sglang
print("SGLang version:", sglang.__version__)

try:
    import flashinfer
    print("FlashInfer version:", flashinfer.__version__)
except ImportError:
    print("FlashInfer not installed")

SGLang version: 0.5.1.post2


W0917 11:18:57.957000 24756 torch/utils/cpp_extension.py:2425] TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. 
W0917 11:18:57.957000 24756 torch/utils/cpp_extension.py:2425] If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'] to specific architectures.


FlashInfer version: 0.2.14.post1


In [4]:
if IS_KAGGLE:
    import os, shutil, subprocess, stat
    
    # 1) Where to place binaries + cache
    WRK_BIN = "/kaggle/working/bin"
    TRITON_CACHE = "/kaggle/working/.triton"
    os.makedirs(WRK_BIN, exist_ok=True)
    os.makedirs(TRITON_CACHE, exist_ok=True)
    
    # 2) Preferred source for ptxas/cuobjdump/nvdisasm
    SYSTEM_CUDA_BIN = "/usr/local/cuda/bin"
    FALLBACK_VENDORED = "/kaggle/usr/lib/sglang_utility/triton/backends/nvidia/bin"  # if you have it
    
    def copy_tool(name: str):
        for src_dir in (SYSTEM_CUDA_BIN, FALLBACK_VENDORED):
            src = os.path.join(src_dir, name)
            if os.path.exists(src):
                dst = os.path.join(WRK_BIN, name)
                shutil.copy2(src, dst)
                # ensure executable bit
                os.chmod(dst, os.stat(dst).st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
                return dst
        raise FileNotFoundError(f"Could not find {name} in {SYSTEM_CUDA_BIN} or {FALLBACK_VENDORED}")
    
    ptxas_path = copy_tool("ptxas")
    try:
        cuobjdump_path = copy_tool("cuobjdump")
    except FileNotFoundError:
        cuobjdump_path = None  # optional
    try:
        nvdisasm_path = copy_tool("nvdisasm")
    except FileNotFoundError:
        nvdisasm_path = None  # optional
    
    # 3) Environment for Triton/JIT
    os.environ["TRITON_PTXAS_PATH"] = ptxas_path
    os.environ["PATH"] = f"{WRK_BIN}:{os.environ.get('PATH','')}"
    os.environ["TRITON_CACHE_DIR"] = TRITON_CACHE
    os.environ["CUDA_HOME"] = "/usr/local/cuda"
    os.environ["CUDA_PATH"] = "/usr/local/cuda"
    
    # Helpful fallbacks if you still hit capture issues:
    # os.environ["SGLANG_DISABLE_CUDA_GRAPH"] = "1"      # skip CUDA graphs (degrades perf but avoids capture)
    # os.environ["TRITON_CODEGEN_FATBIN"] = "0"          # can reduce Triton fatbin steps on some setups
    
    # 4) Smoke test: ensure ptxas runs from the new location
    print("ptxas ->", subprocess.check_output([ptxas_path, "--version"]).decode().strip())
    
    # Now it's safe to import heavy libs that trigger Triton
    import torch

In [8]:
if START_SERVER:
  # Background server launcher for Kaggle with SGLang
  import os, sys, time, subprocess, json, socket, requests

  # ---------- 1) Check for existing server and cleanup ----------
  PORT = 8080
  HEALTH_URL = f"http://127.0.0.1:{PORT}/v1/models"

  # Check if server already running
  try:
      r = requests.get(HEALTH_URL, timeout=3)
      if r.status_code == 200:
          print(f"Server already running on port {PORT}. Stopping it first...")
          # Kill existing sglang processes
          subprocess.run(["pkill", "-f", "sglang.launch_server"], capture_output=True)
          time.sleep(3)  # Wait for cleanup
  except:
      pass  # No server running

  # Clear CUDA memory before starting
  try:
      import torch
      if torch.cuda.is_available():
          torch.cuda.empty_cache()
          torch.cuda.synchronize()
          print("CUDA memory cleared.")
      num_gpus = torch.cuda.device_count()
  except Exception:
      num_gpus = 0
      
  model_path_to_use = str(MODEL_PATH)
  print(f"üîß Using model from {model_path_to_use}")

  LOG = f"{SUBMIT_DIR}/sglang_server.log"
  print(f"LOG file path: {LOG}")

  SERVER_CMD = [
        sys.executable, "-m", "sglang.launch_server",
        "--host", "0.0.0.0",
        "--port", str(PORT),
        "--model-path", model_path_to_use,
        "--dp", str(max(1, min(num_gpus, 4))),
        "--enable-metrics",
        "--grammar-backend", "none",
  ]
    
  # Add Qwen-specific flag
  if 'qwen' in model_path_to_use.lower():
      SERVER_CMD.extend(["--kv-cache-dtype", "fp8_e4m3"])

  # ---------- 2) Launch in background ----------
  log_f = open(LOG, "w")
  env = os.environ.copy()
  proc = subprocess.Popen(SERVER_CMD, stdout=log_f, stderr=subprocess.STDOUT, env=env, cwd=SUBMIT_DIR)
  print(f"Started sglang server PID={proc.pid} | logging to {LOG}")
  print("Command:", " ".join(SERVER_CMD))

  # ---------- 3) Wait for readiness ----------
  def wait_ready(url, timeout_s=600):
      t0 = time.time()
      while time.time() - t0 < timeout_s:
          try:
              r = requests.get(url, timeout=3)
              if r.status_code == 200:
                  return True
          except Exception:
              pass
          time.sleep(2)
      return False

  ready = wait_ready(HEALTH_URL)
  log_f.flush()

  if ready:
      print(f"sglang is READY on port {PORT}.")
      # Get the model name
      try:
          response = requests.get(HEALTH_URL)
          if response.status_code == 200:
              models = response.json()['data']
              if models:
                  model_name = models[0]['id']
                  print(f"‚úÖ Model loaded: {model_name}")
              else:
                  print("‚ùå No models found on server")
                  model_name = str(MODEL_PATH)
          else:
              print(f"‚ùå Server health check failed: {response.status_code}")
              model_name = str(MODEL_PATH)
      except Exception as e:
          print(f"‚ö†Ô∏è Could not get model name: {e}")
          model_name = str(MODEL_PATH)
  else:
      print(f"sglang not ready after timeout. Showing last 60 log lines:")
      log_f.close()
      !tail -n 60 {LOG}
      model_name = str(MODEL_PATH)

  # ---------- 4) Cleanup functions ----------
  def stop_server(p=proc):
      try:
          p.terminate()
          p.wait(timeout=10)
      except Exception:
          p.kill()
      print("Server stopped.")

  def full_cleanup(p=proc):
      # Stop server
      try:
          p.terminate()
          p.wait(timeout=10)
      except Exception:
          p.kill()

      # Also kill any lingering sglang processes
      subprocess.run(["pkill", "-f", "sglang.launch_server"], capture_output=True)

      # Clear CUDA memory
      try:
          import torch
          if torch.cuda.is_available():
              torch.cuda.empty_cache()
              torch.cuda.synchronize()
      except:
          pass

      print("Server stopped and CUDA memory cleared.")

  print("Call stop_server() or full_cleanup() to shut it down gracefully.")

CUDA memory cleared.
üîß Using model from Trelis/Soar-qwen-14b-FP8-Dynamic
LOG file path: /workspace/arc-agi-2025/llm_python/submissions/sglang_server.log
Started sglang server PID=24914 | logging to /workspace/arc-agi-2025/llm_python/submissions/sglang_server.log
Command: /workspace/arc-agi-2025/.venv/bin/python3 -m sglang.launch_server --host 0.0.0.0 --port 8080 --model-path Trelis/Soar-qwen-14b-FP8-Dynamic --dp 1 --enable-metrics --grammar-backend none --kv-cache-dtype fp8_e4m3
sglang is READY on port 8080.
‚úÖ Model loaded: Trelis/Soar-qwen-14b-FP8-Dynamic
Call stop_server() or full_cleanup() to shut it down gracefully.


In [9]:
if TEST_INFERENCE:
    import time
    import requests
    
    # Use custom endpoint if provided, otherwise use local server
    base_url = CUSTOM_ENDPOINT if CUSTOM_ENDPOINT else "http://127.0.0.1:8080/v1"
    url = f"{base_url}/chat/completions"
    
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}"
    }
    
    messages = [
        {"role" : "system", "content" : "You are an expert at solving abstract reasoning puzzles. Write clean, efficient Python code."},
        {"role" : "user", "content" : "You are solving an ARC (Abstraction and Reasoning Corpus) task. \nI will show you training examples with input and output grids, plus a test input grid. Your task is to:\n\n1. **Analyze the training examples** to discover patterns that map input grids to output grids\n2. **Write a Python program** that implements your best understanding of the transformation  \n3. **DO NOT predict or generate the test output** - your job is only to write the transformation program\n4. **Attempt a solution** - even if the pattern isn't completely clear, provide your best hypothesis\n5. **Do not repeat the same transformation** - if you have already tried a transformation, do not repeat it.\n\n**IMPORTANT: Your transformation must always produce a 10√ó10 output grid.**\n\nThe test input is shown for context so you understand what type of grid your program will eventually process. Focus on learning patterns from training examples and writing code that captures your understanding.\n\nTraining Examples:\n\nExample 1:\nInput:\n5 0 0 5 0 0 0 5 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\nOutput:\n5 0 0 5 0 0 0 5 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n2 0 0 2 0 0 0 2 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n2 0 0 2 0 0 0 2 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n\nExample 2:\nInput:\n0 5 0 5 5 0 0 5 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\nOutput:\n0 5 0 5 5 0 0 5 0 0\n0 0 0 0 0 0 0 0 0 0\n0 2 0 2 2 0 0 2 0 5\n0 0 0 0 0 0 0 0 0 0\n0 2 0 2 2 0 0 2 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 2 0 2 2 0 0 2 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n\nExample 3:\nInput:\n0 0 5 5 0 5 0 5 5 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\nOutput:\n0 0 5 5 0 5 0 5 5 0\n0 0 0 0 0 0 0 0 0 0\n0 0 2 2 0 2 0 2 2 5\n0 0 2 2 0 2 0 2 2 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 2 2 0 2 0 2 2 5\n0 0 0 0 0 0 0 0 0 0\n0 0 2 2 0 2 0 2 2 5\n0 0 0 0 0 0 0 0 0 0\n\nTest Input:\n5 0 5 5 0 0 5 0 5 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n\nAnalyze the patterns in the training examples and write a Python function that performs this transformation.\n\n**Approach Guidelines:**\n- Look for patterns in shapes, colors, positions, sizes, rotations, reflections, etc.\n- Even if you can't solve all training examples perfectly, implement what patterns you do observe\n- A partial solution that captures some aspects is better than returning the input unchanged\n- If the pattern is unclear, make your best educated guess based on what you can see\n\nRequirements:\n- The function takes a 2D list (grid) where grid[row][col] gives the value at that position\n- Values are integers from 0-9\n- Return a new grid (2D list) with the transformation applied\n- You can use numpy if needed - just add 'import numpy as np' at the start of your function\n- Aim to handle the training examples as well as possible, even if not perfectly\n- Your function should attempt some meaningful transformation based on the patterns you observe\n\nYou MUST end your response with the following exact format:\n\nFinal answer:\n```python\ndef transform(grid):\n    # Your transformation logic here (implement your best understanding)\n    return transformed_grid\n```\n"}
    ]
    
    payload = {
        "model": model_name,  # from your polling loop
        "messages": messages,
        # "max_tokens": 1000
        "max_tokens": 10
    }
    
    start_time = time.time()
    response = requests.post(url, headers=headers, json=payload, timeout=600)
    print(response)
    end_time = time.time()
    
    response.raise_for_status()
    result = response.json()
    output_text = result["choices"][0]["message"]["content"]
    
    # Estimate token count (4 chars/token assumption)
    estimated_tokens = len(output_text) / 4
    elapsed_time = end_time - start_time
    tokens_per_second = estimated_tokens / elapsed_time
    
    print("‚úÖ Response received:")
    print(output_text)
    print(f"\n‚è± Elapsed time: {elapsed_time:.2f} seconds")
    print(f"üî¢ Estimated tokens: {estimated_tokens:.1f}")
    print(f"‚ö° Output tokens/sec: {tokens_per_second:.2f}")

<Response [200]>
‚úÖ Response received:
Based on the training examples, the transformation rule appears

‚è± Elapsed time: 0.82 seconds
üî¢ Estimated tokens: 15.8
‚ö° Output tokens/sec: 19.29


In [None]:
if not IS_KAGGLE:
      %cd /workspace/arc-agi-2025

# Use SAMPLING_ATTEMPTS and MAX_WORKERS for initial inference
attempts = SAMPLING_ATTEMPTS
workers = MAX_WORKERS

# SUBSET = "test" # defaulting to test to ensure there are no loading issues.

# can use this instead if testing evaluation during a pre-run
SUBSET = "test" if IS_RERUN else "evaluation"

print(f"Sampling Inference ‚Üí {'competition' if IS_RERUN else 'dev'} | attempts={attempts} | workers={workers} | subset={SUBSET} | timeout={SAMPLING_TIMEOUT}s")

# Use custom endpoint if provided, otherwise use local server
base_url = CUSTOM_ENDPOINT if CUSTOM_ENDPOINT else "http://127.0.0.1:8080/v1"

# Build the command
cmd_args = [
  "uv", "run", "python", "-u", "-m", "llm_python.run_arc_tasks_soar",
  "--dataset", DATASET,
  "--subset", SUBSET,
  "--max_workers", str(workers),
  "--max_attempts", str(attempts),
  "--model", model_name,
  "--base-url", base_url,
  "--unsafe-executor",
  "--splitter"
]

if 'qwen' in model_name.lower():
    cmd_args.extend(["--max-tokens", "2000"])
    cmd_args.extend(["--qwen-no-think"])
else:
    cmd_args.extend(["--max-tokens", "64000"])

# Add parquet output directory if set
if os.getenv("ARC_PROGRAMS_PARQUET"):
    cmd_args.extend(["--parquet-output-dir", os.getenv("ARC_PROGRAMS_PARQUET")])

print(f"Running command: {' '.join(cmd_args)}")

# Always use subprocess with timeout (works for all environments)
import subprocess
log_file_path = f"{SUBMIT_DIR}/sampling.log"
print(f"üìù Logging output to: {log_file_path}")

with open(log_file_path, "w") as log_file:
  process = subprocess.Popen(
      cmd_args,
      stdout=log_file,
      stderr=subprocess.STDOUT,
      text=True,
      cwd=os.getcwd()
  )

  # Wait for completion with timeout
  print(f"‚è≥ Running sampling phase (output being written to log file, {SAMPLING_TIMEOUT}s timeout)...")
  try:
      return_code = process.wait(timeout=SAMPLING_TIMEOUT)
      if return_code == 0:
          print(f"‚úÖ Sampling phase completed successfully. Check {log_file_path} for details.")
      else:
          print(f"‚ùå Sampling phase failed with return code {return_code}")
          print(f"üìù Check {log_file_path} for error details")
          # Show last few lines of log
          !tail -n 20 {log_file_path}
  except subprocess.TimeoutExpired:
      print(f"‚è∞ Sampling phase timeout reached ({SAMPLING_TIMEOUT}s) - terminating process")
      process.terminate()
      try:
          # Give it 600 seconds to cleanup gracefully
          process.wait(timeout=600)
          print("‚úÖ Process terminated gracefully - parquet files should be saved")
      except subprocess.TimeoutExpired:
          print("‚ö†Ô∏è Process didn't terminate gracefully, forcing kill")
          process.kill()
          process.wait()

/workspace/arc-agi-2025
Sampling Inference ‚Üí dev | attempts=256 | workers=256 | subset=evaluation | timeout=18000s
Running command: uv run python -u -m llm_python.run_arc_tasks_soar --dataset arc-prize-2025 --subset evaluation --max_workers 256 --max_attempts 256 --model Trelis/Soar-qwen-14b-FP8-Dynamic --base-url http://127.0.0.1:8080/v1 --unsafe-executor --splitter --max-tokens 2000 --qwen-no-think --parquet-output-dir /workspace/arc-agi-2025/llm_python/datasets/inference
üìù Logging output to: /workspace/arc-agi-2025/llm_python/submissions/sampling.log
‚è≥ Running sampling phase (output being written to log file, 18000s timeout)...


In [None]:
if ENABLE_REFINEMENT:
    print("üîÑ Checking if server restart is needed...")
    
    # Since we're using the same model for both initial and refinement inference,
    # we don't need to restart the server
    print("‚úÖ Using same model for refinement - no server restart needed")
    print(f"üéØ Continuing with existing model: {MODEL_PATH}")
    
    # Just verify the server is still running and get the model name
    if START_SERVER:
        try:
            HEALTH_URL = "http://127.0.0.1:8080/v1/models"
            response = requests.get(HEALTH_URL, timeout=5)
            if response.status_code == 200:
                models = response.json()['data']
                if models:
                    model_name = models[0]['id']
                    print(f"‚úÖ Server is running with model: {model_name}")
                else:
                    print("‚ùå No models found on server")
            else:
                print(f"‚ùå Server health check failed: {response.status_code}")
        except Exception as e:
            print(f"‚ö†Ô∏è Could not verify server status: {e}")
            print("‚ùå Server may not be running properly")
    else:
        print("‚ÑπÔ∏è START_SERVER is False - assuming server is managed externally")

In [None]:
# Second Inference Run - ENABLE_REFINEMENT
# Only runs when ENABLE_REFINEMENT=true

import glob

if ENABLE_REFINEMENT:
    print("üîÑ Running refinement inference (ENABLE_REFINEMENT mode)")
    
    if not IS_KAGGLE:
        %cd /workspace/arc-agi-2025

    # Use REFINEMENT_ATTEMPTS and MAX_WORKERS for second inference
    attempts = REFINEMENT_ATTEMPTS
    workers = MAX_WORKERS

    SUBSET = "test" if IS_RERUN else "evaluation"

    print(f"Refinement Phase ‚Üí {'competition' if IS_RERUN else 'dev'} | attempts={attempts} | workers={workers} | subset={SUBSET} | timeout={REFINEMENT_TIMEOUT}s")

    # üîë Find the latest parquet file in inference_dir
    parquet_files = glob.glob(os.path.join(inference_dir, "*.parquet"))
    if not parquet_files:
        raise FileNotFoundError(f"No parquet files found in {inference_dir}")
    latest_parquet = max(parquet_files, key=os.path.getctime)
    print(f"üìÇ Using latest parquet file for refinement: {latest_parquet}")

    # Use custom endpoint if provided, otherwise use local server
    base_url = CUSTOM_ENDPOINT if CUSTOM_ENDPOINT else "http://127.0.0.1:8080/v1"

    # Build the command
    cmd_args = [
        "uv", "run", "python", "-u", "-m", "llm_python.run_arc_tasks_soar",
        "--dataset", DATASET,
        "--subset", SUBSET,
        "--max_workers", str(workers),
        "--max_attempts", str(attempts),
        "--model", model_name,
        "--base-url", base_url,
        "--unsafe-executor",
        "--refinement-ds", latest_parquet,   # üëà add parquet path here
    ]

    if 'qwen' in model_name.lower():
        cmd_args.extend(["--max-tokens", "2000"])
        cmd_args.extend(["--qwen-no-think"])
    else:
        cmd_args.extend(["--max-tokens", "64000"])

    # Add parquet output directory if set
    if os.getenv("ARC_PROGRAMS_PARQUET"):
      cmd_args.extend(["--parquet-output-dir", os.getenv("ARC_PROGRAMS_PARQUET")])
      cmd_args.extend(["--rex-stats"])

    print(f"Running refinement inference: {' '.join(cmd_args)}")

    # Handle output redirection properly
    # For quiet mode, redirect to file using subprocess
    import subprocess
    log_file_path = f"{SUBMIT_DIR}/refinement.log"
    print(f"üìù Logging refinement phase output to: {log_file_path}")
    
    with open(log_file_path, "w") as log_file:
        process = subprocess.Popen(
            cmd_args,
            stdout=log_file,
            stderr=subprocess.STDOUT,
            text=True,
            cwd=os.getcwd()
        )
        
        # Wait for completion with timeout
        print(f"‚è≥ Running refinement phase (output being written to log file, {REFINEMENT_TIMEOUT}s timeout)...")
        try:
            return_code = process.wait(timeout=REFINEMENT_TIMEOUT)
            if return_code == 0:
                print(f"‚úÖ Refinement phase completed successfully. Check {log_file_path} for details.")
            else:
                print(f"‚ùå Refinement phase failed with return code {return_code}")
                print(f"üìù Check {log_file_path} for error details")
                # Show last few lines of log
                !tail -n 20 {log_file_path}
        except subprocess.TimeoutExpired:
            print(f"‚è∞ Refinement phase timeout reached ({REFINEMENT_TIMEOUT}s) - terminating process")
            process.terminate()
            try:
                # Give it 30 seconds to cleanup gracefully
                process.wait(timeout=30)
                print("‚úÖ Process terminated gracefully - parquet files should be saved")
            except subprocess.TimeoutExpired:
                print("‚ö†Ô∏è Process didn't terminate gracefully, forcing kill")
                process.kill()
                process.wait()

else:
    print("üîÑ Skipping refinement phase (ENABLE_REFINEMENT=false)")
    print("   ‚Üí Standard mode runs sampling phase only")

In [None]:
# Generate submission using the two most recent parquet files

print("üéØ Generating submission from the two most recent parquet files...")

import subprocess

output_dir = str(SUBMIT_DIR)

# Command to generate submission using the two most recent parquet files
submission_cmd = [
    "uv", "run", "python", "-m", "llm_python.generate_submission",
    "--parquet-path", inference_dir,
    "--n-files", "2",
    "--dataset", DATASET,
    "--subset", SUBSET,
    "--output-dir", output_dir,
    "--debug"
]

print(f"Running submission generation: {' '.join(submission_cmd)}")
print(f"üìÇ Looking for parquet files in: {inference_dir}")

try:
    result = subprocess.run(
        submission_cmd,
        capture_output=True,
        text=True,
        timeout=300,  # 5 minute timeout
        cwd=os.getcwd()
    )
    
    if result.returncode == 0:
        print("‚úÖ Submission generation completed successfully!")
        print(result.stdout)
        
        # Update submit_dir to point to the generated file
        submit_dir = f"{output_dir}/submission.json"
        print(f"üìÅ Submission file: {submit_dir}")
    else:
        print(f"‚ùå Submission generation failed with return code {result.returncode}")
        print(f"STDOUT: {result.stdout}")
        print(f"STDERR: {result.stderr}")
        # Fallback to default submission path
        submit_dir = f"{SUBMIT_DIR}/submission.json"
        
except subprocess.TimeoutExpired:
    print("‚è±Ô∏è Submission generation timed out")
    submit_dir = f"{SUBMIT_DIR}/submission.json"
except Exception as e:
    print(f"‚ùå Submission generation error: {e}")
    submit_dir = f"{SUBMIT_DIR}/submission.json"

In [None]:
# Only score in dev/commit runs
if SCORE and not IS_RERUN:
    !uv run python -m llm_python.score_submission --submission {submit_dir} --dataset {DATASET} --subset {SUBSET}
else:
    print("Skipping local scoring (competition rerun or SCORE=False).")

In [None]:
# Final cleanup - stop server and free resources
if START_SERVER and 'full_cleanup' in globals():
    print("üßπ Cleaning up server and resources...")
    full_cleanup()
else:
    print("üîç No server cleanup needed (START_SERVER=False or cleanup function not available)")