In [None]:
import os
from pathlib import Path

## Notes for OpenRouter
# To run against OpenRouter, set the following environment variables first.
# On RunPod it is easiest to set them at pod startup (before the kernel starts):
# export CUSTOM_ENDPOINT="https://openrouter.ai"
# export DEV_RUN="true"
# export OPENAI_API_KEY="key-here"

# ============================================================================
# Model Configuration
# ============================================================================

# Single model for both initial inference and refinement.
# MODEL_HF is used as the model path on local/RunPod runs; MODEL_KAGGLE is the
# dataset folder name that is looked up under the Kaggle input directory.
MODEL_HF = "openai/gpt-oss-20b"  # For local/RunPod
# MODEL_HF = "Trelis/Soar-qwen-14b-FP8-Dynamic"  # For local/RunPod
MODEL_KAGGLE = "arc-1-fake-ttt-blended-c802-dataset"  # Kaggle dataset name

# ============================================================================
# Inference Configuration
# ============================================================================

# Sampling attempts (initial inference)
SAMPLING_ATTEMPTS = 8     # Number of attempts for sampling phase
# Refinement attempts (second inference), total across all refinement phases
REFINEMENT_ATTEMPTS = 8    # Number of attempts for refinement phase

# Number of refinement phases
R = 4  # Number of refinement phases
# Integer division: if REFINEMENT_ATTEMPTS is not a multiple of R the remainder
# is dropped, so R * REFINEMENT_ATTEMPTS_PER_PHASE may be < REFINEMENT_ATTEMPTS.
REFINEMENT_ATTEMPTS_PER_PHASE = REFINEMENT_ATTEMPTS // R

# Global workers setting
MAX_WORKERS = 32            # Number of workers for all inference phases

# ============================================================================
# Other Configuration
# ============================================================================

DATASET = "arc-prize-2025"

# ---- Config flags (single source of truth) ----
# Check for custom endpoint first - if set, disable server startup and use provided endpoint
CUSTOM_ENDPOINT = os.getenv("CUSTOM_ENDPOINT")
START_SERVER = not bool(CUSTOM_ENDPOINT)  # Don't start server if custom endpoint provided

TEST_INFERENCE = True          # run the single-request smoke test cell
SCORE = True                   # default; overridden below, depending on flags

# DEV_RUN can be explicitly set via environment variable, or auto-detected for Kaggle non-rerun.
# Only the literal string "true" (case-insensitive) enables it here.
DEV_RUN = os.getenv("DEV_RUN", "").lower() == "true"

# Wall-clock budgets in seconds
PRODUCTION_TIMEOUT = 5 * 3600
DEV_TIMEOUT = 120

# Configure timeouts based on environment - IMPORTANT: Define before refinement loop
# This is only the default; the run-mode branch further down reassigns it.
REFINEMENT_TIMEOUT_PER_PHASE = int(PRODUCTION_TIMEOUT / R)

# Refinement mode?
ENABLE_REFINEMENT=True

# Env-backed flags
# IS_KAGGLE is true whenever KAGGLE_KERNEL_RUN_TYPE is set to a non-empty value.
IS_KAGGLE = bool(os.getenv("KAGGLE_KERNEL_RUN_TYPE"))
IS_RERUN  = IS_KAGGLE and os.getenv("KAGGLE_IS_COMPETITION_RERUN", "").lower() == "true"

# Auto-detect dev mode for Kaggle non-rerun environments (unless explicitly overridden).
# Any non-empty DEV_RUN value (including "false") counts as an explicit override
# and suppresses the auto-detection.
if IS_KAGGLE and not IS_RERUN and not os.getenv("DEV_RUN"):
    DEV_RUN = True
    print("üîß Auto-detected Kaggle development environment (non-rerun)")

# # String env flag for external tools
# os.environ["SUBMIT"] = "true"

# Ensure client key is set. setdefault leaves an existing OPENAI_API_KEY untouched
# and only installs the "EMPTY" placeholder when none is present.
os.environ.setdefault("OPENAI_API_KEY", "EMPTY")

# ---- Paths ----
# Resolve data/model/output locations for the current environment and pick
# MODEL_PATH (a Path on Kaggle, the Hugging Face model id string otherwise).
if IS_KAGGLE:
    ARC_DATA_ROOT   = Path("/kaggle/input")
    MODEL_SAVE_DIR = Path("/kaggle/working")
    SUBMIT_DIR      = Path("/kaggle/working")
    ARC_PROGRAMS_PARQUET = SUBMIT_DIR

    print("üîç Searching for models in Kaggle environment...")

    # Auto-find model path in Kaggle's dataset structure
    model_dataset_path = ARC_DATA_ROOT / MODEL_KAGGLE
    print(f"   Looking for model dataset: {model_dataset_path}")

    if model_dataset_path.exists() and model_dataset_path.is_dir():
        # The dataset may contain version folders. Path.iterdir() yields entries
        # in arbitrary order, so sort them to make the selected folder
        # deterministic when more than one subdirectory is present.
        subdirs = sorted(d for d in model_dataset_path.iterdir() if d.is_dir())
        if subdirs:
            MODEL_PATH = subdirs[0]  # Use the first (usually only) version folder
            print(f"   ‚úÖ Found model at: {MODEL_PATH}")
            # List what's inside to confirm it's right
            model_contents = list(MODEL_PATH.iterdir())[:5]  # Show first 5 items
            print(f"      Contents: {[f.name for f in model_contents]}")
        else:
            # Fallback if no subdirectory found: use the dataset folder itself
            MODEL_PATH = model_dataset_path
            print(f"   ‚ö†Ô∏è No version folder found for model, using: {MODEL_PATH}")
    else:
        # Dataset missing: keep the expected path (so later failures point at it)
        # and list what is actually mounted to help diagnose a wrong dataset name.
        MODEL_PATH = model_dataset_path
        print(f"   ‚ùå Model dataset not found at: {MODEL_PATH}")
        print(f"      Available datasets: {[d.name for d in ARC_DATA_ROOT.iterdir() if d.is_dir()][:10]}")

    print(f"\nüì¶ Final model path:")
    print(f"   Model: {MODEL_PATH}")

else:
    ARC_DATA_ROOT   = Path("/workspace/arc-agi-2025/data")
    MODEL_SAVE_DIR = Path("/workspace/arc-agi-2025/llm_python/fine-tuning")
    SUBMIT_DIR      = Path("/workspace/arc-agi-2025/llm_python/submissions")
    ARC_PROGRAMS_PARQUET = Path("/workspace/arc-agi-2025/llm_python/datasets/inference")

    # Use local/RunPod model path
    MODEL_PATH = MODEL_HF

    print(f"üì¶ Local/RunPod model path:")
    print(f"   Model: {MODEL_PATH}")

# Directory in which the task runner leaves its parquet files:
#   - Kaggle: directly in /kaggle/working
#   - RunPod/local: llm_python/datasets/inference (relative path)
inference_dir = "/kaggle/working" if IS_KAGGLE else "llm_python/datasets/inference"

# Publish the resolved locations so downstream processes can read them from the environment
os.environ.update({
    "ARC_DATA_ROOT": str(ARC_DATA_ROOT),
    "MODEL_SAVE_DIR": str(MODEL_SAVE_DIR),
    "SUBMIT_DIR": str(SUBMIT_DIR),
    "ARC_PROGRAMS_PARQUET": str(ARC_PROGRAMS_PARQUET),
    "MODEL_PATH": str(MODEL_PATH),
})

# Publish the config flags too; booleans are exported as lowercase "true"/"false"
os.environ.update({
    "IS_KAGGLE": str(IS_KAGGLE).lower(),
    "IS_RERUN": str(IS_RERUN).lower(),
    "DATASET": DATASET,
})

# Create the output directories up front (no error if they already exist)
for p in (MODEL_SAVE_DIR, SUBMIT_DIR):
    p.mkdir(parents=True, exist_ok=True)

# Select the run mode. Each branch decides whether to score, what SUBMIT is
# exported as, and the sampling timeout; the per-phase refinement timeout is
# always 1/R of that sampling timeout and is derived once after the branch.
if IS_RERUN:
    # Kaggle competition rerun - full configuration, no scoring
    print("üèÜ Competition rerun detected")
    SCORE, SAMPLING_TIMEOUT = False, PRODUCTION_TIMEOUT
    # "true": no test information is available for task-wise scoring
    os.environ["SUBMIT"] = "true"
elif DEV_RUN:
    # Development mode - reduced timeouts for faster testing
    print("üîß Development mode enabled - applying shorter timeouts")
    SCORE, SAMPLING_TIMEOUT = True, DEV_TIMEOUT
    # "true": mimic the Kaggle environment (no test information for task-wise scoring)
    os.environ["SUBMIT"] = "true"
else:
    # Production mode (RunPod/local long run) - full configuration
    print("üñ•Ô∏è Production mode (RunPod/local environment)")
    SCORE, SAMPLING_TIMEOUT = True, PRODUCTION_TIMEOUT
    # "false": task test outputs are used to score row by row
    os.environ["SUBMIT"] = "false"

# Each refinement phase gets 1/R of the timeout selected above
REFINEMENT_TIMEOUT_PER_PHASE = SAMPLING_TIMEOUT // R

# Report the refinement configuration: attempts, worker count and timeouts
# that the sampling and refinement phases will use.
if ENABLE_REFINEMENT:
    print("üß™ ENABLE_REFINEMENT ENABLED")
    print("   ‚Üí Will run: Sampling ‚Üí Refinement")
    print(f"   ‚Üí Sampling: {SAMPLING_ATTEMPTS} attempts, {MAX_WORKERS} workers, {SAMPLING_TIMEOUT}s timeout")
    print(f"   ‚Üí Refinement: {R} phases √ó {REFINEMENT_ATTEMPTS_PER_PHASE} attempts, {MAX_WORKERS} workers, {REFINEMENT_TIMEOUT_PER_PHASE}s timeout each")
else:
    print("üîÑ Standard mode (ENABLE_REFINEMENT disabled)")
    print(f"   ‚Üí Will run: Sampling only ({SAMPLING_ATTEMPTS} attempts, {MAX_WORKERS} workers, {SAMPLING_TIMEOUT}s timeout)")

# Custom endpoint configuration
if CUSTOM_ENDPOINT:
    # No local server is started in this mode, so the server cell never assigns
    # model_name; set it here because later cells read it.
    model_name = MODEL_HF
    print(f"üîó Using custom endpoint: {CUSTOM_ENDPOINT}")
    print("   ‚Üí Server startup disabled")
else:
    print("üöÄ Will start local server (no custom endpoint specified)")

# Optional: quick summary (helps avoid accidental submits).
# os.environ['SUBMIT'] is always present here: every run-mode branch above sets it.
print(
    "Mode summary ‚Üí "
    f"IS_KAGGLE={IS_KAGGLE} | IS_RERUN={IS_RERUN} | DEV_RUN={DEV_RUN} | ENABLE_REFINEMENT={ENABLE_REFINEMENT} |\n"
    f"TEST_INFERENCE={TEST_INFERENCE} | SCORE={SCORE} | SUBMIT={os.environ['SUBMIT']} | MODEL={MODEL_PATH}"
)

In [None]:
import sys
import torch
import numpy as np

# Environment report: interpreter, PyTorch/CUDA and NumPy versions, one per line
for label, value in (
    ("Python version", sys.version),
    ("PyTorch version", torch.__version__),
    ("CUDA version (PyTorch)", torch.version.cuda),
    ("CUDA available", torch.cuda.is_available()),
    ("NumPy version", np.__version__),
):
    print(f"{label}: {value}")

# GPU details are only queried when CUDA is usable
if torch.cuda.is_available():
    print(f"GPU count: {torch.cuda.device_count()}")
    print(f"GPU name: {torch.cuda.get_device_name(0)}")

In [None]:
# SGLang is required: a missing install fails this cell with ImportError.
import sglang
print("SGLang version:", sglang.__version__)

# FlashInfer is optional: report its version when importable, otherwise say so.
try:
    import flashinfer
except ImportError:
    print("FlashInfer not installed")
else:
    print("FlashInfer version:", flashinfer.__version__)

In [None]:
if IS_KAGGLE:
    # Kaggle-only setup: copy the CUDA tools Triton needs into /kaggle/working,
    # make them executable, and point Triton's JIT at them.
    import os, shutil, subprocess, stat

    # 1) Destination for the copied binaries and for Triton's cache
    WRK_BIN = "/kaggle/working/bin"
    TRITON_CACHE = "/kaggle/working/.triton"
    for _needed_dir in (WRK_BIN, TRITON_CACHE):
        os.makedirs(_needed_dir, exist_ok=True)

    # 2) Where to look for ptxas/cuobjdump/nvdisasm, in priority order
    SYSTEM_CUDA_BIN = "/usr/local/cuda/bin"
    FALLBACK_VENDORED = "/kaggle/usr/lib/sglang_utility/triton/backends/nvidia/bin"  # if you have it

    def copy_tool(name: str):
        """Copy tool `name` into WRK_BIN and return the destination path.

        The system CUDA bin directory is tried first, then the vendored
        fallback. The copy gets the executable bit for user, group and other.

        Raises:
            FileNotFoundError: if `name` exists in neither source directory.
        """
        exec_bits = stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH
        candidates = (
            os.path.join(src_dir, name)
            for src_dir in (SYSTEM_CUDA_BIN, FALLBACK_VENDORED)
        )
        src = next((path for path in candidates if os.path.exists(path)), None)
        if src is None:
            raise FileNotFoundError(f"Could not find {name} in {SYSTEM_CUDA_BIN} or {FALLBACK_VENDORED}")

        dst = os.path.join(WRK_BIN, name)
        shutil.copy2(src, dst)
        # ensure executable bit
        os.chmod(dst, os.stat(dst).st_mode | exec_bits)
        return dst

    # ptxas is mandatory; the other two tools are optional and recorded as None when absent
    ptxas_path = copy_tool("ptxas")
    _optional_tools = {}
    for _tool in ("cuobjdump", "nvdisasm"):
        try:
            _optional_tools[_tool] = copy_tool(_tool)
        except FileNotFoundError:
            _optional_tools[_tool] = None  # optional
    cuobjdump_path = _optional_tools["cuobjdump"]
    nvdisasm_path = _optional_tools["nvdisasm"]

    # 3) Environment for Triton/JIT
    os.environ.update({
        "TRITON_PTXAS_PATH": ptxas_path,
        "PATH": f"{WRK_BIN}:{os.environ.get('PATH','')}",
        "TRITON_CACHE_DIR": TRITON_CACHE,
        "CUDA_HOME": "/usr/local/cuda",
        "CUDA_PATH": "/usr/local/cuda",
    })

    # Helpful fallbacks if you still hit capture issues:
    # os.environ["SGLANG_DISABLE_CUDA_GRAPH"] = "1"      # skip CUDA graphs (degrades perf but avoids capture)
    # os.environ["TRITON_CODEGEN_FATBIN"] = "0"          # can reduce Triton fatbin steps on some setups

    # 4) Smoke test: ensure ptxas runs from the new location
    _ptxas_version = subprocess.check_output([ptxas_path, "--version"]).decode().strip()
    print("ptxas ->", _ptxas_version)

    # Now it's safe to import heavy libs that trigger Triton
    import torch

In [None]:
if START_SERVER:
  # Multi-GPU server launcher - one SGLang server process per GPU.
  # Each server is pinned to a single GPU via CUDA_VISIBLE_DEVICES, listens on
  # base_port + gpu_id, and writes stdout/stderr to its own log in SUBMIT_DIR.
  # Servers whose process survives the first 2 seconds are recorded in `servers`.
  import os, sys, time, subprocess, json, socket, requests

  # Ensure log/output directory exists
  os.makedirs(SUBMIT_DIR, exist_ok=True)

  # Get GPU count (falls back to 1 if torch cannot be imported or the CUDA calls fail)
  try:
      import torch
      if torch.cuda.is_available():
          torch.cuda.empty_cache()
          torch.cuda.synchronize()
          print("CUDA memory cleared.")
      num_gpus = torch.cuda.device_count()
  except Exception:
      num_gpus = 1

  print(f"üî• Detected {num_gpus} GPU(s) - starting {num_gpus} servers")

  # Kill any existing sglang processes, then give them a moment to exit
  subprocess.run(["pkill", "-f", "sglang.launch_server"], capture_output=True)
  time.sleep(3)

  model_path_to_use = str(MODEL_PATH)
  print(f"üîß Using model from {model_path_to_use}")

  servers = []  # Track all server processes
  base_port = 8080

  # Start one server per GPU
  for gpu_id in range(num_gpus):
      port = base_port + gpu_id
      log_file_path = f"{SUBMIT_DIR}/sglang_server_gpu{gpu_id}.log"
      print(f"GPU {gpu_id} ‚Üí port {port} ‚Üí log {log_file_path}")

      # Environment with specific GPU visibility
      env = os.environ.copy()
      env["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
      env["PYTHONUNBUFFERED"] = "1"  # force unbuffered logs from child too

      # Debug: Print environment
      print(f"   CUDA_VISIBLE_DEVICES={env['CUDA_VISIBLE_DEVICES']}")

      SERVER_CMD = [
            sys.executable, "-u", "-m", "sglang.launch_server",
            "--host", "0.0.0.0",
            "--port", str(port),
            "--model-path", model_path_to_use,
            "--dp", "1",                    # one process sees one GPU
            "--enable-metrics",
            "--grammar-backend", "none",
      ]

      # Add Qwen-specific flag
      if 'qwen' in model_path_to_use.lower():
          SERVER_CMD.extend(["--kv-cache-dtype", "fp8_e4m3"])

      print(f"   Command: {' '.join(SERVER_CMD)}")

      # Launch server with explicit error handling.
      # log_f starts as None so the failure handler can tell "open() failed"
      # apart from "launch failed after the log was opened" instead of
      # tripping over an unbound name.
      log_f = None
      try:
          log_f = open(log_file_path, "w")
          log_f.write(f"Starting SGLang server for GPU {gpu_id}\n")
          log_f.write(f"Command: {' '.join(SERVER_CMD)}\n")
          log_f.write(f"CUDA_VISIBLE_DEVICES: {env['CUDA_VISIBLE_DEVICES']}\n")
          log_f.write("=" * 50 + "\n")
          log_f.flush()

          proc = subprocess.Popen(
              SERVER_CMD,
              stdout=log_f,
              stderr=subprocess.STDOUT,
              env=env,
              cwd=SUBMIT_DIR
          )

          # Give the process a moment so an immediate crash is detected here
          time.sleep(2)

          poll_result = proc.poll()
          if poll_result is not None:
              print(f"‚ùå GPU {gpu_id} server process exited immediately with code {poll_result}")
              log_f.write(f"Process exited immediately with code {poll_result}\n")
              log_f.flush()  # ensure you see the exit line
              log_f.close()
              continue
          else:
              print(f"‚úÖ GPU {gpu_id} server process started successfully (PID {proc.pid})")

      except Exception as e:
          print(f"‚ùå Failed to start GPU {gpu_id} server: {e}")
          if log_f is not None and not log_f.closed:
              try:
                  log_f.write(f"Failed to start server: {e}\n")
                  log_f.flush()
              except OSError:
                  # Best effort only: the failure has already been printed above
                  pass
              finally:
                  log_f.close()
          continue

      servers.append({
          'gpu_id': gpu_id,
          'port': port,
          'process': proc,
          'log_file': log_f,
          'log_path': log_file_path,
          'health_url': f"http://127.0.0.1:{port}/v1/models"
      })

  if not servers:
      print("‚ùå No servers started successfully!")
      raise RuntimeError("Failed to start any servers")

  print(f"üîÑ Started {len(servers)} servers, waiting for readiness...")

  # Wait for all servers to be ready
  def wait_ready(url, timeout_s=600):
      """Poll `url` until it answers HTTP 200 or `timeout_s` seconds have elapsed.

      Each probe uses a 3 second request timeout and any exception raised by a
      probe is treated as "not ready yet". Probes are spaced 2 seconds apart.

      Returns:
          True as soon as a probe returns status 200, False once the time
          budget is used up (immediately when `timeout_s` is <= 0).
      """
      started = time.time()
      while True:
          if not (time.time() - started < timeout_s):
              return False
          try:
              is_up = requests.get(url, timeout=3).status_code == 200
          except Exception:
              is_up = False
          if is_up:
              return True
          time.sleep(2)

  # Wait for each launched server to answer its health URL, record the model id
  # it reports, and keep only the servers that became ready.
  print("üîÑ Waiting for all servers to be ready...")
  ready_servers = []

  for server in servers:
      print(f"Checking GPU {server['gpu_id']} on port {server['port']}...")
      if wait_ready(server['health_url']):
          print(f"‚úÖ GPU {server['gpu_id']} server ready on port {server['port']}")

          # Get model name. Default to the configured model path; it is replaced
          # only when the server reports at least one model.
          server['model_name'] = str(MODEL_PATH)
          try:
              # Bounded request: without a timeout a stalled server would hang the cell
              response = requests.get(server['health_url'], timeout=10)
              if response.status_code == 200:
                  models = response.json()['data']
                  if models:
                      model_name = models[0]['id']
                      server['model_name'] = model_name
                      print(f"   Model loaded: {model_name}")
          except Exception as e:
              print(f"‚ö†Ô∏è Could not get model name for GPU {server['gpu_id']}: {e}")

          ready_servers.append(server)
      else:
          print(f"‚ùå GPU {server['gpu_id']} server failed to start - check {server['log_path']}")
          # Check if process is still running
          exit_code = server['process'].poll()
          if exit_code is not None:
              print(f"   Process exited with code {exit_code}")
          # NOTE(review): a process that is still alive here is not terminated and
          # is dropped from `servers` below - confirm whether it should be stopped.
          server['log_file'].close()

  # Update servers list to only include ready servers
  servers = ready_servers

  if not servers:
      print("‚ùå No servers started successfully!")
      raise RuntimeError("Failed to start any servers")

  print(f"üöÄ {len(servers)} servers ready! GPU ports: {[s['port'] for s in servers]}")

  # Use first server's model name for compatibility
  model_name = servers[0]['model_name']

  # Cleanup functions
  def stop_all_servers():
      """Stop every server process listed in `servers` and close its log file.

      Each process is asked to terminate and given 10 seconds to exit. If that
      fails for any reason it is killed and then reaped. The log file handle is
      closed on both paths so no descriptor is leaked.
      """
      for server in servers:
          proc = server['process']
          try:
              proc.terminate()
              proc.wait(timeout=10)
          except Exception:
              # Graceful shutdown failed or timed out: force-kill and reap the child
              proc.kill()
              proc.wait()
          finally:
              server['log_file'].close()
      print("All servers stopped.")

  def full_cleanup():
      """Stop all servers, kill stray sglang launchers and clear CUDA caches.

      Steps:
        1. Terminate each process in `servers` (10 second grace period, then
           kill and reap) and close its log file on every path.
        2. Run `pkill -f sglang.launch_server` to catch processes that are not
           tracked in `servers`.
        3. Best effort: empty the PyTorch CUDA cache on every visible GPU. A
           failure here is reported but does not raise.
      """
      # Stop all servers
      for server in servers:
          proc = server['process']
          try:
              proc.terminate()
              proc.wait(timeout=10)
          except Exception:
              # Graceful shutdown failed or timed out: force-kill and reap the child
              proc.kill()
              proc.wait()
          finally:
              server['log_file'].close()

      # Kill any lingering sglang processes
      subprocess.run(["pkill", "-f", "sglang.launch_server"], capture_output=True)

      # Clear CUDA memory on all GPUs
      try:
          import torch
          if torch.cuda.is_available():
              for gpu_id in range(torch.cuda.device_count()):
                  torch.cuda.set_device(gpu_id)
                  torch.cuda.empty_cache()
                  torch.cuda.synchronize()
      except Exception as exc:
          # Cache clearing is optional; surface the reason instead of hiding it
          print(f"Warning: could not clear CUDA memory: {exc}")

      print("All servers stopped and CUDA memory cleared.")

  print("Call stop_all_servers() or full_cleanup() to shut down all servers.")

In [None]:
if TEST_INFERENCE:
    import time
    import requests

    # Target the custom endpoint when one is configured, else the local server on port 8080
    base_url = CUSTOM_ENDPOINT or "http://127.0.0.1:8080/v1"
    url = base_url + "/chat/completions"

    # JSON request authenticated with the key from the environment
    headers = {
        "Content-Type": "application/json",
        "Authorization": "Bearer " + os.environ["OPENAI_API_KEY"],
    }
    
    messages = [
        {"role" : "system", "content" : "You are an expert at solving abstract reasoning puzzles. Write clean, efficient Python code."},
        {"role" : "user", "content" : "You are solving an ARC (Abstraction and Reasoning Corpus) task. \nI will show you training examples with input and output grids, plus a test input grid. Your task is to:\n\n1. **Analyze the training examples** to discover patterns that map input grids to output grids\n2. **Write a Python program** that implements your best understanding of the transformation  \n3. **DO NOT predict or generate the test output** - your job is only to write the transformation program\n4. **Attempt a solution** - even if the pattern isn't completely clear, provide your best hypothesis\n5. **Do not repeat the same transformation** - if you have already tried a transformation, do not repeat it.\n\n**IMPORTANT: Your transformation must always produce a 10√ó10 output grid.**\n\nThe test input is shown for context so you understand what type of grid your program will eventually process. Focus on learning patterns from training examples and writing code that captures your understanding.\n\nTraining Examples:\n\nExample 1:\nInput:\n5 0 0 5 0 0 0 5 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\nOutput:\n5 0 0 5 0 0 0 5 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n2 0 0 2 0 0 0 2 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n2 0 0 2 0 0 0 2 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n\nExample 2:\nInput:\n0 5 0 5 5 0 0 5 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\nOutput:\n0 5 0 5 5 0 0 5 0 0\n0 0 0 0 0 0 0 0 0 0\n0 2 0 2 2 0 0 2 0 5\n0 0 0 0 0 0 0 0 0 0\n0 2 0 2 2 0 0 2 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 2 0 2 2 0 0 2 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n\nExample 3:\nInput:\n0 0 5 5 0 5 0 5 5 0\n0 0 0 0 0 0 0 0 0 
0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\nOutput:\n0 0 5 5 0 5 0 5 5 0\n0 0 0 0 0 0 0 0 0 0\n0 0 2 2 0 2 0 2 2 5\n0 0 2 2 0 2 0 2 2 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 2 2 0 2 0 2 2 5\n0 0 0 0 0 0 0 0 0 0\n0 0 2 2 0 2 0 2 2 5\n0 0 0 0 0 0 0 0 0 0\n\nTest Input:\n5 0 5 5 0 0 5 0 5 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n\nAnalyze the patterns in the training examples and write a Python function that performs this transformation.\n\n**Approach Guidelines:**\n- Look for patterns in shapes, colors, positions, sizes, rotations, reflections, etc.\n- Even if you can't solve all training examples perfectly, implement what patterns you do observe\n- A partial solution that captures some aspects is better than returning the input unchanged\n- If the pattern is unclear, make your best educated guess based on what you can see\n\nRequirements:\n- The function takes a 2D list (grid) where grid[row][col] gives the value at that position\n- Values are integers from 0-9\n- Return a new grid (2D list) with the transformation applied\n- You can use numpy if needed - just add 'import numpy as np' at the start of your function\n- Aim to handle the training examples as well as possible, even if not perfectly\n- Your function should attempt some meaningful transformation based on the patterns you observe\n\nYou MUST end your response with the following exact format:\n\nFinal answer:\n```python\ndef transform(grid):\n    # Your transformation logic here (implement your best understanding)\n    return transformed_grid\n```\n"}
    ]
    
    # Smoke-test request: a deliberately tiny completion (10 tokens) just to
    # confirm the endpoint answers and to get a rough throughput figure.
    payload = {
        "model": model_name,  # from your polling loop
        "messages": messages,
        # "max_tokens": 1000
        "max_tokens": 10
    }

    start_time = time.time()
    response = requests.post(url, headers=headers, json=payload, timeout=600)
    end_time = time.time()  # stop the clock before any printing
    print(response)

    response.raise_for_status()
    result = response.json()
    # The message content can come back as null (presumably e.g. when the small
    # token budget is used up before any visible text - TODO confirm for this
    # server); treat that as an empty string so the report below does not
    # crash on len(None).
    output_text = result["choices"][0]["message"]["content"] or ""

    # Estimate token count (4 chars/token assumption)
    estimated_tokens = len(output_text) / 4
    elapsed_time = end_time - start_time
    # Guard against a zero interval from a coarse clock
    tokens_per_second = estimated_tokens / elapsed_time if elapsed_time > 0 else 0.0

    print("‚úÖ Response received:")
    print(output_text)
    print(f"\n‚è± Elapsed time: {elapsed_time:.2f} seconds")
    print(f"üî¢ Estimated tokens: {estimated_tokens:.1f}")
    print(f"‚ö° Output tokens/sec: {tokens_per_second:.2f}")

In [None]:
if not IS_KAGGLE:
      %cd /workspace/arc-agi-2025

# Multi-GPU parallel sampling
#
# Launches one task-runner subprocess per available server, splits
# SAMPLING_ATTEMPTS across them, then waits until every runner has exited or
# SAMPLING_TIMEOUT seconds have passed. Runners still alive at the timeout are
# terminated and counted as completed.

# Imported here because the cells that import these modules are conditional
# (Kaggle setup / local server startup) and may not have run.
import subprocess
import time

if 'servers' not in globals() or not servers:
    print("‚ùå No servers available - falling back to single GPU mode")
    # Fallback to original single server approach. The endpoint chosen here is
    # stored on the server entry so the launch loop below uses it instead of
    # always pointing at localhost.
    base_url = CUSTOM_ENDPOINT if CUSTOM_ENDPOINT else "http://127.0.0.1:8080/v1"
    servers_to_use = [{'port': 8080, 'gpu_id': 0, 'base_url': base_url}]
    num_gpus = 1
else:
    servers_to_use = servers
    num_gpus = len(servers)

print(f"üî• Running sampling phase across {num_gpus} GPU(s)")

# Split attempts across GPUs
attempts_per_gpu = SAMPLING_ATTEMPTS // num_gpus
remainder_attempts = SAMPLING_ATTEMPTS % num_gpus

SUBSET = "test" if IS_RERUN else "evaluation"

print(f"Sampling Inference ‚Üí {'competition' if IS_RERUN else 'dev'} | total_attempts={SAMPLING_ATTEMPTS} | {num_gpus} GPUs | subset={SUBSET} | timeout={SAMPLING_TIMEOUT}s")

sampling_processes = []

# Launch one task runner per GPU
for i, server_info in enumerate(servers_to_use):
    gpu_id = server_info.get('gpu_id', i)
    port = server_info['port']

    # Add delay between launches to ensure unique parquet filenames
    if i > 0:
        print(f"‚è±Ô∏è Waiting 3 seconds before launching GPU {gpu_id} to ensure unique timestamp...")
        time.sleep(3)

    # Distribute attempts (give remainder to first GPUs)
    attempts = attempts_per_gpu + (1 if i < remainder_attempts else 0)
    workers = MAX_WORKERS  # Each runner uses full worker count

    # Locally launched servers have no 'base_url' entry and are reached on
    # localhost; the custom-endpoint fallback entry carries its own URL.
    base_url = server_info.get('base_url', f"http://127.0.0.1:{port}/v1")

    print(f"GPU {gpu_id}: {attempts} attempts ‚Üí port {port}")

    # Build the command for this GPU
    cmd_args = [
      "uv", "run", "python", "-u", "-m", "llm_python.run_arc_tasks_soar",
      "--dataset", DATASET,
      "--subset", SUBSET,
      "--max_workers", str(workers),
      "--max_attempts", str(attempts),
      "--model", model_name,
      "--base-url", base_url,
      "--unsafe-executor",
      "--single"
    ]

    # Qwen models get a smaller token budget and the no-think flag
    if 'qwen' in model_name.lower():
        cmd_args.extend(["--max-tokens", "2000"])
        cmd_args.extend(["--qwen-no-think"])
    else:
        cmd_args.extend(["--max-tokens", "64000"])

    # Add parquet output directory if set
    if os.getenv("ARC_PROGRAMS_PARQUET"):
        cmd_args.extend(["--parquet-output-dir", os.getenv("ARC_PROGRAMS_PARQUET")])

    log_file_path = f"{SUBMIT_DIR}/sampling_gpu{gpu_id}.log"
    print(f"   Logging to: {log_file_path}")

    # Launch subprocess; do not leak the log handle if the launch itself fails
    log_file = open(log_file_path, "w")
    try:
        process = subprocess.Popen(
            cmd_args,
            stdout=log_file,
            stderr=subprocess.STDOUT,
            text=True,
            cwd=os.getcwd()
        )
    except Exception:
        log_file.close()
        raise

    sampling_processes.append({
        'gpu_id': gpu_id,
        'process': process,
        'log_file': log_file,
        'log_path': log_file_path,
        'attempts': attempts,
        'port': port
    })

print(f"üöÄ Launched {len(sampling_processes)} parallel sampling processes")
print("Command example:", " ".join(cmd_args))

# Wait for all sampling processes to complete concurrently
start_time = time.time()
completed_processes = []
failed_processes = []
remaining_processes = sampling_processes.copy()

print(f"‚è≥ Waiting for all {len(sampling_processes)} sampling processes to complete...")

while remaining_processes and (time.time() - start_time) < SAMPLING_TIMEOUT:
    # Iterate over a copy because finished entries are removed inside the loop
    for proc_info in remaining_processes.copy():
        gpu_id = proc_info['gpu_id']
        poll_result = proc_info['process'].poll()

        if poll_result is not None:  # Process has finished
            remaining_processes.remove(proc_info)
            proc_info['log_file'].close()

            if poll_result == 0:
                print(f"‚úÖ GPU {gpu_id} sampling completed naturally ({proc_info['attempts']} attempts)")
                completed_processes.append(proc_info)
            else:
                print(f"‚ùå GPU {gpu_id} sampling failed with return code {poll_result}")
                print(f"üìù Check {proc_info['log_path']} for details")
                failed_processes.append(proc_info)

    if remaining_processes:
        time.sleep(1)  # Check every second

# Handle any processes that timed out - treat as successful
for proc_info in remaining_processes:
    gpu_id = proc_info['gpu_id']
    print(f"‚è∞ GPU {gpu_id} sampling timeout reached ({SAMPLING_TIMEOUT}s) - terminating")
    proc_info['process'].terminate()
    try:
        proc_info['process'].wait(timeout=30)
        print(f"‚úÖ GPU {gpu_id} process terminated gracefully")
    except subprocess.TimeoutExpired:
        print(f"‚ö†Ô∏è GPU {gpu_id} process didn't terminate gracefully, forcing kill")
        proc_info['process'].kill()
        proc_info['process'].wait()

    proc_info['log_file'].close()
    # Treat timeout as successful completion
    print(f"‚úÖ GPU {gpu_id} sampling completed (timeout) ({proc_info['attempts']} attempts)")
    completed_processes.append(proc_info)

print(f"üìä Sampling Results: {len(completed_processes)} succeeded, {len(failed_processes)} failed")

if not completed_processes:
    print("‚ùå All sampling processes failed!")
    raise RuntimeError("No sampling processes completed successfully")
else:
    total_completed_attempts = sum(p['attempts'] for p in completed_processes)
    print(f"‚úÖ Successfully completed {total_completed_attempts} total attempts across {len(completed_processes)} GPUs")

In [None]:
if ENABLE_REFINEMENT:
    print("üîÑ Checking if server restart is needed...")

    # Since we're using the same model for both initial and refinement inference,
    # we don't need to restart the server
    print("‚úÖ Using same model for refinement - no server restart needed")
    print(f"üéØ Continuing with existing model: {MODEL_PATH}")

    # Just verify the server is still running and get the model name
    if START_SERVER:
        # Local import: this cell must not depend on another cell having imported requests
        import requests

        # Probe the first server that actually became ready. Its port is not
        # necessarily 8080 (e.g. when the GPU 0 server failed to start), so the
        # hard-coded URL is only a fallback for when no server list exists.
        _ready_servers = globals().get('servers') or []
        if _ready_servers:
            HEALTH_URL = _ready_servers[0]['health_url']
        else:
            HEALTH_URL = "http://127.0.0.1:8080/v1/models"

        try:
            response = requests.get(HEALTH_URL, timeout=5)
            if response.status_code == 200:
                models = response.json()['data']
                if models:
                    model_name = models[0]['id']
                    print(f"‚úÖ Server is running with model: {model_name}")
                else:
                    print("‚ùå No models found on server")
            else:
                print(f"‚ùå Server health check failed: {response.status_code}")
        except Exception as e:
            # Verification is advisory only: report the problem and carry on
            print(f"‚ö†Ô∏è Could not verify server status: {e}")
            print("‚ùå Server may not be running properly")
    else:
        print("‚ÑπÔ∏è START_SERVER is False - assuming server is managed externally")

In [None]:
# Multi-GPU Parallel Refinement - Looped Implementation
import glob

if ENABLE_REFINEMENT:
    print("üîÑ Running refinement inference (ENABLE_REFINEMENT mode)")
    
    if not IS_KAGGLE:
        %cd /workspace/arc-agi-2025

    # Use available servers from sampling phase
    if 'servers' not in globals() or not servers:
        print("‚ùå No servers available - falling back to single GPU mode")
        servers_to_use = [{'port': 8080, 'gpu_id': 0}]
        num_gpus = 1
    else:
        servers_to_use = servers
        num_gpus = len(servers)

    SUBSET = "test" if IS_RERUN else "evaluation"

    # Run R refinement phases
    for phase in range(1, R + 1):
        print(f"\nüéØ Phase {phase} of {R} refinement phases")
        print(f"üî• Running refinement phase {phase} across {num_gpus} GPU(s)")

        # Split attempts across GPUs for this phase
        attempts_per_gpu = REFINEMENT_ATTEMPTS_PER_PHASE // num_gpus
        remainder_attempts = REFINEMENT_ATTEMPTS_PER_PHASE % num_gpus

        print(f"Refinement Phase {phase} ‚Üí {'competition' if IS_RERUN else 'dev'} | total_attempts={REFINEMENT_ATTEMPTS_PER_PHASE} | {num_gpus} GPUs | subset={SUBSET} | timeout={REFINEMENT_TIMEOUT_PER_PHASE}s")

        # üîë Find ALL parquet files in inference_dir (from sampling + previous refinement phases)
        parquet_files = glob.glob(os.path.join(inference_dir, "*.parquet"))
        if not parquet_files:
            raise FileNotFoundError(f"No parquet files found in {inference_dir}")
        
        # Sort by creation time for consistent ordering
        parquet_files.sort(key=os.path.getctime)
        print(f"üìÇ Using ALL {len(parquet_files)} parquet files for refinement phase {phase}:")
        for pf in parquet_files:
            print(f"  ‚Ä¢ {os.path.basename(pf)}")

        refinement_processes = []

        # Launch one refinement task runner per GPU
        for i, server_info in enumerate(servers_to_use):
            gpu_id = server_info.get('gpu_id', i)
            port = server_info['port']

            # Add delay between launches to ensure unique parquet filenames
            if i > 0:
                print(f"‚è±Ô∏è Waiting 3 seconds before launching GPU {gpu_id} refinement to ensure unique timestamp...")
                time.sleep(3)
            
            # Distribute attempts (give remainder to first GPUs)
            attempts = attempts_per_gpu + (1 if i < remainder_attempts else 0)
            workers = MAX_WORKERS  # Each runner uses full worker count
            
            base_url = f"http://127.0.0.1:{port}/v1"
            
            print(f"GPU {gpu_id}: {attempts} refinement attempts ‚Üí port {port}")

            # Build the command for this GPU
            cmd_args = [
                "uv", "run", "python", "-u", "-m", "llm_python.run_arc_tasks_soar",
                "--dataset", DATASET,
                "--subset", SUBSET,
                "--max_workers", str(workers),
                "--max_attempts", str(attempts),
                "--model", model_name,
                "--base-url", base_url,
                "--unsafe-executor",
                "--single",
                "--refinement-ds"
            ]
            
            # Add ALL parquet files as refinement datasets (each GPU gets same files)
            cmd_args.extend(parquet_files)

            if 'qwen' in model_name.lower():
                cmd_args.extend(["--max-tokens", "2000"])
                cmd_args.extend(["--qwen-no-think"])
            else:
                cmd_args.extend(["--max-tokens", "64000"])

            # Add parquet output directory if set
            if os.getenv("ARC_PROGRAMS_PARQUET"):
              cmd_args.extend(["--parquet-output-dir", os.getenv("ARC_PROGRAMS_PARQUET")])
              cmd_args.extend(["--rex-stats"])

            log_file_path = f"{SUBMIT_DIR}/refinement_phase{phase}_gpu{gpu_id}.log"
            print(f"   Logging to: {log_file_path}")

            # Launch subprocess
            log_file = open(log_file_path, "w")
            process = subprocess.Popen(
                cmd_args,
                stdout=log_file,
                stderr=subprocess.STDOUT,
                text=True,
                cwd=os.getcwd()
            )
            
            refinement_processes.append({
                'gpu_id': gpu_id,
                'process': process,
                'log_file': log_file,
                'log_path': log_file_path,
                'attempts': attempts,
                'port': port
            })

        print(f"üöÄ Launched {len(refinement_processes)} parallel refinement processes for phase {phase}")
        print("Command example:", " ".join(cmd_args[:15]) + " ...")  # Truncate long command

        # Wait for all refinement processes to complete concurrently
        start_time = time.time()
        completed_refinement = []
        failed_refinement = []
        remaining_processes = refinement_processes.copy()

        print(f"‚è≥ Waiting for all {len(refinement_processes)} refinement processes to complete...")

        while remaining_processes and (time.time() - start_time) < REFINEMENT_TIMEOUT_PER_PHASE:
            for proc_info in remaining_processes.copy():
                gpu_id = proc_info['gpu_id']
                poll_result = proc_info['process'].poll()

                if poll_result is not None:  # Process has finished
                    remaining_processes.remove(proc_info)
                    proc_info['log_file'].close()

                    if poll_result == 0:
                        print(f"‚úÖ GPU {gpu_id} refinement phase {phase} completed naturally ({proc_info['attempts']} attempts)")
                        completed_refinement.append(proc_info)
                    else:
                        print(f"‚ùå GPU {gpu_id} refinement phase {phase} failed with return code {poll_result}")
                        print(f"üìù Check {proc_info['log_path']} for details")
                        failed_refinement.append(proc_info)

            if remaining_processes:
                time.sleep(1)  # Check every second

        # Handle any processes that timed out - treat as successful
        for proc_info in remaining_processes:
            gpu_id = proc_info['gpu_id']
            print(f"‚è∞ GPU {gpu_id} refinement phase {phase} timeout reached ({REFINEMENT_TIMEOUT_PER_PHASE}s) - terminating")
            proc_info['process'].terminate()
            try:
                proc_info['process'].wait(timeout=30)
                print(f"‚úÖ GPU {gpu_id} process terminated gracefully")
            except subprocess.TimeoutExpired:
                print(f"‚ö†Ô∏è GPU {gpu_id} process didn't terminate gracefully, forcing kill")
                proc_info['process'].kill()
                proc_info['process'].wait()

            proc_info['log_file'].close()
            # Treat timeout as successful completion
            print(f"‚úÖ GPU {gpu_id} refinement phase {phase} completed (timeout) ({proc_info['attempts']} attempts)")
            completed_refinement.append(proc_info)

        print(f"üìä Refinement Phase {phase} Results: {len(completed_refinement)} succeeded, {len(failed_refinement)} failed")

        if completed_refinement:
            total_completed_attempts = sum(p['attempts'] for p in completed_refinement)
            print(f"‚úÖ Successfully completed {total_completed_attempts} total refinement attempts for phase {phase} across {len(completed_refinement)} GPUs")
        else:
            print(f"‚ö†Ô∏è All refinement phase {phase} processes failed - continuing to next phase" if phase < R else "‚ö†Ô∏è All refinement phase {phase} processes failed")

    print(f"\nüéâ Completed all {R} refinement phases!")

else:
    print("üîÑ Skipping refinement phase (ENABLE_REFINEMENT=false)")
    print("   ‚Üí Standard mode runs sampling phase only")

In [None]:
# Build the submission from every parquet file in `inference_dir` by running
# the llm_python.generate_submission module as a subprocess.

print("üéØ Generating submission from ALL parquet files in the directory...")

import subprocess

output_dir = str(SUBMIT_DIR)

# Success and fallback locations resolve to the same file, because
# output_dir is just str(SUBMIT_DIR); compute it once.
expected_submission_path = f"{output_dir}/submission.json"

# Command to generate submission using ALL parquet files
submission_cmd = ["uv", "run", "python", "-m", "llm_python.generate_submission"]
submission_cmd += ["--parquet-path", inference_dir]
submission_cmd += ["--n-files", "-1"]  # -1 = use ALL files instead of just the 2 most recent
submission_cmd += ["--dataset", DATASET]
submission_cmd += ["--subset", SUBSET]
submission_cmd += ["--output-dir", output_dir]
submission_cmd += ["--debug"]

print(f"Running submission generation: {' '.join(submission_cmd)}")
print(f"üìÇ Looking for parquet files in: {inference_dir}")

try:
    result = subprocess.run(
        submission_cmd,
        capture_output=True,
        text=True,
        timeout=300,  # 5 minute timeout
        cwd=os.getcwd(),
    )

    if result.returncode != 0:
        # Generator ran but reported failure: dump both streams for diagnosis
        print(f"‚ùå Submission generation failed with return code {result.returncode}")
        print(f"STDOUT: {result.stdout}")
        print(f"STDERR: {result.stderr}")
        # Fallback to default submission path
        submit_dir = expected_submission_path
    else:
        print("‚úÖ Submission generation completed successfully!")
        print(result.stdout)

        # Point submit_dir at the generated file for the scoring cell
        submit_dir = expected_submission_path
        print(f"üìÅ Submission file: {submit_dir}")

except subprocess.TimeoutExpired:
    print("‚è±Ô∏è Submission generation timed out")
    submit_dir = expected_submission_path
except Exception as e:
    print(f"‚ùå Submission generation error: {e}")
    submit_dir = expected_submission_path

In [None]:
# Only score in dev/commit runs
if SCORE and not IS_RERUN:
    !uv run python -m llm_python.score_submission --submission {submit_dir} --dataset {DATASET} --subset {SUBSET}
else:
    print("Skipping local scoring (competition rerun or SCORE=False).")

In [None]:
# Multi-GPU cleanup - stop all servers and free resources
#
# Prefers the notebook's `full_cleanup()` helper when it exists; otherwise
# stops each entry in `servers` by hand, kills stray sglang launchers and
# empties the CUDA caches. Does nothing when START_SERVER is False.
if START_SERVER and 'full_cleanup' in globals():
    print("üßπ Cleaning up all servers and resources...")
    full_cleanup()
elif START_SERVER and 'servers' in globals() and servers:
    print("üßπ Cleaning up all servers and resources...")
    
    # Stop all servers
    for server in servers:
        try:
            print(f"Stopping GPU {server['gpu_id']} server (PID {server['process'].pid})...")
            server['process'].terminate()
            server['process'].wait(timeout=10)
        except Exception as e:
            # Graceful stop failed (typically the 10s wait timed out): escalate
            # to SIGKILL and reap the process so no zombie is left behind.
            print(f"Force killing GPU {server['gpu_id']} server...")
            print(f"   Reason: {e}")
            server['process'].kill()
            server['process'].wait()
        finally:
            # Close the server log on every path, including the force-kill one
            server_log = server.get('log_file')
            if server_log is not None:
                server_log.close()

    # Kill any lingering sglang processes
    subprocess.run(["pkill", "-f", "sglang.launch_server"], capture_output=True)

    # Clear CUDA memory on all GPUs
    try:
        import torch
        if torch.cuda.is_available():
            for gpu_id in range(torch.cuda.device_count()):
                torch.cuda.set_device(gpu_id)
                torch.cuda.empty_cache()
                torch.cuda.synchronize()
            print(f"‚úÖ Cleared CUDA memory on {torch.cuda.device_count()} GPUs")
    except Exception as e:
        # Best-effort: a missing torch install or CUDA error must not fail cleanup
        print(f"‚ö†Ô∏è Could not clear CUDA memory: {e}")

    print("‚úÖ All servers stopped and CUDA memory cleared.")
    
else:
    print("üîç No multi-GPU cleanup needed (START_SERVER=False or no servers found)")