In [1]:
import os
from pathlib import Path

MODEL_PATH = "Trelis/Qwen3-4B_ds-arc-agi-1-partial-100-c1542" # Need a way to get this from local if needed.
DATASET = "arc-prize-2025"

# ---- Config flags (single source of truth) ----
START_SERVER = True
TEST_INFERENCE = False          # set False unless you want a quick endpoint smoke test
SCORE = True                   # default; overridden in branches below
ATTEMPTS = 64

# TTT Mode: Controls Test-Time Training workflow
TTT_MODE = True

# Env-backed flags
IS_KAGGLE = bool(os.getenv("KAGGLE_KERNEL_RUN_TYPE"))
IS_RERUN  = IS_KAGGLE and os.getenv("KAGGLE_IS_COMPETITION_RERUN", "").lower() == "true"

# String env flag for external tools
os.environ["SUBMIT"] = "true"

# ---- Paths ----
if IS_KAGGLE:
    ARC_DATA_ROOT   = Path("/kaggle/input")
    MODEL_SAVE_DIR = Path("/kaggle/working")
    SUBMIT_DIR      = Path("/kaggle/working")
    ARC_PROGRAMS_PARQUET = SUBMIT_DIR
    MODEL_PATH = ARC_DATA_ROOT / "arc-1-fake-ttt-blended-c802-dataset"

    # Auto-find model path by searching two levels deep
    model_found = None
    for dataset_dir in ARC_DATA_ROOT.iterdir():
      if dataset_dir.is_dir():
          for model_dir in dataset_dir.iterdir():
              if model_dir.is_dir() and "Qwen" in model_dir.name:
                  model_found = model_dir
                  break
      if model_found:
          break
    
    if model_found:
      MODEL_PATH = model_found
      print(f"üîç Auto-found Kaggle model: {MODEL_PATH}")
    else:
      # Fallback to the original hardcoded path structure
      MODEL_PATH = ARC_DATA_ROOT / "arc-1-fake-ttt-blended-c802-dataset"
      print(f"‚ö†Ô∏è  Model not auto-found, using fallback: {MODEL_PATH}")
else:
    ARC_DATA_ROOT   = Path("/workspace/arc-agi-2025/data")
    MODEL_SAVE_DIR = Path("/workspace/arc-agi-2025/llm_python/fine-tuning")
    SUBMIT_DIR      = Path("/workspace/arc-agi-2025/llm_python/submissions")
    ARC_PROGRAMS_PARQUET = Path("/workspace/arc-agi-2025/llm_python/datasets/inference")

# Initialize fine-tuned model path (will be set after fine-tuning)
FINE_TUNED_MODEL_PATH = None

# Export envs for downstream processes
os.environ["ARC_DATA_ROOT"]   = str(ARC_DATA_ROOT)
os.environ["MODEL_SAVE_DIR"] = str(MODEL_SAVE_DIR)
os.environ["SUBMIT_DIR"]      = str(SUBMIT_DIR)
os.environ["ARC_PROGRAMS_PARQUET"] = str(ARC_PROGRAMS_PARQUET)
os.environ["MODEL_PATH"] = str(MODEL_PATH)

# Export TTT and config flags for subprocess use
os.environ["TTT_MODE"] = str(TTT_MODE).lower()
os.environ["IS_KAGGLE"] = str(IS_KAGGLE).lower()
os.environ["IS_RERUN"] = str(IS_RERUN).lower()
os.environ["DATASET"] = DATASET

# Ensure directories exist
for p in (MODEL_SAVE_DIR, SUBMIT_DIR):
    p.mkdir(parents=True, exist_ok=True)

# ---- Timeouts & mode tweaks ----
FULL_TIMEOUT = 3600*5 - 600 # ~5 hour timeout for inference

if IS_RERUN:
    # Kaggle competition rerun
    timeout_seconds = FULL_TIMEOUT
    print(f"üèÜ Competition rerun detected ‚Äî setting FULL {timeout_seconds}s timeout for ARC task runner")
    TEST_INFERENCE = False
    SCORE = False
    os.environ["SUBMIT"] = "true"

elif not IS_KAGGLE:
    # Runpod / local long run
    timeout_seconds = FULL_TIMEOUT
    print(f"üñ•Ô∏è Runpod/local long run ‚Äî setting FULL {timeout_seconds}s timeout for ARC task runner")
    if os.getenv("SUBMIT", "false").lower() == "true":
        SCORE = True  # if we're generating a submission, do scoring

else:
    # Kaggle dev/testing
    timeout_seconds = 60  # 1 minute
    print(f"üîß Development run ‚Äî setting short {timeout_seconds}s timeout for testing")
    # Safer default: don't auto-submit in dev
    os.environ["SUBMIT"] = "true"

# Export timeout
os.environ["GLOBAL_TIMEOUT"] = str(timeout_seconds)
print(f"‚è∞ Global timeout set to {timeout_seconds}s ({timeout_seconds/3600:.1f} hours)")

# TTT Mode configuration
if TTT_MODE:
    print("üß™ TTT (Test-Time Training) mode ENABLED")
    print("   ‚Üí Will run: First inference ‚Üí Fine-tuning ‚Üí Second inference")
else:
    print("üîÑ Standard mode (TTT disabled)")
    print("   ‚Üí Will run: First inference only")

# Optional: quick summary (helps avoid accidental submits)
print(
    "Mode summary ‚Üí "
    f"IS_KAGGLE={IS_KAGGLE} | IS_RERUN={IS_RERUN} | TTT_MODE={TTT_MODE} |\n"
    f"TEST_INFERENCE={TEST_INFERENCE} | SCORE={SCORE} | SUBMIT={os.environ['SUBMIT']} | MODEL_PATH={MODEL_PATH}"
)

üîç Auto-found Kaggle model: /kaggle/input/arc-1-fake-ttt-blended-c802-dataset/Qwen3-4B_ds-arc-agi-1-partial-100-c1542
üîß Development run ‚Äî setting short 60s timeout for testing
‚è∞ Global timeout set to 60s (0.0 hours)
üß™ TTT (Test-Time Training) mode ENABLED
   ‚Üí Will run: First inference ‚Üí Fine-tuning ‚Üí Second inference
Mode summary ‚Üí IS_KAGGLE=True | IS_RERUN=False | TTT_MODE=True |
TEST_INFERENCE=False | SCORE=True | SUBMIT=true | MODEL_PATH=/kaggle/input/arc-1-fake-ttt-blended-c802-dataset/Qwen3-4B_ds-arc-agi-1-partial-100-c1542


In [2]:
import sys
import torch
import numpy as np

print(f"Python version: {sys.version}")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA version (PyTorch): {torch.version.cuda}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"NumPy version: {np.__version__}")
if torch.cuda.is_available():
   print(f"GPU count: {torch.cuda.device_count()}")
   print(f"GPU name: {torch.cuda.get_device_name(0)}")

Python version: 3.11.13 (main, Jun  4 2025, 08:57:29) [GCC 11.4.0]
PyTorch version: 2.7.1+cu126
CUDA version (PyTorch): 12.6
CUDA available: True
NumPy version: 1.26.4
GPU count: 4
GPU name: NVIDIA L4


In [3]:
import sglang
print("SGLang version:", sglang.__version__)

try:
    import flashinfer
    print("FlashInfer version:", flashinfer.__version__)
except ImportError:
    print("FlashInfer not installed")

SGLang version: 0.4.9.post3
FlashInfer version: 0.2.7.post1


In [4]:
if IS_KAGGLE:
    import os, shutil, subprocess, stat
    
    # 1) Where to place binaries + cache
    WRK_BIN = "/kaggle/working/bin"
    TRITON_CACHE = "/kaggle/working/.triton"
    os.makedirs(WRK_BIN, exist_ok=True)
    os.makedirs(TRITON_CACHE, exist_ok=True)
    
    # 2) Preferred source for ptxas/cuobjdump/nvdisasm
    SYSTEM_CUDA_BIN = "/usr/local/cuda/bin"
    FALLBACK_VENDORED = "/kaggle/usr/lib/sglang_utility/triton/backends/nvidia/bin"  # if you have it
    
    def copy_tool(name: str):
        for src_dir in (SYSTEM_CUDA_BIN, FALLBACK_VENDORED):
            src = os.path.join(src_dir, name)
            if os.path.exists(src):
                dst = os.path.join(WRK_BIN, name)
                shutil.copy2(src, dst)
                # ensure executable bit
                os.chmod(dst, os.stat(dst).st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
                return dst
        raise FileNotFoundError(f"Could not find {name} in {SYSTEM_CUDA_BIN} or {FALLBACK_VENDORED}")
    
    ptxas_path = copy_tool("ptxas")
    try:
        cuobjdump_path = copy_tool("cuobjdump")
    except FileNotFoundError:
        cuobjdump_path = None  # optional
    try:
        nvdisasm_path = copy_tool("nvdisasm")
    except FileNotFoundError:
        nvdisasm_path = None  # optional
    
    # 3) Environment for Triton/JIT
    os.environ["TRITON_PTXAS_PATH"] = ptxas_path
    os.environ["PATH"] = f"{WRK_BIN}:{os.environ.get('PATH','')}"
    os.environ["TRITON_CACHE_DIR"] = TRITON_CACHE
    os.environ["CUDA_HOME"] = "/usr/local/cuda"
    os.environ["CUDA_PATH"] = "/usr/local/cuda"
    
    # Helpful fallbacks if you still hit capture issues:
    # os.environ["SGLANG_DISABLE_CUDA_GRAPH"] = "1"      # skip CUDA graphs (degrades perf but avoids capture)
    # os.environ["TRITON_CODEGEN_FATBIN"] = "0"          # can reduce Triton fatbin steps on some setups
    
    # 4) Smoke test: ensure ptxas runs from the new location
    print("ptxas ->", subprocess.check_output([ptxas_path, "--version"]).decode().strip())
    
    # Now it's safe to import heavy libs that trigger Triton
    import torch

ptxas -> ptxas: NVIDIA (R) Ptx optimizing assembler
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:14:54_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [5]:
if START_SERVER:
  # Background server launcher for Kaggle with SGLang
  import os, sys, time, subprocess, json, socket, requests

  # ---------- 1) Check for existing server and cleanup ----------
  PORT = 8080
  HEALTH_URL = f"http://127.0.0.1:{PORT}/v1/models"

  # Check if server already running
  try:
      r = requests.get(HEALTH_URL, timeout=3)
      if r.status_code == 200:
          print(f"Server already running on port {PORT}. Stopping it first...")
          # Kill existing sglang processes
          subprocess.run(["pkill", "-f", "sglang.launch_server"], capture_output=True)
          time.sleep(3)  # Wait for cleanup
  except:
      pass  # No server running

  # Clear CUDA memory before starting
  try:
      import torch
      if torch.cuda.is_available():
          torch.cuda.empty_cache()
          torch.cuda.synchronize()
          print("CUDA memory cleared.")
      num_gpus = torch.cuda.device_count()
  except Exception:
      num_gpus = 0

  if IS_KAGGLE:
      # Find the first directory inside ARC_DATA_ROOT for model path
      model_base_path = ARC_DATA_ROOT / "arc-1-fake-ttt-blended-c802-dataset"
      subdirs = [model_base_path / d for d in os.listdir(model_base_path) if (model_base_path / d).is_dir()]
      if not subdirs:
          raise RuntimeError(f"No model directory found in {model_base_path}")
      # Update MODEL_PATH for Kaggle environment (don't overwrite env variable)
      model_path_to_use = str(subdirs[0])   
      print(f"üîß Kaggle: Using model from {model_path_to_use}")
  else:
      model_path_to_use = str(MODEL_PATH)
      print(f"üîß Local: Using model from {model_path_to_use}")

  LOG = f"{SUBMIT_DIR}/sglang_server.log"
  print(f"LOG file path: {LOG}")

  SERVER_CMD = [
      sys.executable, "-m", "sglang.launch_server",
      "--host", "0.0.0.0",
      "--port", str(PORT),
      "--model-path", model_path_to_use,
      "--dp", str(max(1, min(num_gpus, 4))),
      "--kv-cache-dtype", "fp8_e4m3"
  ]

  # ---------- 2) Launch in background ----------
  log_f = open(LOG, "w")
  env = os.environ.copy()
  proc = subprocess.Popen(SERVER_CMD, stdout=log_f, stderr=subprocess.STDOUT, env=env, cwd=SUBMIT_DIR)
  print(f"Started sglang server PID={proc.pid} | logging to {LOG}")
  print("Command:", " ".join(SERVER_CMD))

  # ---------- 3) Wait for readiness ----------
  def wait_ready(url, timeout_s=180):
      t0 = time.time()
      while time.time() - t0 < timeout_s:
          try:
              r = requests.get(url, timeout=3)
              if r.status_code == 200:
                  return True
          except Exception:
              pass
          time.sleep(2)
      return False

  ready = wait_ready(HEALTH_URL)
  log_f.flush()

  if ready:
      print(f"sglang is READY on port {PORT}.")
      print(f"- Tail logs: !tail -n 50 {LOG}")
      print(f"- List models: !curl -s http://127.0.0.1:{PORT}/v1/models | jq .")
  else:
      print(f"sglang not ready after timeout. Showing last 60 log lines:")
      log_f.close()
      !tail -n 60 {LOG}

  # ---------- 4) Cleanup functions ----------
  def stop_server(p=proc):
      try:
          p.terminate()
          p.wait(timeout=10)
      except Exception:
          p.kill()
      print("Server stopped.")

  def full_cleanup(p=proc):
      # Stop server
      try:
          p.terminate()
          p.wait(timeout=10)
      except Exception:
          p.kill()

      # Also kill any lingering sglang processes
      subprocess.run(["pkill", "-f", "sglang.launch_server"], capture_output=True)

      # Clear CUDA memory
      try:
          import torch
          if torch.cuda.is_available():
              torch.cuda.empty_cache()
              torch.cuda.synchronize()
      except:
          pass

      print("Server stopped and CUDA memory cleared.")

  print("Call stop_server() or full_cleanup() to shut it down gracefully.")

CUDA memory cleared.
üîß Kaggle: Using model from /kaggle/input/arc-1-fake-ttt-blended-c802-dataset/Qwen3-4B_ds-arc-agi-1-partial-100-c1542
LOG file path: /kaggle/working/sglang_server.log
Started sglang server PID=3823 | logging to /kaggle/working/sglang_server.log
Command: /usr/bin/python3 -m sglang.launch_server --host 0.0.0.0 --port 8080 --model-path /kaggle/input/arc-1-fake-ttt-blended-c802-dataset/Qwen3-4B_ds-arc-agi-1-partial-100-c1542 --dp 4 --kv-cache-dtype fp8_e4m3
sglang is READY on port 8080.
- Tail logs: !tail -n 50 /kaggle/working/sglang_server.log
- List models: !curl -s http://127.0.0.1:8080/v1/models | jq .
Call stop_server() or full_cleanup() to shut it down gracefully.


In [6]:
if START_SERVER:
    import requests
    import time
    
    def check_models():
        url = "http://127.0.0.1:8080/v1/models"
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            result = response.json()
    
            print("‚úÖ Server is responding!")
            print("Available models:")
            for model in result['data']:
                print(f"  - {model['id']}")
    
            return result['data'][0]['id'] if result['data'] else None
    
        except requests.exceptions.ConnectionError:
            print("‚ùå Connection failed - server may not be ready yet")
            return None
        except Exception as e:
            print(f"‚ùå Error: {e}")
            return None
    
    # Poll every 30 seconds until we get a model
    model_name = None
    while not model_name:
        model_name = check_models()
        if not model_name:
            print("‚è≥ Waiting 30 seconds before retrying...")
            time.sleep(30)
    
    print(f"\n‚úÖ Found model: {model_name}")

‚úÖ Server is responding!
Available models:
  - /kaggle/input/arc-1-fake-ttt-blended-c802-dataset/Qwen3-4B_ds-arc-agi-1-partial-100-c1542

‚úÖ Found model: /kaggle/input/arc-1-fake-ttt-blended-c802-dataset/Qwen3-4B_ds-arc-agi-1-partial-100-c1542


In [7]:
if TEST_INFERENCE:
    import time
    import requests
    
    url = "http://127.0.0.1:8080/v1/chat/completions"
    
    headers = {
        "Content-Type": "application/json"
    }
    
    messages = [
        {"role" : "system", "content" : "You are an expert at solving abstract reasoning puzzles. Write clean, efficient Python code."},
        {"role" : "user", "content" : "You are solving an ARC (Abstraction and Reasoning Corpus) task. \nI will show you training examples with input and output grids, plus a test input grid. Your task is to:\n\n1. **Analyze the training examples** to discover patterns that map input grids to output grids\n2. **Write a Python program** that implements your best understanding of the transformation  \n3. **DO NOT predict or generate the test output** - your job is only to write the transformation program\n4. **Attempt a solution** - even if the pattern isn't completely clear, provide your best hypothesis\n5. **Do not repeat the same transformation** - if you have already tried a transformation, do not repeat it.\n\n**IMPORTANT: Your transformation must always produce a 10\u00d710 output grid.**\n\nThe test input is shown for context so you understand what type of grid your program will eventually process. Focus on learning patterns from training examples and writing code that captures your understanding.\n\nTraining Examples:\n\nExample 1:\nInput:\n5 0 0 5 0 0 0 5 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\nOutput:\n5 0 0 5 0 0 0 5 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n2 0 0 2 0 0 0 2 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n2 0 0 2 0 0 0 2 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n\nExample 2:\nInput:\n0 5 0 5 5 0 0 5 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\nOutput:\n0 5 0 5 5 0 0 5 0 0\n0 0 0 0 0 0 0 0 0 0\n0 2 0 2 2 0 0 2 0 5\n0 0 0 0 0 0 0 0 0 0\n0 2 0 2 2 0 0 2 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 2 0 2 2 0 0 2 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n\nExample 3:\nInput:\n0 0 5 5 0 5 0 5 5 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\nOutput:\n0 0 5 5 0 5 0 5 5 0\n0 0 0 0 0 0 0 0 0 0\n0 0 2 2 0 2 0 2 2 5\n0 0 2 2 0 2 0 2 2 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 2 2 0 2 0 2 2 5\n0 0 0 0 0 0 0 0 0 0\n0 0 2 2 0 2 0 2 2 5\n0 0 0 0 0 0 0 0 0 0\n\nTest Input:\n5 0 5 5 0 0 5 0 5 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n\nAnalyze the patterns in the training examples and write a Python function that performs this transformation.\n\n**Approach Guidelines:**\n- Look for patterns in shapes, colors, positions, sizes, rotations, reflections, etc.\n- Even if you can't solve all training examples perfectly, implement what patterns you do observe\n- A partial solution that captures some aspects is better than returning the input unchanged\n- If the pattern is unclear, make your best educated guess based on what you can see\n\nRequirements:\n- The function takes a 2D list (grid) where grid[row][col] gives the value at that position\n- Values are integers from 0-9\n- Return a new grid (2D list) with the transformation applied\n- You can use numpy if needed - just add 'import numpy as np' at the start of your function\n- Aim to handle the training examples as well as possible, even if not perfectly\n- Your function should attempt some meaningful transformation based on the patterns you observe\n\nYou MUST end your response with the following exact format:\n\nFinal answer:\n```python\ndef transform(grid):\n    # Your transformation logic here (implement your best understanding)\n    return transformed_grid\n```\n"}
    ]
    
    payload = {
        "model": model_name,  # from your polling loop
        "messages": messages,
        # "max_tokens": 1000
        "max_tokens": 10
    }
    
    start_time = time.time()
    response = requests.post(url, headers=headers, json=payload, timeout=600)
    end_time = time.time()
    
    response.raise_for_status()
    result = response.json()
    output_text = result["choices"][0]["message"]["content"]
    
    # Estimate token count (4 chars/token assumption)
    estimated_tokens = len(output_text) / 4
    elapsed_time = end_time - start_time
    tokens_per_second = estimated_tokens / elapsed_time
    
    print("‚úÖ Response received:")
    print(output_text)
    print(f"\n‚è± Elapsed time: {elapsed_time:.2f} seconds")
    print(f"üî¢ Estimated tokens: {estimated_tokens:.1f}")
    print(f"‚ö° Output tokens/sec: {tokens_per_second:.2f}")

In [8]:
if not IS_KAGGLE:
    %cd /workspace/arc-agi-2025

# Derive attempts/workers for the two modes
MAX_ATTEMPTS = ATTEMPTS if (IS_RERUN or not IS_KAGGLE) else 8
MAX_WORKERS  = 16

# SUBSET = "test" # defaulting to test to ensure there are no loading issues.

# can use this instead if testing evaluation during a pre-run
SUBSET = "test" if IS_RERUN else "evaluation"

# Common env for your runner
os.environ["OPENAI_API_KEY"] = "EMPTY"

print(f"Mode: {'competition' if IS_RERUN else 'dev'} | attempts={MAX_ATTEMPTS} | workers={MAX_WORKERS} | subset={SUBSET}")

# Build the command
cmd_args = [
    "uv", "run", "python", "-u", "-m", "llm_python.run_arc_tasks_soar",
    "--dataset", DATASET,
    "--subset", SUBSET,
    "--max_workers", str(MAX_WORKERS),
    "--max_attempts", str(MAX_ATTEMPTS),
    "--model", model_name,
    "--base-url", "http://127.0.0.1:8080/v1",
    "--unsafe-executor",
    "--max-tokens", "2000",
    "--qwen-no-think"
]


# Add parquet output directory if set
if os.getenv("ARC_PROGRAMS_PARQUET"):
  cmd_args.extend(["--parquet-output-dir", os.getenv("ARC_PROGRAMS_PARQUET")])

print(f"Running command: {' '.join(cmd_args)}")

# Handle output redirection properly
if IS_RERUN or not IS_KAGGLE:
    # For quiet mode, redirect to file using subprocess
    import subprocess
    log_file_path = f"{SUBMIT_DIR}/run.log"
    print(f"üìù Logging output to: {log_file_path}")
    
    with open(log_file_path, "w") as log_file:
        process = subprocess.Popen(
            cmd_args,
            stdout=log_file,
            stderr=subprocess.STDOUT,
            text=True,
            cwd=os.getcwd()
        )
        
        # Wait for completion
        print("‚è≥ Running tasks (output being written to log file)...")
        return_code = process.wait()
        
    if return_code == 0:
        print(f"‚úÖ Task runner completed successfully. Check {log_file_path} for details.")
    else:
        print(f"‚ùå Task runner failed with return code {return_code}")
        print(f"üìù Check {log_file_path} for error details")
        # Show last few lines of log
        !tail -n 20 {log_file_path}
else:
    # For interactive mode, show output directly
    cmd = " ".join(cmd_args)
    print(f"Running: {cmd}\n")
    !{cmd}

Mode: dev | attempts=8 | workers=16 | subset=evaluation
Running command: uv run python -u -m llm_python.run_arc_tasks_soar --dataset arc-prize-2025 --subset evaluation --max_workers 16 --max_attempts 8 --model /kaggle/input/arc-1-fake-ttt-blended-c802-dataset/Qwen3-4B_ds-arc-agi-1-partial-100-c1542 --base-url http://127.0.0.1:8080/v1 --unsafe-executor --max-tokens 2000 --qwen-no-think --parquet-output-dir /kaggle/working
Running: uv run python -u -m llm_python.run_arc_tasks_soar --dataset arc-prize-2025 --subset evaluation --max_workers 16 --max_attempts 8 --model /kaggle/input/arc-1-fake-ttt-blended-c802-dataset/Qwen3-4B_ds-arc-agi-1-partial-100-c1542 --base-url http://127.0.0.1:8080/v1 --unsafe-executor --max-tokens 2000 --qwen-no-think --parquet-output-dir /kaggle/working

‚è∞ Global timeout set to 60s via GLOBAL_TIMEOUT environment variable
‚è∞ API timeout: 300s (network safety only, no infrastructure timeouts)
üóÑÔ∏è Sampled programs will be logged to /kaggle/working/20250825_130

In [9]:
print(f"MODEL_PATH type: {type(MODEL_PATH)}, value: {MODEL_PATH}")

MODEL_PATH type: <class 'pathlib.PosixPath'>, value: /kaggle/input/arc-1-fake-ttt-blended-c802-dataset/Qwen3-4B_ds-arc-agi-1-partial-100-c1542


In [10]:
# Fine-tuning Integration - TTT Mode Only
# Only runs when TTT_MODE=true

import time
import sys
import requests
import subprocess
from pathlib import Path

# Enable fine-tuning only in TTT mode
ENABLE_FINE_TUNING = TTT_MODE

if ENABLE_FINE_TUNING:
    print("üî¨ Fine-tuning enabled - TTT mode detected, will fine-tune model on non-transductive programs")
    
    # Hub push control: Kaggle=false, Non-Kaggle=true
    PUSH_TO_HUB = not IS_KAGGLE
    print(f"üì§ Hub push setting: {'ENABLED' if PUSH_TO_HUB else 'DISABLED'} (Kaggle={IS_KAGGLE})")
    
    # Set environment variables for fine-tuning
    fine_tuning_env = {
        'MODEL_SLUG': str(MODEL_PATH),
        'FINE_TUNING_MODE': 'final_only',     # TTT mode uses final_only
        'DATA_SOURCE': 'parquet',             # Load from parquet files
        'ARC_PROGRAMS_PARQUET': str(ARC_PROGRAMS_PARQUET),  # Parquet directory path
        'MODEL_SAVE_DIR': str(MODEL_SAVE_DIR), # Where to save fine-tuned model
        'PUSH_TO_HUB': str(PUSH_TO_HUB).lower(),  # Hub push control
    }
    
    print("üõ†Ô∏è Fine-tuning configuration:")
    for key, value in fine_tuning_env.items():
        print(f"   {key}: {value}")
        os.environ[key] = value
    
    # Stop the current server to free up GPU memory
    if 'proc' in locals():
        print("üõë Stopping inference server to free GPU memory for fine-tuning...")
        try:
            proc.terminate()
            proc.wait(timeout=10)
        except:
            proc.kill()
        
        # Clear CUDA memory
        try:
            import torch
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                torch.cuda.synchronize()
                print("‚úÖ CUDA memory cleared")
        except:
            pass
    
    # Ensure we're in the right directory
    original_cwd = os.getcwd()
    if not IS_KAGGLE:
      os.chdir("/workspace/arc-agi-2025")
    
    # Set up logging
    import datetime
    log_dir = Path(os.environ.get("SUBMIT_DIR", "logs"))
    log_dir.mkdir(exist_ok=True)
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file = log_dir / f"fine_tuning_{timestamp}.log"
    
    def log_and_print(message, log_file_handle=None):
      """Write to both console and log file"""
      print(message)
      if log_file_handle:
          log_file_handle.write(message + "\n")
          log_file_handle.flush()
    
    try:
      with open(log_file, 'w') as f:
          log_and_print(f"üìù Logging to: {log_file}", f)
    
          if not IS_KAGGLE:
              # Step 1: Convert notebook to script
              log_and_print("üîÑ Converting notebook to script...", f)
              convert_cmd = [
                  "uv", "run", "python",
                  "llm_python/fine-tuning/notebook_to_script.py",
                  "llm_python/fine-tuning/unsloth_arc_finetuning_soar.ipynb"
              ]
    
              convert_result = subprocess.run(convert_cmd,
                                            capture_output=True,
                                            text=True,
                                            timeout=60)
    
              # Log full output
              f.write("=== CONVERSION OUTPUT ===\n")
              f.write(f"Return code: {convert_result.returncode}\n")
              f.write(f"STDOUT:\n{convert_result.stdout}\n")
              f.write(f"STDERR:\n{convert_result.stderr}\n")
              f.write("========================\n\n")
              f.flush()
    
              if convert_result.returncode != 0:
                  log_and_print(f"‚ùå Notebook conversion failed: {convert_result.stderr}", f)
                  raise Exception("Notebook conversion failed")
    
              log_and_print("‚úÖ Notebook converted successfully", f)
    
          # Step 2: Run the actual fine-tuning
          log_and_print("üöÄ Starting fine-tuning...", f)
          fine_tuning_cmd = [
            "uv", "run", "python", "-u", "-m",
            "llm_python.fine-tuning.unsloth_arc_finetuning_soar",
            "--config", "llm_python/fine-tuning/config.yaml"
          ]
    
          log_and_print(f"Running command: {' '.join(fine_tuning_cmd)}", f)
    
          # Run with real-time output
          process = subprocess.Popen(
              fine_tuning_cmd,
              stdout=subprocess.PIPE,
              stderr=subprocess.STDOUT,
              text=True,
              bufsize=1
          )
    
          f.write("=== FINE-TUNING OUTPUT ===\n")
          f.flush()
    
          # Stream output to both console and file
          for line in process.stdout:
              print(line, end='')  # Show in console
              f.write(line)  # Save to file
              f.flush()
    
          # Wait for completion
          return_code = process.wait(timeout=6400)  # 2 hour timeout
    
          f.write(f"\n=== PROCESS COMPLETED WITH CODE: {return_code} ===\n")
          f.flush()
    
          if return_code == 0:
              log_and_print("‚úÖ Fine-tuning completed successfully!", f)
    
              # Find the fine-tuned model
              fine_tuned_models = list(Path(MODEL_SAVE_DIR).glob("*-final"))
              if fine_tuned_models:
                  new_model_path = fine_tuned_models[0]
                  log_and_print(f"üéØ Fine-tuned model saved at: {new_model_path}", f)
    
                  # Set the fine-tuned model path (don't overwrite original MODEL_PATH)
                  global FINE_TUNED_MODEL_PATH
                  FINE_TUNED_MODEL_PATH = str(new_model_path)
                  log_and_print(f"üîÑ Set FINE_TUNED_MODEL_PATH: {FINE_TUNED_MODEL_PATH}", f)
              else:
                  log_and_print("‚ö†Ô∏è  Fine-tuned model not found, will use original model", f)
          else:
              log_and_print(f"‚ùå Fine-tuning failed with return code {return_code}", f)
              log_and_print("üîÑ Will use original model...", f)
    
          log_and_print(f"üìÑ Full logs saved to: {log_file}", f)
            
    except subprocess.TimeoutExpired:
        print("‚è±Ô∏è Fine-tuning timed out after 2 hours, will use original model")
    except Exception as e:
        print(f"‚ùå Fine-tuning error: {e}")
        print("üîÑ Will use original model...")
    finally:
        # Restore original directory
        os.chdir(original_cwd)
    
    # Clean up environment variables
    for key in fine_tuning_env:
        os.environ.pop(key, None)
    
else:
    print("üîÑ Fine-tuning disabled - TTT mode not enabled")
    if TTT_MODE:
        print("   (TTT_MODE=true detected, but ENABLE_FINE_TUNING override disabled fine-tuning)")
    else:
        print(f"   (Set TTT_MODE=true to enable Test-Time Training workflow)")
    print(f"   Will use pre-loaded model: {MODEL_PATH}")

üî¨ Fine-tuning enabled - TTT mode detected, will fine-tune model on non-transductive programs
üì§ Hub push setting: DISABLED (Kaggle=True)
üõ†Ô∏è Fine-tuning configuration:
   MODEL_SLUG: /kaggle/input/arc-1-fake-ttt-blended-c802-dataset/Qwen3-4B_ds-arc-agi-1-partial-100-c1542
   FINE_TUNING_MODE: final_only
   DATA_SOURCE: parquet
   ARC_PROGRAMS_PARQUET: /kaggle/working
   MODEL_SAVE_DIR: /kaggle/working
   PUSH_TO_HUB: false
üõë Stopping inference server to free GPU memory for fine-tuning...
‚úÖ CUDA memory cleared
üìù Logging to: /kaggle/working/fine_tuning_20250825_130553.log
üöÄ Starting fine-tuning...
Running command: uv run python -u -m llm_python.fine-tuning.unsloth_arc_finetuning_soar --config llm_python/fine-tuning/config.yaml
Config file llm_python/fine-tuning/config.yaml not found! Using default values.
üìä Data source: parquet (/kaggle/working)
Config loaded:
  config_path: llm_python/fine-tuning/config.yaml
  test_run: False
  execution_mode: final_only
  data_so

In [11]:
!ls /kaggle/working

20250825_130428__kaggle_input_arc-1-fake-ttt-blended-c802-dataset_Qwen3-4B_ds-arc-agi-1-partial-100-c1542_arc-prize-2025_evaluation.parquet
bin
fine_tuning_20250825_121545.log
fine_tuning_20250825_121851.log
fine_tuning_20250825_121952.log
fine_tuning_20250825_122001.log
fine_tuning_20250825_123210.log
fine_tuning_20250825_123409.log
fine_tuning_20250825_123728.log
fine_tuning_20250825_123759.log
fine_tuning_20250825_130553.log
sglang_server.log
unsloth_compiled_cache


In [13]:
if ENABLE_FINE_TUNING:
  # Restart the server with the (potentially) new model
  if START_SERVER:
      print("üîÑ Restarting inference server with updated model...")

      # Gracefully stop existing server if it exists
      if 'proc' in locals() and proc.poll() is None:  # Check if process is still running
          print("üõë Gracefully stopping existing server...")
          try:
              proc.terminate()  # Send SIGTERM first
              proc.wait(timeout=30)  # Wait up to 30 seconds for graceful shutdown
              print("‚úÖ Server stopped gracefully")
          except subprocess.TimeoutExpired:
              print("‚ö†Ô∏è  Server didn't stop gracefully, force killing...")
              proc.kill()
              proc.wait()
          except Exception as e:
              print(f"‚ö†Ô∏è  Error stopping server: {e}")

      # Wait a bit longer after graceful shutdown
      time.sleep(5)

      # Clear CUDA memory
      try:
          import torch
          if torch.cuda.is_available():
              torch.cuda.empty_cache()
              torch.cuda.synchronize()
              print("‚úÖ CUDA memory cleared")
      except Exception:
          pass

      # Get GPU count
      try:
          import torch
          num_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 1
      except:
          num_gpus = 1

      # Choose which model to use: fine-tuned if available, otherwise original
      model_to_use = FINE_TUNED_MODEL_PATH if FINE_TUNED_MODEL_PATH else MODEL_PATH
      print(f"üéØ Using model: {model_to_use}")
      print(f"   ‚Üí {'Fine-tuned' if FINE_TUNED_MODEL_PATH else 'Original'} model")

      # Restart server with appropriate model
      PORT = 8080
      LOG = f"{SUBMIT_DIR}/sglang_server.log"
      SERVER_CMD = [
          sys.executable, "-m", "sglang.launch_server",
          "--host", "0.0.0.0",
          "--port", str(PORT),
          "--model-path", str(model_to_use),
          "--dp", str(max(1, min(num_gpus, 4))),
          "--kv-cache-dtype", "fp8_e4m3"
      ]

      print(f"üöÄ Starting server: {' '.join(SERVER_CMD)}")

      log_f = open(LOG, "a")
      proc = subprocess.Popen(SERVER_CMD, stdout=log_f, stderr=subprocess.STDOUT,
                             env=os.environ.copy(), cwd=SUBMIT_DIR)

      print(f"‚úÖ Server started with PID={proc.pid}")

      # Wait for readiness with better error handling
      def wait_ready(url, timeout_s=600):
          t0 = time.time()
          while time.time() - t0 < timeout_s:
              try:
                  r = requests.get(url, timeout=5)
                  if r.status_code == 200:
                      return True
              except Exception:
                  pass
              time.sleep(3)  # Check less frequently
          return False

      HEALTH_URL = f"http://127.0.0.1:{PORT}/v1/models"
      if wait_ready(HEALTH_URL):
          print("‚úÖ Server ready!")

          # Update model_name
          try:
              response = requests.get(HEALTH_URL)
              if response.status_code == 200:
                  models = response.json()['data']
                  if models:
                      model_name = models[0]['id']
                      print(f"üéØ Model: {model_name}")
          except Exception as e:
              print(f"‚ö†Ô∏è  Could not get model name: {e}")
      else:
          print("‚ùå Server failed to start properly")

üîÑ Restarting inference server with updated model...
‚úÖ CUDA memory cleared
üéØ Using model: /kaggle/input/arc-1-fake-ttt-blended-c802-dataset/Qwen3-4B_ds-arc-agi-1-partial-100-c1542
   ‚Üí Original model
üöÄ Starting server: /usr/bin/python3 -m sglang.launch_server --host 0.0.0.0 --port 8080 --model-path /kaggle/input/arc-1-fake-ttt-blended-c802-dataset/Qwen3-4B_ds-arc-agi-1-partial-100-c1542 --dp 4 --kv-cache-dtype fp8_e4m3
‚úÖ Server started with PID=5950
‚úÖ Server ready!
üéØ Model: /kaggle/input/arc-1-fake-ttt-blended-c802-dataset/Qwen3-4B_ds-arc-agi-1-partial-100-c1542


In [14]:
# Second Inference Run - TTT Mode Only
# Only runs when TTT_MODE=true (after fine-tuning)

if TTT_MODE:
    print("üîÑ Running SECOND inference with fine-tuned model (TTT mode)")
    
    if not IS_KAGGLE:
        %cd /workspace/arc-agi-2025

    # Derive attempts/workers for the two modes
    MAX_ATTEMPTS = ATTEMPTS if (IS_RERUN or not IS_KAGGLE) else 8
    MAX_WORKERS  = 16

    # SUBSET = "test" # defaulting to test to ensure there are no loading issues.

    # can use this instead if testing evaluation during a pre-run
    SUBSET = "test" if IS_RERUN else "evaluation"

    # Common env for your runner
    os.environ["OPENAI_API_KEY"] = "EMPTY"

    print(f"TTT Second Run ‚Üí {'competition' if IS_RERUN else 'dev'} | attempts={MAX_ATTEMPTS} | workers={MAX_WORKERS} | subset={SUBSET}")

    # Build the command
    cmd_args = [
        "uv", "run", "python", "-u", "-m", "llm_python.run_arc_tasks_soar",
        "--dataset", DATASET,
        "--subset", SUBSET,
        "--max_workers", str(MAX_WORKERS),
        "--max_attempts", str(MAX_ATTEMPTS),
        "--model", model_name,
        "--base-url", "http://127.0.0.1:8080/v1",
        "--unsafe-executor",
        "--max-tokens", "2000",
        "--qwen-no-think"
    ]

    # Add parquet output directory if set
    if os.getenv("ARC_PROGRAMS_PARQUET"):
      cmd_args.extend(["--parquet-output-dir", os.getenv("ARC_PROGRAMS_PARQUET")])

    print(f"Running TTT second inference: {' '.join(cmd_args)}")

    # Handle output redirection properly
    if IS_RERUN or not IS_KAGGLE:
        # For quiet mode, redirect to file using subprocess
        import subprocess
        log_file_path = f"{SUBMIT_DIR}/run_ttt_second.log"
        print(f"üìù Logging TTT second run output to: {log_file_path}")
        
        with open(log_file_path, "w") as log_file:
            process = subprocess.Popen(
                cmd_args,
                stdout=log_file,
                stderr=subprocess.STDOUT,
                text=True,
                cwd=os.getcwd()
            )
            
            # Wait for completion
            print("‚è≥ Running TTT second inference (output being written to log file)...")
            return_code = process.wait()
            
        if return_code == 0:
            print(f"‚úÖ TTT second inference completed successfully. Check {log_file_path} for details.")
        else:
            print(f"‚ùå TTT second inference failed with return code {return_code}")
            print(f"üìù Check {log_file_path} for error details")
            # Show last few lines of log
            !tail -n 20 {log_file_path}
    else:
        # For interactive mode, show output directly
        cmd = " ".join(cmd_args)
        print(f"Running TTT second inference: {cmd}\n")
        !{cmd}

else:
    print("üîÑ Skipping second inference (TTT_MODE=false)")
    print("   ‚Üí Standard mode runs first inference only")

üîÑ Running SECOND inference with fine-tuned model (TTT mode)
TTT Second Run ‚Üí dev | attempts=8 | workers=16 | subset=evaluation
Running TTT second inference: uv run python -u -m llm_python.run_arc_tasks_soar --dataset arc-prize-2025 --subset evaluation --max_workers 16 --max_attempts 8 --model /kaggle/input/arc-1-fake-ttt-blended-c802-dataset/Qwen3-4B_ds-arc-agi-1-partial-100-c1542 --base-url http://127.0.0.1:8080/v1 --unsafe-executor --max-tokens 2000 --qwen-no-think
Running TTT second inference: uv run python -u -m llm_python.run_arc_tasks_soar --dataset arc-prize-2025 --subset evaluation --max_workers 16 --max_attempts 8 --model /kaggle/input/arc-1-fake-ttt-blended-c802-dataset/Qwen3-4B_ds-arc-agi-1-partial-100-c1542 --base-url http://127.0.0.1:8080/v1 --unsafe-executor --max-tokens 2000 --qwen-no-think

‚è∞ Global timeout set to 60s via GLOBAL_TIMEOUT environment variable
‚è∞ API timeout: 300s (network safety only, no infrastructure timeouts)
üóÑÔ∏è Sampled programs will be lo

In [18]:
# Generate submission using the two most recent parquet files
if os.environ.get("SUBMIT", "false").lower() == "true":
    print("üéØ Generating submission from the two most recent parquet files...")
    
    import subprocess
    
    # Set up paths - parquet files are saved by task runner in different locations
    if IS_KAGGLE:
        # On Kaggle, parquet files are saved directly in /kaggle/working by task runner
        inference_dir = "/kaggle/working"
    else:
        # On RunPod/local, parquet files are saved in llm_python/datasets/inference
        inference_dir = "llm_python/datasets/inference"
    
    output_dir = str(SUBMIT_DIR)
    
    # Command to generate submission using the two most recent parquet files
    submission_cmd = [
        "uv", "run", "python", "-m", "llm_python.generate_submission",
        "--parquet-path", inference_dir,
        "--n-files", "2",
        "--dataset", DATASET,
        "--subset", SUBSET,
        "--output-dir", output_dir,
        "--debug"
    ]
    
    print(f"Running submission generation: {' '.join(submission_cmd)}")
    print(f"üìÇ Looking for parquet files in: {inference_dir}")
    
    try:
        result = subprocess.run(
            submission_cmd,
            capture_output=True,
            text=True,
            timeout=300,  # 5 minute timeout
            cwd=os.getcwd()
        )
        
        if result.returncode == 0:
            print("‚úÖ Submission generation completed successfully!")
            print(result.stdout)
            
            # Update submit_dir to point to the generated file
            submit_dir = f"{output_dir}/submission.json"
            print(f"üìÅ Submission file: {submit_dir}")
        else:
            print(f"‚ùå Submission generation failed with return code {result.returncode}")
            print(f"STDOUT: {result.stdout}")
            print(f"STDERR: {result.stderr}")
            # Fallback to default submission path
            submit_dir = f"{SUBMIT_DIR}/submission.json"
            
    except subprocess.TimeoutExpired:
        print("‚è±Ô∏è Submission generation timed out")
        submit_dir = f"{SUBMIT_DIR}/submission.json"
    except Exception as e:
        print(f"‚ùå Submission generation error: {e}")
        submit_dir = f"{SUBMIT_DIR}/submission.json"
else:
    print("üìù Skipping submission generation (SUBMIT=false)")
    submit_dir = f"{SUBMIT_DIR}/submission.json"

üéØ Generating submission from the two most recent parquet files...
Running submission generation: uv run python -m llm_python.generate_submission --parquet-path /kaggle/working --n-files 2 --dataset arc-prize-2025 --subset evaluation --output-dir /kaggle/working --debug
üìÇ Looking for parquet files in: /kaggle/working
‚úÖ Submission generation completed successfully!
üîç Selected 1 most recent parquet files from /kaggle/working:
  ‚Ä¢ 20250825_130428__kaggle_input_arc-1-fake-ttt-blended-c802-dataset_Qwen3-4B_ds-arc-agi-1-partial-100-c1542_arc-prize-2025_evaluation.parquet (modified: 2025-08-25 13:05:49)
‚úÖ Loaded 49 rows from /kaggle/working/20250825_130428__kaggle_input_arc-1-fake-ttt-blended-c802-dataset_Qwen3-4B_ds-arc-agi-1-partial-100-c1542_arc-prize-2025_evaluation.parquet
üìä Combined data: 49 total rows from 1 files
üéØ Generating submission for 120 tasks from arc-prize-2025/evaluation
‚ö†Ô∏è No attempts for task 0934a4d8, using empty fallback
‚ö†Ô∏è No attempts for tas

In [19]:
# Only score in dev/commit runs
if SCORE and not IS_RERUN:
    !uv run python -m llm_python.score_submission --submission {submit_dir} --dataset {DATASET} --subset {SUBSET}
else:
    print("Skipping local scoring (competition rerun or SCORE=False).")

üîç Validating submission file: /kaggle/working/submission.json

üîç VALIDATING SUBMISSION: /kaggle/working/submission.json
üìä Validation Results:
  Total tasks: 120
  Total predictions: 172
  Empty predictions ([[0,0],[0,0]]): 314
‚úÖ VALIDATION PASSED - No structural errors found
üéØ Submission file is ready for competition!
üìÇ Loading submission: /kaggle/working/submission.json
üîç Scoring against arc-prize-2025/evaluation
SUBMISSION SCORING RESULTS
Dataset: arc-prize-2025
Subset: evaluation
Reference tasks: 120
Tasks scored: 120
Total predictions: 344

üìä PREDICTION-LEVEL METRICS:
  Pass@1 (first attempt): 0/344 (0.0%)
  Pass@2 (either attempt): 0/344 (0.0%)

üìä TASK-LEVEL METRICS:
  Tasks Pass@1 (all outputs correct on first attempt): 0/120 (0.0%)
  Tasks Pass@2 (all outputs correct on either attempt): 0/120 (0.0%)


In [20]:
# Final cleanup - stop server and free resources
if START_SERVER and 'full_cleanup' in globals():
    print("üßπ Cleaning up server and resources...")
    full_cleanup()
else:
    print("üîç No server cleanup needed (START_SERVER=False or cleanup function not available)")

üßπ Cleaning up server and resources...
Server stopped and CUDA memory cleared.
