In [1]:
import os
from pathlib import Path

# Default model: a Hugging Face repo id. On Kaggle it is replaced below by a
# path under the attached input datasets.
# TODO: provide a way to point this at a local checkpoint when needed.
MODEL_PATH = "Trelis/Qwen3-4B_ds-arc-agi-1-partial-100-c1542"
DATASET = "arc-prize-2024"

# ---- Config flags (single source of truth) ----
START_SERVER = True       # launch the inference server from this notebook
TEST_INFERENCE = False    # True runs a quick endpoint smoke test
SCORE = True              # turned off below for competition reruns

# ---- Environment detection ----
IS_KAGGLE = bool(os.getenv("KAGGLE_KERNEL_RUN_TYPE"))
IS_RERUN = IS_KAGGLE and os.getenv("KAGGLE_IS_COMPETITION_RERUN", "").lower() == "true"

# String flag read by external tools. It is set unconditionally, so any value
# already present in the environment is overwritten.
os.environ["SUBMIT"] = "true"

# ---- Paths ----
if not IS_KAGGLE:
    ARC_DATA_ROOT = Path("/workspace/arc-agi-2025/data")
    MODEL_SAVE_DIR = Path("/workspace/arc-agi-2025/llm_python/fine-tuning")
    SUBMIT_DIR = Path("/workspace/arc-agi-2025/llm_python/submissions")
    ARC_PROGRAMS_DB = SUBMIT_DIR / "local.db"
else:
    ARC_DATA_ROOT = Path("/kaggle/input")
    MODEL_SAVE_DIR = Path("/kaggle/working")
    SUBMIT_DIR = Path("/kaggle/working")
    ARC_PROGRAMS_DB = MODEL_SAVE_DIR / "local.db"
    MODEL_PATH = ARC_DATA_ROOT / "arc-1-fake-ttt-blended-c802-dataset"

# Publish the paths so child processes can pick them up from the environment.
os.environ.update(
    {
        "ARC_DATA_ROOT": str(ARC_DATA_ROOT),
        "MODEL_SAVE_DIR": str(MODEL_SAVE_DIR),
        "SUBMIT_DIR": str(SUBMIT_DIR),
        "ARC_PROGRAMS_DB": str(ARC_PROGRAMS_DB),
    }
)

# Output directories must exist before anything writes to them.
for out_dir in (MODEL_SAVE_DIR, SUBMIT_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

# ---- Timeouts & mode tweaks ----
FULL_TIMEOUT = 5 * 3600 - 600  # five hours minus a ten-minute margin (17400 s)

if IS_RERUN:
    # Kaggle competition rerun: full budget, no smoke test, no local scoring.
    timeout_seconds = FULL_TIMEOUT
    print(f"üèÜ Competition rerun detected ‚Äî setting FULL {timeout_seconds}s timeout for ARC task runner")
    TEST_INFERENCE = False
    SCORE = False
    os.environ["SUBMIT"] = "true"

elif IS_KAGGLE:
    # Kaggle dev/testing: a one-minute budget keeps interactive runs short.
    timeout_seconds = 60
    print(f"üîß Development run ‚Äî setting short {timeout_seconds}s timeout for testing")
    # NOTE: SUBMIT is left at "true" here as well, so dev runs still produce
    # a submission.
    os.environ["SUBMIT"] = "true"

else:
    # Runpod / local long run: full budget.
    timeout_seconds = FULL_TIMEOUT
    print(f"üñ•Ô∏è Runpod/local long run ‚Äî setting FULL {timeout_seconds}s timeout for ARC task runner")
    # Score whenever a submission is being generated.
    if os.getenv("SUBMIT", "false").lower() == "true":
        SCORE = True

# Publish the timeout for downstream processes.
os.environ["GLOBAL_TIMEOUT"] = str(timeout_seconds)
print(f"‚è∞ Global timeout set to {timeout_seconds}s ({timeout_seconds/3600:.1f} hours)")

# One-glance summary of the effective mode (helps avoid accidental submits).
summary_lines = (
    f"Mode summary ‚Üí IS_KAGGLE={IS_KAGGLE} | IS_RERUN={IS_RERUN} | TEST_INFERENCE={TEST_INFERENCE} |",
    f"SCORE={SCORE} | SUBMIT={os.environ['SUBMIT']} | MODEL_PATH={MODEL_PATH}",
)
print("\n".join(summary_lines))

üñ•Ô∏è Runpod/local long run ‚Äî setting FULL 17400s timeout for ARC task runner
‚è∞ Global timeout set to 17400s (4.8 hours)
Mode summary ‚Üí IS_KAGGLE=False | IS_RERUN=False | TEST_INFERENCE=False |
SCORE=True | SUBMIT=true | MODEL_PATH=Trelis/Qwen3-4B_ds-arc-agi-1-partial-100-c1542


In [None]:
import sys
import torch
import numpy as np

# Report interpreter / numeric-stack versions so runs can be compared later.
cuda_ready = torch.cuda.is_available()
version_report = (
    ("Python version", sys.version),
    ("PyTorch version", torch.__version__),
    ("CUDA version (PyTorch)", torch.version.cuda),
    ("CUDA available", cuda_ready),
    ("NumPy version", np.__version__),
)
for label, value in version_report:
    print(f"{label}: {value}")

# GPU details are only queryable when CUDA is usable.
if cuda_ready:
    print(f"GPU count: {torch.cuda.device_count()}")
    print(f"GPU name: {torch.cuda.get_device_name(0)}")

In [None]:
# Report the serving stack. SGLang must import; FlashInfer is treated as
# optional and its absence is only reported.
import sglang

print(f"SGLang version: {sglang.__version__}")

try:
    import flashinfer
except ImportError:
    print("FlashInfer not installed")
else:
    print(f"FlashInfer version: {flashinfer.__version__}")

In [None]:
if IS_KAGGLE:
    # Make Triton's NVIDIA tools (ptxas & friends) usable from writable
    # locations: copy them under /kaggle/working and point Triton there.
    import os
    import shutil
    import subprocess
    import sys

    # Writable homes for the copied binaries and for Triton's cache.
    for writable_dir in ("/kaggle/working/bin", "/kaggle/working/.triton"):
        os.makedirs(writable_dir, exist_ok=True)

    # Copy each tool that the utility package actually ships; missing ones
    # are skipped silently.
    for binary in ("ptxas", "cuobjdump", "nvdisasm"):
        src = f"/kaggle/usr/lib/sglang_utility/triton/backends/nvidia/bin/{binary}"
        dst = f"/kaggle/working/bin/{binary}"
        if not os.path.exists(src):
            continue
        shutil.copy(src, dst)
        os.chmod(dst, 0o755)  # make sure the copy is executable

    # Build the adjusted environment, then apply it to this process so that
    # anything launched later inherits it.
    env = dict(os.environ)
    env.update(
        {
            "TRITON_PTXAS_PATH": "/kaggle/working/bin/ptxas",
            "PATH": f"/kaggle/working/bin:{env.get('PATH', '')}",
            "TRITON_CACHE_DIR": "/kaggle/working/.triton",
            "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
        }
    )
    os.environ.update(env)

In [None]:
if START_SERVER:
  # Background server launcher for Kaggle with SGLang
  import os, sys, time, subprocess, json, socket, requests

  # ---------- 1) Check for existing server and cleanup ----------
  PORT = 8080
  HEALTH_URL = f"http://127.0.0.1:{PORT}/v1/models"

  # Check if server already running
  try:
      r = requests.get(HEALTH_URL, timeout=3)
      if r.status_code == 200:
          print(f"Server already running on port {PORT}. Stopping it first...")
          # Kill existing sglang processes
          subprocess.run(["pkill", "-f", "sglang.launch_server"], capture_output=True)
          time.sleep(3)  # Wait for cleanup
  except:
      pass  # No server running

  # Clear CUDA memory before starting
  try:
      import torch
      if torch.cuda.is_available():
          torch.cuda.empty_cache()
          torch.cuda.synchronize()
          print("CUDA memory cleared.")
      num_gpus = torch.cuda.device_count()
  except Exception:
      num_gpus = 0

  if IS_KAGGLE:
      # Find the first directory inside ARC_DATA_ROOT for model path
      model_base_path = ARC_DATA_ROOT / "arc-1-fake-ttt-blended-c802-dataset"
      subdirs = [model_base_path / d for d in os.listdir(model_base_path) if (model_base_path / d).is_dir()]
      if not subdirs:
          raise RuntimeError(f"No model directory found in {model_base_path}")
      MODEL_PATH = str(subdirs[0])   # or pick max(subdirs) if multiple exist

  LOG = f"{SUBMIT_DIR}/sglang_server.log"
  print(f"LOG file path: {LOG}")

  SERVER_CMD = [
      sys.executable, "-m", "sglang.launch_server",
      "--host", "0.0.0.0",
      "--port", str(PORT),
      "--model-path", MODEL_PATH,
      "--dp", str(max(1, min(num_gpus, 4))),
      "--kv-cache-dtype", "fp8_e4m3"
  ]

  # ---------- 2) Launch in background ----------
  log_f = open(LOG, "w")
  env = os.environ.copy()
  proc = subprocess.Popen(SERVER_CMD, stdout=log_f, stderr=subprocess.STDOUT, env=env, cwd=SUBMIT_DIR)
  print(f"Started sglang server PID={proc.pid} | logging to {LOG}")
  print("Command:", " ".join(SERVER_CMD))

  # ---------- 3) Wait for readiness ----------
  def wait_ready(url, timeout_s=180):
      t0 = time.time()
      while time.time() - t0 < timeout_s:
          try:
              r = requests.get(url, timeout=3)
              if r.status_code == 200:
                  return True
          except Exception:
              pass
          time.sleep(2)
      return False

  ready = wait_ready(HEALTH_URL)
  log_f.flush()

  if ready:
      print(f"sglang is READY on port {PORT}.")
      print(f"- Tail logs: !tail -n 50 {LOG}")
      print(f"- List models: !curl -s http://127.0.0.1:{PORT}/v1/models | jq .")
  else:
      print(f"sglang not ready after timeout. Showing last 60 log lines:")
      log_f.close()
      !tail -n 60 {LOG}

  # ---------- 4) Cleanup functions ----------
  def stop_server(p=proc):
      try:
          p.terminate()
          p.wait(timeout=10)
      except Exception:
          p.kill()
      print("Server stopped.")

  def full_cleanup(p=proc):
      # Stop server
      try:
          p.terminate()
          p.wait(timeout=10)
      except Exception:
          p.kill()

      # Also kill any lingering sglang processes
      subprocess.run(["pkill", "-f", "sglang.launch_server"], capture_output=True)

      # Clear CUDA memory
      try:
          import torch
          if torch.cuda.is_available():
              torch.cuda.empty_cache()
              torch.cuda.synchronize()
      except:
          pass

      print("Server stopped and CUDA memory cleared.")

  print("Call stop_server() or full_cleanup() to shut it down gracefully.")

In [None]:
if START_SERVER:
    import requests
    import time

    def check_models(url="http://127.0.0.1:8080/v1/models"):
        """Query the server's model list.

        Prints the ids it finds and returns the first one. Returns None when
        the request fails, the response is not usable, or the list is empty.
        """
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            result = response.json()

            print("‚úÖ Server is responding!")
            print("Available models:")
            for model in result['data']:
                print(f"  - {model['id']}")

            return result['data'][0]['id'] if result['data'] else None

        except requests.exceptions.ConnectionError:
            print("‚ùå Connection failed - server may not be ready yet")
            return None
        except Exception as e:
            print(f"‚ùå Error: {e}")
            return None

    # Poll every 30 seconds until we get a model
    model_name = None
    while not model_name:
        model_name = check_models()
        if model_name:
            break

        # If this notebook launched the server and that process has already
        # exited, no amount of waiting will help: fail instead of looping forever.
        launched = globals().get("proc")
        if launched is not None and launched.poll() is not None:
            raise RuntimeError(
                f"sglang server process exited with code {launched.returncode} "
                "before serving any model; check the server log."
            )

        print("‚è≥ Waiting 30 seconds before retrying...")
        time.sleep(30)

    print(f"\n‚úÖ Found model: {model_name}")

In [None]:
if TEST_INFERENCE:
    import time
    import requests
    
    url = "http://127.0.0.1:8080/v1/chat/completions"
    
    headers = {
        "Content-Type": "application/json"
    }
    
    messages = [
        {"role" : "system", "content" : "You are an expert at solving abstract reasoning puzzles. Write clean, efficient Python code."},
        {"role" : "user", "content" : "You are solving an ARC (Abstraction and Reasoning Corpus) task. \nI will show you training examples with input and output grids, plus a test input grid. Your task is to:\n\n1. **Analyze the training examples** to discover patterns that map input grids to output grids\n2. **Write a Python program** that implements your best understanding of the transformation  \n3. **DO NOT predict or generate the test output** - your job is only to write the transformation program\n4. **Attempt a solution** - even if the pattern isn't completely clear, provide your best hypothesis\n5. **Do not repeat the same transformation** - if you have already tried a transformation, do not repeat it.\n\n**IMPORTANT: Your transformation must always produce a 10\u00d710 output grid.**\n\nThe test input is shown for context so you understand what type of grid your program will eventually process. Focus on learning patterns from training examples and writing code that captures your understanding.\n\nTraining Examples:\n\nExample 1:\nInput:\n5 0 0 5 0 0 0 5 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\nOutput:\n5 0 0 5 0 0 0 5 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n2 0 0 2 0 0 0 2 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n2 0 0 2 0 0 0 2 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n\nExample 2:\nInput:\n0 5 0 5 5 0 0 5 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\nOutput:\n0 5 0 5 5 0 0 5 0 0\n0 0 0 0 0 0 0 0 0 0\n0 2 0 2 2 0 0 2 0 5\n0 0 0 0 0 0 0 0 0 0\n0 2 0 2 2 0 0 2 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 2 0 2 2 0 0 2 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n\nExample 3:\nInput:\n0 0 5 5 0 5 0 5 5 0\n0 0 0 0 0 0 0 
0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\nOutput:\n0 0 5 5 0 5 0 5 5 0\n0 0 0 0 0 0 0 0 0 0\n0 0 2 2 0 2 0 2 2 5\n0 0 2 2 0 2 0 2 2 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 2 2 0 2 0 2 2 5\n0 0 0 0 0 0 0 0 0 0\n0 0 2 2 0 2 0 2 2 5\n0 0 0 0 0 0 0 0 0 0\n\nTest Input:\n5 0 5 5 0 0 5 0 5 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n\nAnalyze the patterns in the training examples and write a Python function that performs this transformation.\n\n**Approach Guidelines:**\n- Look for patterns in shapes, colors, positions, sizes, rotations, reflections, etc.\n- Even if you can't solve all training examples perfectly, implement what patterns you do observe\n- A partial solution that captures some aspects is better than returning the input unchanged\n- If the pattern is unclear, make your best educated guess based on what you can see\n\nRequirements:\n- The function takes a 2D list (grid) where grid[row][col] gives the value at that position\n- Values are integers from 0-9\n- Return a new grid (2D list) with the transformation applied\n- You can use numpy if needed - just add 'import numpy as np' at the start of your function\n- Aim to handle the training examples as well as possible, even if not perfectly\n- Your function should attempt some meaningful transformation based on the patterns you observe\n\nYou MUST end your response with the following exact format:\n\nFinal answer:\n```python\ndef transform(grid):\n    # Your transformation logic here (implement your best understanding)\n    return transformed_grid\n```\n"}
    ]
    
    # Request body; `model_name` comes from the polling cell. The completion
    # is capped at 10 tokens (raise to e.g. 1000 for a fuller answer).
    payload = dict(
        model=model_name,
        messages=messages,
        max_tokens=10,
    )

    # Time only the HTTP round trip.
    start_time = time.time()
    response = requests.post(url, headers=headers, json=payload, timeout=600)
    end_time = time.time()
    elapsed_time = end_time - start_time

    response.raise_for_status()
    result = response.json()
    first_choice = result["choices"][0]
    output_text = first_choice["message"]["content"]

    # Rough throughput figure: token count is estimated at 4 characters/token.
    estimated_tokens = len(output_text) / 4
    tokens_per_second = estimated_tokens / elapsed_time

    print(
        "‚úÖ Response received:",
        output_text,
        f"\n‚è± Elapsed time: {elapsed_time:.2f} seconds",
        f"üî¢ Estimated tokens: {estimated_tokens:.1f}",
        f"‚ö° Output tokens/sec: {tokens_per_second:.2f}",
        sep="\n",
    )

In [None]:
if not IS_KAGGLE:
    %cd /workspace/arc-agi-2025

# Derive attempts/workers for the two modes
MAX_ATTEMPTS = 1 if (IS_RERUN or not IS_KAGGLE) else 8
MAX_WORKERS  = 16

# SUBSET = "test" # defaulting to test to ensure there are no loading issues.

# can use this instead if testing evaluation during a pre-run
SUBSET = "test" if IS_RERUN else "evaluation"

# Common env for your runner
os.environ["OPENAI_API_KEY"] = "EMPTY"

print(f"Mode: {'competition' if IS_RERUN else 'dev'} | attempts={MAX_ATTEMPTS} | workers={MAX_WORKERS} | subset={SUBSET}")

# Build the command
cmd_args = [
    "uv", "run", "python", "-u", "-m", "llm_python.run_arc_tasks_soar",
    "--dataset", DATASET,
    "--subset", SUBSET,
    "--max_workers", str(MAX_WORKERS),
    "--max_attempts", str(MAX_ATTEMPTS),
    "--model", model_name,
    "--base-url", "http://127.0.0.1:8080/v1",
    "--unsafe-executor",
    "--max-tokens", "2000",
    "--qwen-no-think"
]

print(f"Running command: {' '.join(cmd_args)}")

# Handle output redirection properly
if IS_RERUN or not IS_KAGGLE:
    # For quiet mode, redirect to file using subprocess
    import subprocess
    log_file_path = f"{SUBMIT_DIR}/run.log"
    print(f"üìù Logging output to: {log_file_path}")
    
    with open(log_file_path, "w") as log_file:
        process = subprocess.Popen(
            cmd_args,
            stdout=log_file,
            stderr=subprocess.STDOUT,
            text=True,
            cwd=os.getcwd()
        )
        
        # Wait for completion
        print("‚è≥ Running tasks (output being written to log file)...")
        return_code = process.wait()
        
    if return_code == 0:
        print(f"‚úÖ Task runner completed successfully. Check {log_file_path} for details.")
    else:
        print(f"‚ùå Task runner failed with return code {return_code}")
        print(f"üìù Check {log_file_path} for error details")
        # Show last few lines of log
        !tail -n 20 {log_file_path}
else:
    # For interactive mode, show output directly
    cmd = " ".join(cmd_args)
    print(f"Running: {cmd}\n")
    !{cmd}

In [None]:
# Fine-tuning Integration (Optional)
# Set ENABLE_FINE_TUNING=false to disable fine-tuning on non-transductive programs before running tasks

import time
import sys
import requests
import subprocess
from pathlib import Path

ENABLE_FINE_TUNING = os.getenv("ENABLE_FINE_TUNING", "true").lower() == "true"

# Wall-clock budget for the fine-tuning subprocess, in seconds (about 1 h 47 min).
FINE_TUNING_TIMEOUT_S = 6400

if ENABLE_FINE_TUNING:
    import datetime
    import threading

    print("üî¨ Fine-tuning enabled - will fine-tune model on non-transductive programs")

    # Environment handed to the fine-tuning script. Every value must be a
    # string: os.environ rejects Path objects, and MODEL_PATH can be one.
    fine_tuning_env = {
        'MODEL_SLUG': str(MODEL_PATH),             # Use the current model path as base
        'FINE_TUNING_MODE': 'final_only',          # Only save final merged model
        'DATA_SOURCE': 'database',                 # Load from database
        'ARC_PROGRAMS_DB': str(ARC_PROGRAMS_DB),   # Database path
        'MODEL_SAVE_DIR': str(MODEL_SAVE_DIR),     # Where to save fine-tuned model
    }
    # Remember prior values so cleanup restores them instead of deleting
    # variables that were already exported for later cells.
    previous_env = {key: os.environ.get(key) for key in fine_tuning_env}

    print("üõ†Ô∏è Fine-tuning configuration:")
    for key, value in fine_tuning_env.items():
        print(f"   {key}: {value}")
        os.environ[key] = value

    # Stop the current server to free up GPU memory
    if 'proc' in globals():
        print("üõë Stopping inference server to free GPU memory for fine-tuning...")
        try:
            proc.terminate()
            proc.wait(timeout=10)
        except Exception:
            proc.kill()

        # Clear CUDA memory (best effort)
        try:
            import torch
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                torch.cuda.synchronize()
                print("‚úÖ CUDA memory cleared")
        except Exception:
            pass

    # Ensure we're in the right directory
    original_cwd = os.getcwd()
    if not IS_KAGGLE:
        os.chdir("/workspace/arc-agi-2025")

    # Set up logging
    log_dir = Path(os.environ.get("SUBMIT_DIR", "logs"))
    log_dir.mkdir(parents=True, exist_ok=True)
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file = log_dir / f"fine_tuning_{timestamp}.log"

    def log_and_print(message, log_file_handle=None):
        """Print `message` and, when a handle is given, append it to the log."""
        print(message)
        if log_file_handle:
            log_file_handle.write(message + "\n")
            log_file_handle.flush()

    try:
        with open(log_file, 'w') as f:
            log_and_print(f"üìù Logging to: {log_file}", f)

            if not IS_KAGGLE:
                # Step 1: Convert notebook to script
                log_and_print("üîÑ Converting notebook to script...", f)
                convert_cmd = [
                    "uv", "run", "python",
                    "llm_python/fine-tuning/notebook_to_script.py",
                    "llm_python/fine-tuning/unsloth_arc_finetuning_soar.ipynb"
                ]

                convert_result = subprocess.run(convert_cmd,
                                                capture_output=True,
                                                text=True,
                                                timeout=60)

                # Log full output
                f.write("=== CONVERSION OUTPUT ===\n")
                f.write(f"Return code: {convert_result.returncode}\n")
                f.write(f"STDOUT:\n{convert_result.stdout}\n")
                f.write(f"STDERR:\n{convert_result.stderr}\n")
                f.write("========================\n\n")
                f.flush()

                if convert_result.returncode != 0:
                    log_and_print(f"‚ùå Notebook conversion failed: {convert_result.stderr}", f)
                    raise Exception("Notebook conversion failed")

                log_and_print("‚úÖ Notebook converted successfully", f)

            # Step 2: Run the actual fine-tuning
            log_and_print("üöÄ Starting fine-tuning...", f)
            fine_tuning_cmd = [
                "uv", "run", "python",
                "llm_python/fine-tuning/unsloth_arc_finetuning_soar.py",
                "--config", "llm_python/fine-tuning/config.yaml"
            ]

            log_and_print(f"Running command: {' '.join(fine_tuning_cmd)}", f)

            # Run with real-time output
            process = subprocess.Popen(
                fine_tuning_cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
                bufsize=1
            )

            # The read loop below blocks until the child closes its output,
            # so a timeout on wait() alone would never fire. A watchdog timer
            # enforces the budget and stops the child when it expires.
            timed_out = threading.Event()

            def _abort_fine_tuning():
                timed_out.set()
                process.terminate()          # polite request first
                try:
                    process.wait(timeout=15)
                except subprocess.TimeoutExpired:
                    process.kill()           # escalate if it does not exit

            watchdog = threading.Timer(FINE_TUNING_TIMEOUT_S, _abort_fine_tuning)
            watchdog.daemon = True
            watchdog.start()

            f.write("=== FINE-TUNING OUTPUT ===\n")
            f.flush()

            try:
                # Stream output to both console and file
                for line in process.stdout:
                    print(line, end='')  # Show in console
                    f.write(line)        # Save to file
                    f.flush()

                # Still covered by the watchdog until it is cancelled below.
                return_code = process.wait()
            finally:
                watchdog.cancel()

            if timed_out.is_set() and return_code != 0:
                f.write(f"\n=== PROCESS TIMED OUT AFTER {FINE_TUNING_TIMEOUT_S}s ===\n")
                f.flush()
                raise subprocess.TimeoutExpired(fine_tuning_cmd, FINE_TUNING_TIMEOUT_S)

            f.write(f"\n=== PROCESS COMPLETED WITH CODE: {return_code} ===\n")
            f.flush()

            if return_code == 0:
                log_and_print("‚úÖ Fine-tuning completed successfully!", f)

                # Pick the most recently modified "*-final" directory so a
                # leftover from an earlier run is not selected by accident.
                fine_tuned_models = sorted(
                    Path(MODEL_SAVE_DIR).glob("*-final"),
                    key=lambda candidate: candidate.stat().st_mtime,
                )
                if fine_tuned_models:
                    new_model_path = fine_tuned_models[-1]
                    log_and_print(f"üéØ Fine-tuned model saved at: {new_model_path}", f)

                    # Update MODEL_PATH to use the fine-tuned model
                    MODEL_PATH = str(new_model_path)
                    log_and_print(f"üîÑ Updated MODEL_PATH to use fine-tuned model: {MODEL_PATH}", f)
                else:
                    log_and_print("‚ö†Ô∏è  Fine-tuned model not found, continuing with original model", f)
            else:
                log_and_print(f"‚ùå Fine-tuning failed with return code {return_code}", f)
                log_and_print("üîÑ Continuing with original model...", f)

            log_and_print(f"üìÑ Full logs saved to: {log_file}", f)

    except subprocess.TimeoutExpired as exc:
        # Raised by the 60 s conversion step or by the watchdog above; report
        # the limit that was actually hit.
        print(f"‚è±Ô∏è Fine-tuning timed out after {exc.timeout:.0f}s, continuing with original model")
    except Exception as e:
        print(f"‚ùå Fine-tuning error: {e}")
        print("üîÑ Continuing with original model...")
    finally:
        # Restore original directory
        os.chdir(original_cwd)

    # Put the environment back the way it was before this cell ran.
    for key, old_value in previous_env.items():
        if old_value is None:
            os.environ.pop(key, None)
        else:
            os.environ[key] = old_value

else:
    print("üî¨ Fine-tuning disabled (set ENABLE_FINE_TUNING=true to enable)")
    print(f"   Will use pre-loaded model: {MODEL_PATH}")

In [15]:
if ENABLE_FINE_TUNING:
    # The fine-tuning cell stopped the inference server; bring it back up
    # with whatever MODEL_PATH now points at.
    if START_SERVER:
        print("üîÑ Restarting inference server with updated model...")

        # Clear any existing processes
        subprocess.run(["pkill", "-f", "sglang.launch_server"], capture_output=True)
        time.sleep(3)

        # Clear CUDA memory before starting and count GPUs. A count of 0
        # (no CUDA, or torch failing) still yields --dp 1 below.
        try:
            import torch
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                torch.cuda.synchronize()
                print("CUDA memory cleared.")
            num_gpus = torch.cuda.device_count()
        except Exception:
            num_gpus = 0

        PORT = 8080
        HEALTH_URL = f"http://127.0.0.1:{PORT}/v1/models"

        # Restart server with new model
        LOG = f"{SUBMIT_DIR}/sglang_server.log"
        SERVER_CMD = [
            sys.executable, "-m", "sglang.launch_server",
            "--host", "0.0.0.0",
            "--port", str(PORT),
            "--model-path", str(MODEL_PATH),  # str() keeps the argv valid if MODEL_PATH is a Path
            "--dp", str(max(1, min(num_gpus, 4))),
            "--kv-cache-dtype", "fp8_e4m3"
        ]

        print(f"Restarting with command: {' '.join(SERVER_CMD)}")

        # Append to the existing log. The child keeps its own descriptor, so
        # the parent's handle is closed right after the spawn.
        with open(LOG, "a") as log_f:
            proc = subprocess.Popen(SERVER_CMD, stdout=log_f, stderr=subprocess.STDOUT,
                                    env=os.environ.copy(), cwd=SUBMIT_DIR)

        print(f"üîÑ Restarted server with PID={proc.pid}")

        def wait_ready(url, timeout_s=180, p=None):
            """Poll `url` until it answers HTTP 200.

            Returns True on success and False on timeout. When `p` (the
            server's Popen) is given, returns False as soon as it has exited.
            """
            deadline = time.time() + timeout_s
            while time.time() < deadline:
                if p is not None and p.poll() is not None:
                    return False  # server died; no point in waiting
                try:
                    if requests.get(url, timeout=3).status_code == 200:
                        return True
                except Exception:
                    pass  # not accepting connections yet
                time.sleep(2)
            return False

        if wait_ready(HEALTH_URL, p=proc):
            print("‚úÖ Server restarted successfully with fine-tuned model!")

            # Refresh model_name: the served id can change with the model path.
            try:
                response = requests.get(HEALTH_URL, timeout=10)
                if response.status_code == 200:
                    models = response.json()['data']
                    if models:
                        model_name = models[0]['id']
                        print(f"üéØ Updated model name: {model_name}")
            except Exception as e:
                print(f"‚ö†Ô∏è  Could not update model name: {e}")
        else:
            print("‚ùå Failed to restart server")
            if proc.poll() is not None:
                print(f"Server process exited early with code {proc.returncode}.")
            # Show the end of the server log so the cause is visible here.
            try:
                with open(LOG, errors="replace") as log_reader:
                    print("".join(log_reader.readlines()[-60:]))
            except OSError as e:
                print(f"Could not read {LOG}: {e}")

üîÑ Restarting inference server with updated model...
CUDA memory cleared.
Restarting with command: /workspace/arc-agi-2025/.venv/bin/python3 -m sglang.launch_server --host 0.0.0.0 --port 8080 --model-path /workspace/arc-agi-2025/llm_python/fine-tuning/Qwen3-4B_ds-arc-agi-1-partial-100-c1542_ds-database-programs-final --dp 1 --kv-cache-dtype fp8_e4m3
üîÑ Restarted server with PID=365662
‚ùå Failed to restart server


In [12]:
if not IS_KAGGLE:
    %cd /workspace/arc-agi-2025

# Derive attempts/workers for the two modes
MAX_ATTEMPTS = 8 if (IS_RERUN or not IS_KAGGLE) else 8
MAX_WORKERS  = 16

# SUBSET = "test" # defaulting to test to ensure there are no loading issues.

# can use this instead if testing evaluation during a pre-run
SUBSET = "test" if IS_RERUN else "evaluation"

# Common env for your runner
os.environ["OPENAI_API_KEY"] = "EMPTY"

print(f"Mode: {'competition' if IS_RERUN else 'dev'} | attempts={MAX_ATTEMPTS} | workers={MAX_WORKERS} | subset={SUBSET}")

# Build the command
cmd_args = [
    "uv", "run", "python", "-u", "-m", "llm_python.run_arc_tasks_soar",
    "--dataset", DATASET,
    "--subset", SUBSET,
    "--max_workers", str(MAX_WORKERS),
    "--max_attempts", str(MAX_ATTEMPTS),
    "--model", model_name,
    "--base-url", "http://127.0.0.1:8080/v1",
    "--unsafe-executor",
    "--max-tokens", "2000",
    "--qwen-no-think"
]

print(f"Running command: {' '.join(cmd_args)}")

# Handle output redirection properly
if IS_RERUN or not IS_KAGGLE:
    # For quiet mode, redirect to file using subprocess
    import subprocess
    log_file_path = f"{SUBMIT_DIR}/run.log"
    print(f"üìù Logging output to: {log_file_path}")
    
    with open(log_file_path, "w") as log_file:
        process = subprocess.Popen(
            cmd_args,
            stdout=log_file,
            stderr=subprocess.STDOUT,
            text=True,
            cwd=os.getcwd()
        )
        
        # Wait for completion
        print("‚è≥ Running tasks (output being written to log file)...")
        return_code = process.wait()
        
    if return_code == 0:
        print(f"‚úÖ Task runner completed successfully. Check {log_file_path} for details.")
    else:
        print(f"‚ùå Task runner failed with return code {return_code}")
        print(f"üìù Check {log_file_path} for error details")
        # Show last few lines of log
        !tail -n 20 {log_file_path}
else:
    # For interactive mode, show output directly
    cmd = " ".join(cmd_args)
    print(f"Running: {cmd}\n")
    !{cmd}

/workspace/arc-agi-2025
Mode: dev | attempts=1 | workers=16 | subset=evaluation
Running command: uv run python -u -m llm_python.run_arc_tasks_soar --dataset arc-prize-2024 --subset evaluation --max_workers 16 --max_attempts 1 --model Trelis/Qwen3-4B_ds-arc-agi-1-partial-100-c1542 --base-url http://127.0.0.1:8080/v1 --unsafe-executor --max-tokens 2000 --qwen-no-think
üìù Logging output to: /workspace/arc-agi-2025/llm_python/submissions/run.log
‚è≥ Running tasks (output being written to log file)...
‚úÖ Task runner completed successfully. Check /workspace/arc-agi-2025/llm_python/submissions/run.log for details.


In [13]:
# Only score in dev/commit runs
if SCORE and not IS_RERUN:
    !uv run python -m llm_python.score_submission --submission "/kaggle/working/submission.json"
else:
    print("Skipping local scoring (competition rerun or SCORE=False).")

‚ùå Submission file not found: /kaggle/working/submission.json


In [None]:
# Final cleanup: shut the server down and release resources, but only when
# this notebook was configured to start one and the helper has been defined.
cleanup_available = START_SERVER and 'full_cleanup' in globals()

if not cleanup_available:
    print("üîç No server cleanup needed (START_SERVER=False or cleanup function not available)")
else:
    print("üßπ Cleaning up server and resources...")
    full_cleanup()