# Qwen3 Tuning
for ARC AGI 2

Updates 29 July 2025:
- Now using utils from the repo.
- Reduced LoRA r to 128 (down from 256), as 128 is probably sufficient.

Updates 24 July 2025:
- Now supports calculation of metrics, BUT requires that the validation data include code. This will be updated later to pass only grids for validation.

Updates 23 July 2025:
- Back to a constant scheduler for one epoch. Note that SOAR uses 3 epochs with a cosine scheduler.

Updates 22 July 2025:
- Increased lora r to 256
- Added training on completions only
- Moved to a cosine rather than a constant scheduler. Moved from 1 to 2 epochs. Note that SOAR uses 3 epochs with a cosine scheduler.

In [None]:
# !git pull

In [None]:
## Now automatically set via Runpod Secrets
# !git config --global user.name "RonanMcGovern"
# !git config --global user.email "78278410+RonanKMcGovern@users.noreply.github.com"

In [None]:
import os
# Point the Hugging Face home directory and the hub (repo) cache at /workspace.
# NOTE(review): presumably this has to run before any Hugging Face library is
# imported for the settings to take effect — confirm.
os.environ["HF_HOME"] = "/workspace"
os.environ["HF_HUB_CACHE"] = "/workspace/hub" # (recommended) override just the repo cache
# Echo the value as a quick check that the variable is set in this kernel.
print(os.environ["HF_HOME"])

### Installation

In [None]:
#To run with vllm.
!uv pip install unsloth vllm --system -qU

In [None]:
# # Temporary Fix while unsloth is broken! - SHOULD BE FIXED AS OF JULY 20TH 2025
# !rm -rf /tmp/unsloth_compiled_cache
# !uv pip uninstall trl unsloth --system -q
# !uv pip install unsloth -qU --system
# !uv pip install trl==0.19.1 --system -q

# # sometimes unsloth throws numpy issues
# !uv pip uninstall numpy --system -q
# !uv pip install numpy==2.2 --system -q
# !uv pip show numpy -q

In [None]:
# INSTALLED IN THE CONTAINER IF USING the [arc-agi-2025 container on runpod](https://console.runpod.io/deploy?template=bh0rvngapk&ref=jmfkcdio)
# %%capture
# import os
# !pip install uv -qU
# !uv pip install unsloth matplotlib tensorboard -qU --system
# !export HF_HUB_ENABLE_HF_TRANSFER=1

In [None]:
# # # if you face model download issues
# import os
# os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"

In [None]:
from huggingface_hub import login

# `HfFolder` is deprecated in huggingface_hub and missing from newer releases;
# prefer the top-level `get_token` helper and only fall back to `HfFolder`
# on older installs that do not provide it.
try:
    from huggingface_hub import get_token
except ImportError:
    from huggingface_hub import HfFolder
    get_token = HfFolder.get_token

# Call this at the top of your script / notebook
if get_token() is None:   # no token cached or in $HF_TOKEN
    login()               # interactive prompt

In [None]:
from IPython import get_ipython

# Shell command pieces: serve ./logs on port 6006 on all interfaces, rescan
# every 5 seconds, send stdout/stderr to tb.out, and background the job ("&")
# so the cell returns immediately.
tensorboard_cmd = "".join([
    "tensorboard ",
    "--logdir ./logs ",
    "--port 6006 ",
    "--bind_all ",
    "--reload_interval 5 ",
    "> tb.out 2>&1 &",
])

ip = get_ipython()
ip.system_raw(tensorboard_cmd)

print("TensorBoard is now running in the background on port 6006.")

### Unsloth

In [None]:
# !uv pip show trl unsloth vllm transformers

In [None]:
!nvidia-smi

In [None]:
# NOTE(review): unsloth is imported before torch here — presumably deliberate
# so that unsloth can apply its patches first; confirm before reordering.
import os
import unsloth
from unsloth import FastLanguageModel
import torch

# Base model to fine-tune; the commented-out slugs are alternatives.
model_slug = "Qwen/Qwen3-4B"
# model_slug = "julien31/Soar-qwen-7b"
# model_slug = "Qwen/Qwen2.5-Coder-7B-Instruct"
# model_slug = "Qwen/Qwen3-30B-A3B"

model_max_length = 32768 #default is ~2k for unsloth!!!
# LoRA rank; reused below as max_lora_rank when loading the model.
lora_rank = 128

# Training AND validation batch size (incl. for autoregressive train/test example metrics calculations)
batch_size_global = 2 # use 2 for 7/8B, use 4 for 4B on H200.

# Load the base model and its tokenizer without 4-bit/8-bit quantization and
# without full fine-tuning.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_slug,
    max_seq_length = model_max_length,   # Context length - can be longer, but uses more memory
    load_in_4bit = False,     # 4bit uses much less memory
    load_in_8bit = False,    # A bit more accurate, uses 2x memory
    full_finetuning = False, # We have full finetuning now!
    # cache_dir = '/workspace',
    # token = "hf_...",      # use one if using gated models

    # for using fast_inference
    fast_inference = True, # allows for vLLM generation during evaluation
    max_lora_rank=lora_rank,
    # NOTE(review): presumably the fraction of GPU memory given to the vLLM engine — confirm.
    gpu_memory_utilization=0.3,
)

In [None]:
!nvidia-smi

In [None]:
# Show the max_seq_length attribute recorded on the loaded model.
print(model.max_seq_length)

In [None]:
# # Print a summary of the transformer layers and key dimensions
# for i, block in enumerate(model.model.layers):
#     attn = block.self_attn
#     mlp = block.mlp

#     print(f"Layer {i}:")
#     print(f"  Attention:")
#     print(f"    q_proj: {attn.q_proj.weight.shape}")
#     print(f"    k_proj: {attn.k_proj.weight.shape}")
#     print(f"    v_proj: {attn.v_proj.weight.shape}")
#     print(f"    out_proj: {attn.o_proj.weight.shape}")
#     print(f"  MLP:")
#     print(f"    fc1: {mlp.gate_proj.weight.shape}")
#     print(f"    fc2: {mlp.up_proj.weight.shape}")
#     print(f"    fc3: {mlp.down_proj.weight.shape}")
#     print()

In [None]:
# print(model)

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [None]:
# Wrap the base model with LoRA adapters on the attention projections
# (q/k/v/o) and the MLP projections (gate/up/down).
model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank,           # Same lora_rank that was passed as max_lora_rank at load time. Any number > 0 works; suggested 8, 16, 32, 64, 128.
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",
                     ],
    lora_alpha = 64,  # Best to choose alpha = rank or rank*2. EXCEPT if using rslora, in which case set it as sqrt(max matrix dimension). 64 is good for Qwen 4B
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = True,   # Rank-stabilized LoRA is on, so the rslora rule for lora_alpha above applies
    loftq_config = None,  # LoftQ is not used
)

In [None]:
# Show which side the tokenizer pads on.
print(tokenizer.padding_side)

In [None]:
# Import utils using standard project root detection
from pathlib import Path
import sys

# Walk from the current working directory up through its ancestors and take
# the first directory that contains a pyproject.toml; if none does, fall back
# to the current working directory itself.
_cwd = Path.cwd()
for _candidate in [_cwd] + list(_cwd.parents):
    if (_candidate / "pyproject.toml").exists():
        project_root = _candidate
        break
else:
    project_root = _cwd

# The utils package lives under llm_python, so put that directory at the
# front of the import search path.
llm_python_dir = project_root / "llm_python"
sys.path.insert(0, str(llm_python_dir))

print(f"📁 Project root: {project_root}")
print(f"📁 Looking for utils in: {llm_python_dir}")

from utils.task_loader import TaskLoader
from utils.scoring import GridScorer, ProgramExecutor
from utils.prompt_utils import create_arc_prompt, extract_python_code
from utils.metrics_utils import calculate_task_metrics, format_metrics_display, metrics_to_percentages
from utils.timeout_utils import execute_with_timeout
from utils.transduction import is_transduction_cheating
from utils.prompt_loader import PromptLoader

# Initialize utility instances
# prompt_loader is used later to fetch the "soar" prompts and is passed to
# create_arc_prompt; scorer is used by the data integrity tests to score grids.
prompt_loader = PromptLoader()
scorer = GridScorer()
print("✅ Utils imported and initialized successfully")

<a name="Data"></a>
### Data Prep

In [None]:
import re

def clean_multiple_newlines(code: str) -> str:
    """Collapse every run of blank lines in ``code`` into a single blank line.

    A newline followed by one or more lines that are empty or contain only
    whitespace is rewritten as exactly two newline characters. Indentation of
    the first non-blank line after the run is left untouched.
    """
    blank_line_run = re.compile(r'\n(\s*\n)+')
    return blank_line_run.sub('\n\n', code)

def count_tokens(text: str, tokenizer) -> int:
    """Return how many items ``tokenizer.encode`` produces for ``text``."""
    token_ids = tokenizer.encode(text)
    return len(token_ids)

def should_filter_code(code: str, tokenizer, max_tokens: int = 1000) -> bool:
    """Return True when ``code`` tokenizes to strictly more than ``max_tokens`` tokens."""
    token_total = count_tokens(code, tokenizer)
    exceeds_budget = token_total > max_tokens
    return exceeds_budget

# Confirmation that the cleaning/filtering helpers above are now defined.
print("✅ Added code cleaning and filtering functions")


In [None]:
  from pathlib import Path
  import json
  from typing import Optional
  from datasets import load_dataset, DatasetDict

  # ---------------------------------------------------------------------
  # Config (examples)
  # ---------------------------------------------------------------------

  # Row caps applied by build_dataset after each split has been built.
  # A falsy value (None or 0) keeps every row of that split.
  max_rows = None # None for all rows
  max_validation_rows = 32

  # Train and validation are loaded from two different Hugging Face dataset slugs
  train_slug = "Trelis/arc-programs-correct-50"
  val_slug   = "Trelis/grids_only_arc-agi-1_shortest_evaluation_30_20250807_000221"

  # Forwarded to tokenizer.apply_chat_template when the generation prompts are built
  enable_thinking = False

  # ---------------------------------------------------------------------
  # Prompt management using utils (replacing hard-coded prompts)
  # ---------------------------------------------------------------------

  # Use prompt_loader to get SOAR prompts from utils
  SYSTEM_PROMPT = prompt_loader.get_system_message("soar")
  INITIAL_TURN_PROMPT = prompt_loader.get_initial_turn_prompt("soar")

  # Report prompt sizes in characters (not tokens) as a quick sanity check.
  print(f"✅ Using SOAR prompts from utils:")
  print(f"   System prompt: {len(SYSTEM_PROMPT)} chars")
  print(f"   Initial turn prompt: {len(INITIAL_TURN_PROMPT)} chars")

  def hf_train_dataset_to_chat_dataset(dataset_slug, split="train"):
      """Build the chat-formatted training split from a program dataset.

      Rows are first filtered, then mapped. A row is dropped when its task
      grids cannot be loaded through TaskLoader (FileNotFoundError) or when
      its newline-cleaned code exceeds 1000 tokens. Each surviving row gains
      the columns "messages", "text", "prompt", "train_input", "train_output",
      "test_input" and "test_output" (and keeps "task_id").

      The filtering is done with ``Dataset.filter`` rather than by returning
      ``None`` from the ``map`` callback: ``map`` does not drop rows when the
      callback returns ``None``, and a later ``filter(lambda x: x is not None)``
      always sees a dict, so that approach never removed anything.

      Args:
          dataset_slug: Hugging Face dataset identifier passed to load_dataset.
          split: Name of the split to load.

      Returns:
          The filtered and mapped dataset.
      """
      ds = load_dataset(dataset_slug, split=split)
      from utils.task_loader import TaskLoader
      task_loader = TaskLoader()

      # Statistics tracking for data cleaning.
      # NOTE(review): these counters are only updated when the filter callback
      # actually runs; if `datasets` serves the filter result from its cache
      # they will presumably stay at zero — confirm.
      stats = {
          'total_examples': 0,
          'removed_too_long': 0,
          'cleaned_newlines': 0
      }

      def is_usable_example(example):
          """Return True when the row's grids load and its cleaned code fits the token budget."""
          stats['total_examples'] += 1
          task_id = example["task_id"]

          # The grids come from TaskLoader, so a task that cannot be loaded is unusable.
          try:
              task_loader.load_task(task_id, dataset="arc-agi-1")
          except FileNotFoundError as e:
              print(f"Warning: Could not load grids for task {task_id}: {e}")
              return False

          # Clean the code before measuring it (ONLY for training data)
          original_code = example['code']
          cleaned_code = clean_multiple_newlines(original_code)

          # Track if cleaning was applied
          if cleaned_code != original_code:
              stats['cleaned_newlines'] += 1

          # Filter out examples with code that's too long (ONLY for training data)
          if should_filter_code(cleaned_code, tokenizer, max_tokens=1000):
              stats['removed_too_long'] += 1
              return False

          return True

      def create_train_chat_messages(example):
          """Build the chat fields for one row that already passed is_usable_example."""
          task_id = example["task_id"]

          # Training set - load grids from TaskLoader, use predicted outputs from dataset
          task_data = task_loader.load_task(task_id, dataset="arc-agi-1")
          cleaned_code = clean_multiple_newlines(example['code'])

          # Use predicted outputs from dataset for training; fall back to the
          # task's own outputs when the dataset has no such column.
          train_outputs = example.get("predicted_train_output", [ex["output"] for ex in task_data["train"]])
          test_outputs = example.get("predicted_test_output", [ex["output"] for ex in task_data["test"]])

          task_data_for_prompt = {
              'train': [{'input': inp, 'output': out}
                       for inp, out in zip([ex["input"] for ex in task_data["train"]], train_outputs)],
              'test': [{'input': inp, 'output': out}
                      for inp, out in zip([ex["input"] for ex in task_data["test"]], test_outputs)]
          }

          # Use create_arc_prompt from utils
          system_content, user_content = create_arc_prompt(task_data_for_prompt, prompt_loader, "soar")

          messages = [
              {"role": "system", "content": system_content},
              {"role": "user", "content": user_content},
              {"role": "assistant", "content": f"```python\n{cleaned_code}\n```"}
          ]

          # Apply chat template: "text" is the full conversation, "prompt" is
          # everything up to (and including) the generation prompt.
          text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
          prompt_messages = messages[:-1]  # Remove assistant message
          prompt_text = tokenizer.apply_chat_template(
              prompt_messages,
              tokenize=False,
              add_generation_prompt=True,
              enable_thinking=enable_thinking
          )

          return {
              "messages": messages,
              "text": text,
              "prompt": prompt_text,
              "train_input": [ex["input"] for ex in task_data_for_prompt["train"]],
              "train_output": train_outputs,
              "test_input": [ex["input"] for ex in task_data_for_prompt["test"]],
              "test_output": test_outputs,
              "task_id": task_id,
          }

      # Drop unusable rows first so the map callback always returns a dict.
      ds_filtered = ds.filter(is_usable_example, desc=f"filter train rows ({split})")
      ds_filtered = ds_filtered.map(create_train_chat_messages, desc=f"build train chat fields ({split})")

      # Print cleaning statistics
      print(f"\n📊 Training data cleaning statistics:")
      print(f"   Total examples processed: {stats['total_examples']}")
      print(f"   Examples with cleaned newlines: {stats['cleaned_newlines']}")
      print(f"   Examples removed (too long): {stats['removed_too_long']}")
      print(f"   Examples retained: {len(ds_filtered)}")

      return ds_filtered


  def hf_val_dataset_to_chat_dataset(dataset_slug, split="train"):
      """Turn a validation dataset that ships its own grids into chat-formatted rows.

      Grids and code are taken straight from the dataset columns; unlike the
      training path, no newline cleaning or length filtering is applied.

      Args:
          dataset_slug: Hugging Face dataset identifier passed to load_dataset.
          split: Name of the split to load.

      Returns:
          The dataset with "messages", "text" and "prompt" added to each row.
      """
      # NOTE: NO CLEANING APPLIED TO VALIDATION DATA
      raw_split = load_dataset(dataset_slug, split=split)

      def pair_grids(grid_inputs, grid_outputs):
          # One {'input', 'output'} record per aligned pair of grids.
          return [{'input': grid_in, 'output': grid_out}
                  for grid_in, grid_out in zip(grid_inputs, grid_outputs)]

      def create_val_chat_messages(example):
          # Everything the prompt needs is already present on the row.
          prompt_task = {
              'train': pair_grids(example["train_input"], example["train_output"]),
              'test': pair_grids(example["test_input"], example["test_output"]),
          }

          system_content, user_content = create_arc_prompt(prompt_task, prompt_loader, "soar")

          system_turn = {"role": "system", "content": system_content}
          user_turn = {"role": "user", "content": user_content}
          assistant_turn = {"role": "assistant", "content": f"```python\n{example['code']}\n```"}
          conversation = [system_turn, user_turn, assistant_turn]

          # Full conversation text, then the prompt-only text (assistant turn
          # removed, generation prompt appended).
          full_text = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=False)
          generation_prompt = tokenizer.apply_chat_template(
              conversation[:-1],
              tokenize=False,
              add_generation_prompt=True,
              enable_thinking=enable_thinking
          )

          row = {
              "messages": conversation,
              "text": full_text,
              "prompt": generation_prompt,
          }
          # Grid columns are copied over unchanged from the dataset row.
          for column in ("train_input", "train_output", "test_input", "test_output"):
              row[column] = example[column]
          row["task_id"] = example["task_id"]
          return row

      return raw_split.map(create_val_chat_messages, desc=f"build val chat fields ({split})")


  def build_dataset(train_slug: str, val_slug: str) -> DatasetDict:
      """Assemble the train/validation DatasetDict from two dataset slugs.

      The training split goes through the cleaning/filtering pipeline, the
      validation split is used as-is. Each split is then truncated to
      ``max_rows`` / ``max_validation_rows`` when that limit is truthy.
      """

      def cap_rows(split_ds, limit):
          # A falsy limit (None or 0) means "keep every row".
          if not limit:
              return split_ds
          keep = min(len(split_ds), limit)
          return split_ds.select(range(keep))

      # Training data (with cleaning applied), then optional row cap
      train_ds = cap_rows(hf_train_dataset_to_chat_dataset(train_slug, split="train"), max_rows)

      # Validation data (no cleaning applied), then optional row cap
      val_ds = cap_rows(hf_val_dataset_to_chat_dataset(val_slug, split="train"), max_validation_rows)

      return DatasetDict(train=train_ds, validation=val_ds)

  # ---------------------------------------------------------------------
  # Build the dataset
  # `data` is a DatasetDict with "train" and "validation" splits.
  # ---------------------------------------------------------------------
  data = build_dataset(train_slug, val_slug)


In [None]:
# Validation split sanity checks: every row must carry a truthy task_id,
# and no task_id may appear more than once in the validation slice.
val_ids = [ex["task_id"] for ex in data["validation"]]
assert all(val_ids), "❌ some validation rows are missing task_id"
assert len(val_ids) == len(set(val_ids)), "❌ duplicate task_id in validation slice"

In [None]:
# # print(data["train"][0])
# print(data["train"][0]['prompt'])

In [None]:
# print(data["train"][0]['text'])

In [None]:
# Show the fully templated conversation text of the first validation row.
print(data["validation"][0]['text'])

Let's see the structure of both datasets:

In [None]:
import numpy as np
from statistics import median

def length_stats(dataset, name=""):
    """
    Print min / median / max tokenised length of the 'text' column of a
    🤗 Dataset split. Uses the same tokenizer already in memory.

    Args:
        dataset: The split to measure; it must have a 'text' column. All
            columns are dropped from the temporary mapped copy, so extra
            columns are fine.
        name: Label printed in front of the statistics.

    Returns:
        None. The statistics are printed, not returned. An empty split prints
        a short notice instead of statistics.
    """
    # min()/max()/median() raise on an empty sequence, so report an empty
    # split explicitly instead of failing.
    if len(dataset) == 0:
        print(f"{name:>11}:  (empty split)")
        return

    # Tokenise in batches → list of list[int] → list[int] lengths
    lengths = dataset.map(
        lambda batch: {
            "len": [len(ids) for ids in tokenizer(batch["text"],
                                                  add_special_tokens=False
                                                 )["input_ids"]]
        },
        batched=True,
        remove_columns=dataset.column_names,   # keep only the new 'len' column
        keep_in_memory=True,
    )["len"]

    print(f"{name:>11}:  min={min(lengths):>4}  "
          f"median={int(median(lengths)):>4}  max={max(lengths):>4}")

# ── run for both splits (prints one line of token-length stats per split) ─────
length_stats(data["train"],       "train")
length_stats(data["validation"],  "validation")


### Pre-Training Data Integrity Tests
Before training, let's test the ground-truth code on a random sample of training examples to validate dataset quality and establish baseline performance.


In [None]:
import random

# Configuration for pre-training tests
NUM_TEST_EXAMPLES = 8  # Number of random examples to test (default for num_examples below)
RANDOM_SEED = 42  # Seeds the `random` module before sampling, for reproducible results

def _score_code_on_grid_pairs(executor, code, grid_inputs, grid_outputs, label, results_key, example_result):
    """Run ``code`` on each input grid and record the per-grid outcome.

    For every (input, expected output) pair one entry is appended to
    ``example_result[results_key]``. Error messages are appended to
    ``example_result["errors"]`` prefixed with ``label`` (e.g. "Train"), and
    ``example_result["code_executed"]`` is set once any run yields an output.

    Returns:
        The number of grids whose predicted output was scored as correct.
    """
    correct_count = 0
    for pair_idx, (grid_in, expected_out) in enumerate(zip(grid_inputs, grid_outputs)):
        try:
            predicted_output, error, timed_out = executor.execute_program_with_timeout(code, grid_in)

            if predicted_output is not None:
                example_result["code_executed"] = True
                score_result = scorer.score_grid(predicted_output, expected_out)
                is_correct = score_result["correct"]

                if is_correct:
                    correct_count += 1

                example_result[results_key].append({
                    "index": pair_idx,
                    "correct": is_correct,
                    "predicted": predicted_output,
                    "expected": expected_out,
                    "timed_out": timed_out
                })
            else:
                example_result[results_key].append({
                    "index": pair_idx,
                    "correct": False,
                    "error": error,
                    "timed_out": timed_out
                })
                if error:
                    example_result["errors"].append(f"{label} {pair_idx}: {error}")

        except Exception as e:
            # Deliberately broad: one failing grid must not abort the whole
            # integrity run, so the failure is recorded and the loop continues.
            example_result[results_key].append({
                "index": pair_idx,
                "correct": False,
                "error": str(e)
            })
            example_result["errors"].append(f"{label} {pair_idx}: {str(e)}")

    return correct_count


def run_pre_training_data_integrity_tests(dataset_split="train", num_examples=NUM_TEST_EXAMPLES):
    """
    Test ground-truth code from dataset on random examples to validate data quality.

    Args:
        dataset_split: Which split to test (should be "train" since validation has no ground-truth code)
        num_examples: Number of random examples to test

    Returns:
        A list with one result dict per tested example, holding the task id,
        dataset index, code, per-grid train/test results, train/test success
        rates, whether the code ever produced an output, and collected errors.
    """
    print(f"🧪 Running Pre-Training Data Integrity Tests")
    print(f"📊 Testing {num_examples} random examples from {dataset_split} split")
    print("=" * 60)

    # Set seed for reproducible sampling
    random.seed(RANDOM_SEED)

    # Get the dataset split
    dataset = data[dataset_split]

    # Randomly sample examples (or take all of them when fewer are available)
    total_examples = len(dataset)
    if num_examples > total_examples:
        print(f"⚠️  Requested {num_examples} examples but only {total_examples} available. Testing all.")
        sample_indices = list(range(total_examples))
    else:
        sample_indices = random.sample(range(total_examples), num_examples)

    # Initialize tracking variables
    results = []
    executor = ProgramExecutor(timeout=2.0, executor_type="unrestricted")

    print(f"\n🔍 Testing {len(sample_indices)} examples...\n")

    for i, idx in enumerate(sample_indices):
        example = dataset[idx]
        task_id = example.get("task_id", f"idx_{idx}")
        code = example["code"]

        print(f"[{i+1}/{len(sample_indices)}] Testing {task_id}")

        # Initialize results for this example
        example_result = {
            "task_id": task_id,
            "index": idx,
            "code": code,
            "train_results": [],
            "test_results": [],
            "train_success": 0,
            "test_success": 0,
            "code_executed": False,
            "errors": []
        }

        # Same procedure for the train grids and the test grids
        train_correct = _score_code_on_grid_pairs(
            executor, code, example["train_input"], example["train_output"],
            "Train", "train_results", example_result,
        )
        test_correct = _score_code_on_grid_pairs(
            executor, code, example["test_input"], example["test_output"],
            "Test", "test_results", example_result,
        )

        # Calculate success rates for this example (0 when there are no grids)
        example_result["train_success"] = train_correct / len(example["train_input"]) if example["train_input"] else 0
        example_result["test_success"] = test_correct / len(example["test_input"]) if example["test_input"] else 0

        # Print summary for this example
        total_train = len(example["train_input"])
        total_test = len(example["test_input"])

        print(f"  ✅ Train: {train_correct}/{total_train} ({example_result['train_success']:.1%})")
        print(f"  ✅ Test:  {test_correct}/{total_test} ({example_result['test_success']:.1%})")

        if example_result["errors"]:
            print(f"  ❌ Errors: {len(example_result['errors'])}")
        if not example_result["code_executed"]:
            print(f"  ⚠️  Code never executed successfully")
        print()

        results.append(example_result)

    return results

# Run the tests on the training split and keep the per-example results for the analysis cells below
data_integrity_results = run_pre_training_data_integrity_tests("train", NUM_TEST_EXAMPLES)

In [None]:
def analyze_data_integrity_results(results):
    """
    Analyze and display comprehensive statistics from the data integrity tests.

    Args:
        results: List of per-example result dicts. Each must provide the keys
            "task_id", "code_executed", "errors", "train_success",
            "test_success", "train_results" and "test_results"; every entry of
            the two *_results lists must provide "correct".

    Returns:
        None when ``results`` is empty. Otherwise a dict with the keys
        "total_examples", "executable_rate", "avg_train_success",
        "avg_test_success", "perfect_train_rate", "perfect_test_rate",
        "train_grid_accuracy" and "test_grid_accuracy".
    """
    print("=" * 60)
    print("📈 PRE-TRAINING DATA INTEGRITY RESULTS ANALYSIS")
    print("=" * 60)

    if not results:
        print("❌ No results to analyze!")
        return

    # Overall statistics
    total_examples = len(results)
    examples_with_executable_code = sum(1 for r in results if r["code_executed"])
    examples_with_errors = sum(1 for r in results if r["errors"])

    # Training performance statistics
    train_success_rates = [r["train_success"] for r in results]
    perfect_train = sum(1 for rate in train_success_rates if rate == 1.0)
    partial_train = sum(1 for rate in train_success_rates if 0 < rate < 1.0)
    failed_train = sum(1 for rate in train_success_rates if rate == 0.0)

    # Test performance statistics
    test_success_rates = [r["test_success"] for r in results]
    perfect_test = sum(1 for rate in test_success_rates if rate == 1.0)
    partial_test = sum(1 for rate in test_success_rates if 0 < rate < 1.0)
    failed_test = sum(1 for rate in test_success_rates if rate == 0.0)

    # Calculate overall metrics
    avg_train_success = sum(train_success_rates) / len(train_success_rates) if train_success_rates else 0
    avg_test_success = sum(test_success_rates) / len(test_success_rates) if test_success_rates else 0

    # Count total grids tested
    total_train_grids = sum(len(r["train_results"]) for r in results)
    total_test_grids = sum(len(r["test_results"]) for r in results)
    correct_train_grids = sum(sum(tr["correct"] for tr in r["train_results"]) for r in results)
    correct_test_grids = sum(sum(tr["correct"] for tr in r["test_results"]) for r in results)

    # Grid-level accuracy is computed once, guarded against "no grids at all",
    # and reused for both the printed report and the returned summary so the
    # two can never disagree (or divide by zero).
    train_grid_accuracy = correct_train_grids / total_train_grids if total_train_grids > 0 else 0
    test_grid_accuracy = correct_test_grids / total_test_grids if total_test_grids > 0 else 0

    print(f"\n🎯 OVERALL PERFORMANCE:")
    print(f"   Examples tested: {total_examples}")
    print(f"   Code executable: {examples_with_executable_code}/{total_examples} ({examples_with_executable_code/total_examples:.1%})")
    print(f"   Examples with errors: {examples_with_errors}/{total_examples} ({examples_with_errors/total_examples:.1%})")

    print(f"\n📊 TRAINING GRIDS PERFORMANCE:")
    print(f"   Average success rate: {avg_train_success:.1%}")
    print(f"   Perfect examples (100%): {perfect_train}/{total_examples} ({perfect_train/total_examples:.1%})")
    print(f"   Partial examples (>0% <100%): {partial_train}/{total_examples} ({partial_train/total_examples:.1%})")
    print(f"   Failed examples (0%): {failed_train}/{total_examples} ({failed_train/total_examples:.1%})")
    print(f"   Grid-level accuracy: {correct_train_grids}/{total_train_grids} ({train_grid_accuracy:.1%})")

    print(f"\n🎯 TEST GRIDS PERFORMANCE:")
    print(f"   Average success rate: {avg_test_success:.1%}")
    print(f"   Perfect examples (100%): {perfect_test}/{total_examples} ({perfect_test/total_examples:.1%})")
    print(f"   Partial examples (>0% <100%): {partial_test}/{total_examples} ({partial_test/total_examples:.1%})")
    print(f"   Failed examples (0%): {failed_test}/{total_examples} ({failed_test/total_examples:.1%})")
    print(f"   Grid-level accuracy: {correct_test_grids}/{total_test_grids} ({test_grid_accuracy:.1%})")

    # Detailed breakdown by example
    print(f"\n📋 DETAILED BREAKDOWN BY EXAMPLE:")
    print("-" * 60)

    for i, result in enumerate(results):
        task_id = result["task_id"]
        train_rate = result["train_success"]
        test_rate = result["test_success"]
        executed = "✅" if result["code_executed"] else "❌"
        error_count = len(result["errors"])

        print(f"[{i+1:2d}] {task_id}")
        print(f"     Train: {train_rate:5.1%} | Test: {test_rate:5.1%} | Executed: {executed} | Errors: {error_count}")

        # Up to three errors are listed in full; longer lists are abbreviated
        # to the first error plus a count of the rest.
        if result["errors"] and len(result["errors"]) <= 3:
            for error in result["errors"][:3]:
                print(f"     Error: {error}")
        elif len(result["errors"]) > 3:
            print(f"     Errors: {result['errors'][0]} ... (+{len(result['errors'])-1} more)")

    # Quality assessment
    print(f"\n🔍 DATASET QUALITY ASSESSMENT:")
    print("-" * 60)

    if avg_train_success > 0.9:
        print("✅ EXCELLENT: Ground-truth code performs very well on training examples")
    elif avg_train_success > 0.7:
        print("✅ GOOD: Ground-truth code performs well on training examples")
    elif avg_train_success > 0.5:
        print("⚠️  MODERATE: Ground-truth code has mixed performance on training examples")
    else:
        print("❌ POOR: Ground-truth code has low performance on training examples")

    if avg_test_success > 0.9:
        print("✅ EXCELLENT: Ground-truth code generalizes very well to test examples")
    elif avg_test_success > 0.7:
        print("✅ GOOD: Ground-truth code generalizes well to test examples")
    elif avg_test_success > 0.5:
        print("⚠️  MODERATE: Ground-truth code has mixed generalization to test examples")
    else:
        print("❌ POOR: Ground-truth code has poor generalization to test examples")

    if examples_with_executable_code == total_examples:
        print("✅ EXCELLENT: All ground-truth code is executable")
    elif examples_with_executable_code / total_examples > 0.9:
        print("✅ GOOD: Most ground-truth code is executable")
    else:
        print("⚠️  ISSUE: Some ground-truth code is not executable")

    print("\n" + "=" * 60)

    return {
        "total_examples": total_examples,
        "executable_rate": examples_with_executable_code / total_examples,
        "avg_train_success": avg_train_success,
        "avg_test_success": avg_test_success,
        "perfect_train_rate": perfect_train / total_examples,
        "perfect_test_rate": perfect_test / total_examples,
        "train_grid_accuracy": train_grid_accuracy,
        "test_grid_accuracy": test_grid_accuracy
    }

# Analyze the results; summary_stats is a dict of summary rates, or None when there were no results
summary_stats = analyze_data_integrity_results(data_integrity_results)


In [None]:
# Note: Validation set typically doesn't have ground-truth programs, so we only test training set

# Function to examine specific failing examples in detail
def examine_failure(results, example_index):
    """Print a detailed report for one entry of the integrity-test results.

    Args:
        results: List of per-example result dicts; each must provide
            "task_id", "code", "code_executed", "train_success",
            "test_success" and "errors".
        example_index: Position in ``results``. Negative values count from
            the end, as with normal list indexing.

    Returns:
        None. An out-of-range index (including any index into an empty list)
        prints a message instead of raising.
    """
    count = len(results)
    # Reject anything list indexing would reject, in both directions, so an
    # empty list or a too-negative index reports cleanly instead of raising.
    if not -count <= example_index < count:
        print(f"❌ Invalid index {example_index}. Only {len(results)} examples available.")
        return

    # Normalise negative indices so the printed example number is correct.
    position = example_index % count
    result = results[position]
    print(f"\n🔍 DETAILED EXAMINATION: Example {position + 1}")
    print(f"Task ID: {result['task_id']}")
    print("=" * 50)

    print(f"\n📝 GROUND TRUTH CODE:")
    print("-" * 30)
    print(result['code'])

    print(f"\n📊 EXECUTION SUMMARY:")
    print(f"Code executed successfully: {result['code_executed']}")
    print(f"Train success rate: {result['train_success']:.1%}")
    print(f"Test success rate: {result['test_success']:.1%}")
    print(f"Number of errors: {len(result['errors'])}")

    if result['errors']:
        print(f"\n❌ ERRORS:")
        for i, error in enumerate(result['errors']):
            print(f"  {i+1}. {error}")

# Check for failing examples: an example counts as failing when its train or
# test success rate is below 100%, or when its code never produced an output.
# The stored values are positions in data_integrity_results, suitable for
# passing to examine_failure.
failed_examples = [i for i, r in enumerate(data_integrity_results) 
                  if r['train_success'] < 1.0 or r['test_success'] < 1.0 or not r['code_executed']]

print(f"\n🔍 FAILING EXAMPLES SUMMARY:")
if failed_examples:
    print(f"Found {len(failed_examples)} examples with issues: {failed_examples}")
    print("To examine a specific failure, run: examine_failure(data_integrity_results, index)")
else:
    print("🎉 No failing examples found! All ground-truth code works perfectly.")

print(f"\n✅ Pre-training data integrity tests complete!")
print(f"📋 Summary stats saved in 'summary_stats' variable")
print(f"📊 Detailed results saved in 'data_integrity_results' variable")

In [None]:
# examine_failure(data_integrity_results, 0)

### Training Setup
Now we'll set up the trainer and then validate all evaluation components.

In [None]:
# print(help(model.fast_generate))

In [None]:
from datetime import datetime
import re

# Derive a dataset identifier from the training-set slug and build the run name.
print("Extract training set date and time as dataset identifiers")

# One unanchored search covers every slug that embeds YYYYMMDD_HHMMSS, including
# the SOAR format (soar-YYYYMMDD_HHMMSS-rows). The separate SOAR-specific regex
# that used to follow as a fallback could never match once this search had
# failed, so that unreachable branch has been removed.
match = re.search(r'(\d{8}_\d{6})', train_slug)
if match:
    timestamp = match.group(1)
    date_str = timestamp[:8]    # YYYYMMDD
    time_str = timestamp[9:]    # HHMMSS (index 8 is the underscore)
    print(f"Date: {date_str} (YYYYMMDD)")
    print(f"Time: {time_str} (HHMMSS)")
else:
    # No timestamp found - use dataset name
    print("No timestamp found, using dataset name.")
    dataset_name = train_slug.split('/')[-1]  # Get name part after last slash
    date_str = dataset_name
    time_str = ""

# <model name>_ds<dataset id>_<current wall-clock time>
run_name = f"{model_slug.split('/')[-1]}_ds{date_str}{time_str}_{datetime.now().strftime('%Y%m%d-%H%M%S')}"
print(f"Run name will be {run_name}")

In [None]:
# Legacy code extraction imports - now using utils.prompt_utils

In [None]:
import torch, subprocess, os, gc, time

def _print_gpu(prefix=""):
    alloc = torch.cuda.memory_allocated() / 2**20  # MiB
    reserved = torch.cuda.memory_reserved() / 2**20
    print(f"{prefix}CUDA‑alloc={alloc:.0f} MiB | reserved={reserved:.0f} MiB")

def _nvidia_smi():
    try:
        smi = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=memory.used,memory.free",
             "--format=csv,noheader,nounits"]).decode().strip()
        print("nvidia-smi (used/free MiB):", smi)
    except Exception:
        pass  # nvidia-smi not always available


# Chat-template markers per model family, stored as
# (instruction_tag, response_tag) pairs.
TEMPLATES = {
    "llama": (
        "<|start_header_id|>user<|end_header_id|>\n\n",
        "<|start_header_id|>assistant<|end_header_id|>\n\n",
    ),
    "gemma": (
        "<start_of_turn>user\n",
        "<start_of_turn>model\n",
    ),
    "qwen-coder": (
        "<|im_start|>user\n",
        "<|im_start|>assistant\n", # bare assistant header: the tag carries no <think> block
    ),
    "qwen": (
        "<|im_start|>user\n",
        # Tag ends with an EMPTY <think></think> block.
        # NOTE(review): the earlier "allows the model to keep reasoning" remark was
        # duplicated on both Qwen entries; presumably an empty think block belongs to
        # the non-reasoning format, so confirm which entry that remark was meant for.
        "<|im_start|>assistant\n<think>\n\n</think>\n\n",
    ),
    "mistral": (
        "[INST]",
        "[/INST]",
    )
}

# instruction_tag, response_tag = TEMPLATES["qwen-coder"]   # ← change if needed and comment out below

# Pick the instruction/response tags from the (case-insensitive) model slug.
model_slug_lower = model_slug.lower()

# Only Qwen-family slugs are handled here; anything else is an error.
if "qwen" not in model_slug_lower:
    raise ValueError(f"Unsupported model slug for Qwen template: {model_slug}")

# Coder and SOAR-Qwen checkpoints both use the "qwen-coder" tags; every other
# Qwen slug uses the "qwen" tags.
if "coder" in model_slug_lower or "soar-qwen" in model_slug_lower:
    instruction_tag, response_tag = TEMPLATES["qwen-coder"]
else:
    instruction_tag, response_tag = TEMPLATES["qwen"]

In [None]:
# Show which response tag the model-slug check selected
print(f"Response tag selected: {response_tag}")

In [None]:
import torch, tempfile, shutil
import torch.nn.functional as F
from trl import SFTTrainer
from vllm import SamplingParams
from unsloth_zoo.vllm_utils import generate_batches

# Evaluation-time generation settings shared by the trainer and the metric code.
MAX_NEW_TOKENS     = 1000   # token cap per completion (used as max_tokens below)
NUM_EVAL_ATTEMPTS  = 8      # completions generated per prompt during evaluation
SAMPLING_PARAMS    = SamplingParams(
    temperature=1.0, min_p=0.05, max_tokens=MAX_NEW_TOKENS
)
DEBUG=False                 # True enables the [DEBUG]/[PROMPT-DBG]/[GEN-DBG] prints in the trainer

class VllmSFTTrainer(SFTTrainer):
    """
    SFTTrainer whose evaluation step generates completions instead of logits.

    • Builds prompts by slicing each row at the first label that is not -100
    • Generates NUM_EVAL_ATTEMPTS completions per prompt via model.fast_generate
    • prediction_step returns (None, predictions, dummy_labels): predictions is
      a single int64 tensor of shape (prompts × NUM_EVAL_ATTEMPTS, MAX_NEW_TOKENS)
      in task-major order, and dummy_labels is an all -100 tensor of that shape.
      NOTE(review): the placeholder labels presumably exist so the Trainer still
      calls compute_metrics — confirm against the transformers evaluation loop.
    """

    # ── snapshot LoRA once per evaluate() run ──────────────────────────
    def evaluate(self, *args, **kwargs):
        """Save and reload the current LoRA adapter, then run the stock evaluate().

        The adapter is written to a temporary directory, loaded back through
        ``self.model.load_lora`` and kept on ``self._lora_req`` so that
        prediction_step can pass it to generation. The directory is removed
        afterwards even if evaluation raises.

        Positional arguments are forwarded as well as keyword arguments, so the
        override accepts every call form the parent ``evaluate`` accepts.

        Returns:
            Whatever the parent ``evaluate`` returns.
        """
        tmp = tempfile.mkdtemp()
        try:
            self.model.save_lora(tmp)
            self._lora_req = self.model.load_lora(tmp)
            return super().evaluate(*args, **kwargs)
        finally:
            shutil.rmtree(tmp, ignore_errors=True)

    # ── helper: build text prompts quickly ─────────────────────────────
    def _build_prompts(self, input_ids, label_ids):
        """Decode the prompt part of every row into text.

        Args:
            input_ids: Iterable of 1-D token-id tensors, one per example.
            label_ids: Matching iterable of label tensors; positions equal to
                -100 are treated as prompt, and the row is cut at the first
                position that is not -100.

        Returns:
            list[str] with one prompt per row. A leading BOS string is removed
            and an empty prompt is replaced by the BOS string.
        """
        tok   = self.tokenizer
        bos   = tok.bos_token or "<s>"     # fallback when the tokenizer has no BOS token
        prompts = []

        for ids_row, lbl_row in zip(input_ids, label_ids):
            # cut at first non--100 label
            try:
                cut = (lbl_row != -100).nonzero(as_tuple=True)[0][0].item()
            except IndexError:                 # all -100 → full row
                cut = ids_row.size(0)

            prompt = tok.decode(
                ids_row[:cut], skip_special_tokens=False
            ).lstrip()
            if prompt.startswith(bos):         # drop duplicate BOS
                prompt = prompt[len(bos):]
            prompts.append(prompt or bos)      # never empty
        return prompts

    # ── main override ─────────────────────────────────────────────────────
    def prediction_step(self, model, inputs,
                        prediction_loss_only=False, ignore_keys=None):
        """Generate NUM_EVAL_ATTEMPTS completions for every prompt in the batch.

        Args:
            model: Object exposing ``fast_generate``; used for generation.
            inputs: Batch dict; ``inputs["input_ids"]`` and ``inputs["labels"]``
                are read.
            prediction_loss_only: When true nothing is generated and
                ``(None, None, None)`` is returned.
            ignore_keys: Accepted for signature compatibility; not used.

        Returns:
            ``(None, pred_batch, dummy_labels)``. Row ``t * NUM_EVAL_ATTEMPTS + a``
            of ``pred_batch`` holds attempt ``a`` for prompt ``t``, padded or
            truncated to MAX_NEW_TOKENS. ``dummy_labels`` is filled with -100.
        """
        if prediction_loss_only:
            return (None, None, None)

        ids    = inputs["input_ids"].to(self.args.device)
        labels = inputs["labels"]
        prompts = self._build_prompts(ids, labels)

        if DEBUG:
            print(f"[DEBUG] generating {NUM_EVAL_ATTEMPTS} attempt(s) per prompt")
            print("\n[PROMPT-DBG] first prompt:\n", prompts[0], "…")
            if len(prompts) > 1:
                print("\n[PROMPT-DBG] second prompt:\n", prompts[1], "…")
            print("[PROMPT-DBG] prompt length (chars):", len(prompts[0]))

        # --- generate k batches -------------------------------------------------
        # One fast_generate call per attempt, each over the full prompt list.
        batch_results = []
        for b in range(NUM_EVAL_ATTEMPTS):
            outs = model.fast_generate(
                prompts,
                sampling_params=SAMPLING_PARAMS,
                lora_request=getattr(self, "_lora_req", None),
            )
            batch_results.append([o.outputs[0].token_ids for o in outs])

            if DEBUG and ids.size(0) > 0:          # preview first task in this batch
                tok_preview = outs[0].outputs[0].token_ids[:120]
                print(f"\n[GEN-DBG] batch {b+1}/{NUM_EVAL_ATTEMPTS}  raw ids:",
                      tok_preview[:20], "… len", len(tok_preview))
                print("[GEN-DBG] preview:",
                      self.tokenizer.decode(tok_preview, skip_special_tokens=True)[:200], "…")

        # --- transpose to task-major order -------------------------------------
        # batch_results shape: [attempt][task]  →  want [task][attempt]
        transposed = list(zip(*batch_results))           # len = num_tasks
        all_tok_lists = [seq for task_seqs in transposed for seq in task_seqs]

        # --- pad / truncate -----------------------------------------------------
        # The same pad-id expression is used by compute_arc_metrics to strip
        # padding again, so the two must stay in sync.
        pad_id  = self.tokenizer.pad_token_id or self.tokenizer.eos_token_id or 0
        total   = len(all_tok_lists)                     # tasks × attempts
        pred_batch = torch.full(
            (total, MAX_NEW_TOKENS),
            pad_id, dtype=torch.long, device=ids.device
        )
        for i, seq in enumerate(all_tok_lists):
            seq_t = torch.tensor(seq, dtype=torch.long, device=ids.device)
            pred_batch[i, : min(len(seq), MAX_NEW_TOKENS)] = seq_t[:MAX_NEW_TOKENS]

        if DEBUG:
            print("[SHAPE-DBG] pred_batch:", tuple(pred_batch.shape))

        dummy_labels = torch.full_like(pred_batch, -100)
        return (None, pred_batch, dummy_labels)

In [None]:
import os, logging, multiprocessing as mp
from functools import partial
from concurrent.futures import ProcessPoolExecutor
from typing import Dict

logger = logging.getLogger(__name__)

# ---------- tight helpers --------------------------------------------
from utils.scoring import GridScorer          # already imported elsewhere

def _debug_attempt(task_idx: int,
                   attempt_idx: int,
                   ex: Dict,
                   att: Dict,
                   scorer: GridScorer):
    """Print a per-grid breakdown of one scored attempt.

    Args:
        task_idx: Task number shown in the header (zero-padded to two digits).
        attempt_idx: Zero-based attempt number; printed one-based.
        ex: Task example; only the lengths of ``ex["train_output"]`` and
            ``ex["test_output"]`` drive the loops.
        att: Attempt record with 'train_results', 'test_results',
            'train_accuracy', 'all_test_correct', 'is_transductive',
            'any_timeout', 'program_extracted' and 'code_ran'.
        scorer: Not used in the body.
    """
    header = f"[DBG] Task {task_idx:02d}  Attempt {attempt_idx+1}"
    rule = "-" * len(header)
    print("\n" + rule)
    print(header)
    print(rule)

    print("TRAIN grids:")
    train_results = att_results = None  # placeholders; bound per section below
    for i, _ in enumerate(ex["train_output"]):
        prediction = att["train_results"][i]["prediction"]
        passed = att["train_results"][i]["correct"]
        mark = '✓' if passed else '✗'
        print(f"  • ex{i}: {mark}\n{prediction}\n")

    print("TEST grids:")
    for i, _ in enumerate(ex["test_output"]):
        prediction = att["test_results"][i]["prediction"]
        passed = att["test_results"][i]["correct"]
        mark = '✓' if passed else '✗'
        print(f"  • test{i}: {mark}\n{prediction}\n")

    # Summary flags for the attempt as a whole
    accuracy_part = f"train_acc = {att['train_accuracy']:.2f}  "
    print(accuracy_part + f"all_test_correct = {att['all_test_correct']}")
    transductive_part = f"transductive = {att['is_transductive']}  "
    print(transductive_part + f"timeout = {att['any_timeout']}")
    extracted_part = f"program_extracted = {att['program_extracted']}, "
    print(extracted_part + f"code_ran = {att['code_ran']}")
    print(rule + "\n")

def _clean_tokens(row, pad_id, vocab_n):
    """Drop pad_id and -100; assert remaining IDs are valid."""
    keep = (row != pad_id) & (row != -100)
    clean = row[keep]
    bad = clean[(clean < 0) | (clean >= vocab_n)]
    assert len(bad) == 0, f"invalid ids {bad.tolist()}"
    return clean

def _decode_one(tok, row, pad_id, vocab_n):
    """Decode one prediction row to text after cleaning it with _clean_tokens.

    Special tokens are skipped during decoding.
    """
    token_ids = _clean_tokens(row, pad_id, vocab_n).tolist()
    return tok.decode(token_ids, skip_special_tokens=True)

# ---------- main metric fn -------------------------------------------
def compute_arc_metrics(eval_pred, *, debug=False):
    """Decode generated completions, run them on each task and aggregate rates.

    Predictions are expected in task-major order with NUM_EVAL_ATTEMPTS
    consecutive rows per task, matching the order of ``trainer.eval_dataset``.

    Args:
        eval_pred: Pair ``(predictions, labels)``. Only the predictions are
            used; each row must offer ``.tolist()``.
        debug: Keyword-only. When true, per-attempt details are printed.

    Returns:
        dict of metric name → value. Task-level counts are divided by the task
        total, response-level counts by the response total (both clamped to at
        least 1), plus the totals themselves.

    Raises:
        AssertionError: If the number of prediction rows does not correspond to
            the number of evaluation tasks.
    """
    preds, _ = eval_pred                   # labels are placeholders and unused
    tok      = trainer.tokenizer
    # Same pad-id expression as VllmSFTTrainer.prediction_step, so the padding
    # written there is exactly what gets stripped here.
    pad_id   = tok.pad_token_id or tok.eos_token_id or 0
    vocab_n  = len(tok)

    # 1 · strip padding / out-of-vocabulary ids, then decode all rows at once
    def _filter(row):
        return [t for t in row.tolist()        # tensor/array row → list[int]
                if t != pad_id and t != -100 and 0 <= t < vocab_n]

    filtered = [_filter(row) for row in preds]
    decoded  = tok.batch_decode(filtered, skip_special_tokens=True)

    # 2 · map predictions back to tasks / attempts ---------------------
    raw_ds = trainer.eval_dataset

    # Normalise a column-dict dataset to a list of row dicts BEFORE the size
    # check: len() of a column dict is the number of columns, not of tasks.
    if isinstance(raw_ds, dict):           # col-dict → row-dict list
        n = len(next(iter(raw_ds.values())))
        raw_ds = [{k: v[i] for k, v in raw_ds.items()} for i in range(n)]

    total_preds = len(decoded)
    num_tasks   = total_preds // NUM_EVAL_ATTEMPTS

    assert num_tasks == len(raw_ds), (
        f"{total_preds} predictions imply {num_tasks} tasks, "
        f"but the eval dataset has {len(raw_ds)}"
    )

    # ---------- evaluate each task ------------------------------------
    scorer   = GridScorer()
    executor = ProgramExecutor(timeout=0.5, executor_type="unrestricted")

    results = []
    for task_idx in range(num_tasks):
        ex = raw_ds[task_idx]

        start = task_idx * NUM_EVAL_ATTEMPTS
        assert preds[start:start+NUM_EVAL_ATTEMPTS].shape[0] == NUM_EVAL_ATTEMPTS

        task_decoded = decoded[start : start + NUM_EVAL_ATTEMPTS]

        attempt_details = _evaluate_attempts(
            task_idx, ex, task_decoded, scorer, executor, debug
        )
        results.append({"task_data": _build_task_data(task_idx, ex),
                        "attempt_details": attempt_details})

        # ── quick inspection block ────────────────────────────────────────────
        if debug:
            print(f"\n[INSPECT] task {task_idx}")
            for att in attempt_details:
                print("  train_acc:",
                      f"{att['train_accuracy']:.2f}",
                      "pred:", att["test_predicted"],
                      "transductive:", att["is_transductive"])

    # ---------- aggregate metrics -------------------------------------
    metrics = calculate_task_metrics(results, max_tokens=MAX_NEW_TOKENS)
    tot     = max(1, metrics["total"])             # avoid dividing by zero
    tresp   = max(1, metrics["total_responses"])

    return {
        "all_test_correct":      metrics["all_test_correct"]   / tot,
        "all_train_correct":     metrics["all_train_correct"]  / tot,
        "min1_train_correct":    metrics["min1_train_correct"] / tot,
        "min1_code_success":     metrics["min1_code_success"] / tot,
        "weighted_voting_pass2": metrics.get("weighted_pass2", 0) / tot,
        "train_majority_pass2":  metrics.get("train_majority_pass2", 0) / tot,
        "max_length_rate":       metrics["max_length_responses"] / tresp,
        "timeout_rate":          metrics["timeout_responses"]   / tresp,
        "total_tasks":           tot,
        "total_responses":       tresp,
        "total_attempts":        tot * NUM_EVAL_ATTEMPTS,
    }

# ---------- helpers broken out for clarity ---------------------------
def _build_task_data(idx, ex):
    return {
        "task_id": ex.get("task_id", f"task_{idx}"),
        "train": [{"input": i, "output": o} for i, o in zip(ex["train_input"], ex["train_output"])],
        "test":  [{"input": i, "output": o} for i, o in zip(ex["test_input"],  ex["test_output" ])],
    }

def _evaluate_attempts(task_idx, ex, decoded_list, scorer, executor, debug):
    """Score every decoded completion of one task and return the attempt records.

    With ``debug`` set, each attempt is also dumped via _debug_attempt and the
    first attempt's leading 200 characters are printed.
    """
    scored = []
    for attempt_idx, gen_text in enumerate(decoded_list):
        scored.append(
            _score_single_attempt(
                task_idx, attempt_idx, gen_text, ex, scorer, executor, debug
            )
        )
        if not debug:
            continue

        _debug_attempt(task_idx, attempt_idx, ex, scored[-1], scorer)
        if attempt_idx == 0:
            print(f"[TASK {task_idx}] first attempt preview:\n{gen_text[:200]}\n")
    return scored

def _score_single_attempt(task_idx, attempt_idx, gen_text, ex, scorer, executor, debug):
    """Extract the program from one generated response and score it on a task.

    Args:
        task_idx: Index of the task; used for the fallback task id and messages.
        attempt_idx: Zero-based attempt number, stored in the record.
        gen_text: Decoded model response (text).
        ex: Task example providing 'train_input', 'train_output', 'test_input'
            and 'test_output' (and optionally 'task_id').
        scorer: Passed through to _run_cases.
        executor: Passed through to _run_cases.
        debug: When true, the extracted code prefix is printed.

    Returns:
        dict describing the attempt: program text, per-case train/test results,
        the test prediction(s), correctness flags, timeout / max-length flags,
        train accuracy and the transduction flag. ``train_accuracy`` is 0.0 when
        there are no train results.
    """
    code = extract_python_code(gen_text)
    if debug:
        print("[CODE-DBG] extracted:", "None" if not code else code[:120])

    has_py = bool(code and code.strip())

    train_results, train_ok = _run_cases(
        ex["train_input"], ex["train_output"], code, has_py, scorer, executor
    )
    test_results, test_ok = _run_cases(
        ex["test_input"], ex["test_output"], code, has_py, scorer, executor
    )

    # Diagnostic dump for examples that arrive without any train cases
    if len(train_results) == 0:
        print(f"⚠️  ZERO DIVISION DEBUG for task {task_idx}, attempt {attempt_idx}:")
        print(f"  ex keys: {list(ex.keys())}")
        print(f"  train_input type: {type(ex.get('train_input', 'MISSING'))}")
        print(f"  train_input length: {len(ex.get('train_input', []))}")
        print(f"  train_output type: {type(ex.get('train_output', 'MISSING'))}")
        print(f"  train_output length: {len(ex.get('train_output', []))}")
        print(f"  has_py: {has_py}")
        print(f"  code: {repr(code[:100] if code else 'None')}")

    any_timeout = any(r["timeout"] for r in train_results + test_results)
    # "Ran" means at least one train case finished with an empty error string.
    code_ran = any(r["error"] == "" for r in train_results) if has_py else False

    is_cheat = False
    if has_py:
        from utils.transduction import is_transduction_cheating
        is_cheat, _ = is_transduction_cheating(code, _build_task_data(task_idx, ex))

    if len(test_results) == 1:
        test_predicted = test_results[0]["prediction"]   # raw grid
    else:
        test_predicted = tuple(r["prediction"] for r in test_results)

    # Guard the division: with no train results the original expression raised
    # ZeroDivisionError and aborted the whole evaluation run.
    if train_results:
        train_accuracy = sum(r["correct"] for r in train_results) / len(train_results)
    else:
        train_accuracy = 0.0

    return dict(
        task_id           = ex.get("task_id", f"task_{task_idx}"),
        attempt           = attempt_idx,
        program           = code or "",
        program_extracted = has_py,
        code_ran          = code_ran,
        train_results     = train_results,
        test_results      = test_results,
        test_predicted    = test_predicted,
        all_train_correct = train_ok,
        all_test_correct  = test_ok,
        any_timeout       = any_timeout,
        # NOTE(review): this compares the response length in CHARACTERS with
        # MAX_NEW_TOKENS, which this file uses as a TOKEN cap — the flag likely
        # fires far too early. Fixing it needs the token count, which is not
        # available in this function.
        max_length        = len(gen_text) >= MAX_NEW_TOKENS - 10,
        response_text     = gen_text[:200] + ("…" if len(gen_text) > 200 else ""),
        train_accuracy    = train_accuracy,
        is_transductive   = is_cheat,
    )


def _run_cases(inputs, outputs, code, has_py, scorer, executor):
  results = []
  correct = 0

  # DEBUG: Check inputs
  if len(inputs) == 0:
      print(f"⚠️  _run_cases got EMPTY inputs! inputs={inputs}, outputs={outputs}")
      return results, correct == len(inputs)

  for inp, gold in zip(inputs, outputs):
      if has_py:
          pred, err, to = executor.execute_program_with_timeout(code, inp)
      else:
          pred, err, to = None, "no program", False

      # Normalize prediction format before scoring
      if pred is not None and err == "":
          try:
              # Convert single number to 1x1 grid
              if isinstance(pred, (int, float)):
                  pred = [[int(pred)]]
              # Convert 1D list to single row grid  
              elif isinstance(pred, list) and pred and isinstance(pred[0], (int, float)):
                  pred = [pred]  # [[1, 2, 3]] becomes [[[1, 2, 3]]]
              # Ensure it's a proper 2D list
              elif not isinstance(pred, list) or not pred or not isinstance(pred[0], list):
                  pred = None
                  err = "invalid_output_format"

              if pred is not None:
                  score_result = scorer.score_grid(pred, gold)
                  ok = score_result["correct"]
              else:
                  ok = False
          except Exception as e:
              pred = None
              err = f"scoring_error: {str(e)}"
              ok = False
      else:
          ok = False

      correct += ok
      results.append(dict(correct=ok, prediction=pred, error=err, timeout=to))

  return results, correct == len(inputs)


In [None]:
# Update the trainer configuration to use the fixed custom metrics
from trl import SFTTrainer, SFTConfig

# make sure the attr exists so Unsloth can safely delete it
setattr(model, "_flag_for_generation", True)

# Build the trainer: SFT on data["train"], generation-based evaluation on
# data["validation"] scored by compute_arc_metrics.
trainer = VllmSFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=data["train"],
    eval_dataset=data["validation"],
    args=SFTConfig(
        dataset_text_field="text",
        per_device_train_batch_size = batch_size_global,
        # the whole validation set is evaluated as a single batch
        per_device_eval_batch_size = len(data["validation"]),
        # per_device_eval_batch_size = batch_size_global,
        # Targets an effective batch of ~32 samples. Clamped to at least 1:
        # for batch_size_global > 32 the bare int(32 / batch_size_global) is 0,
        # which is not a valid number of accumulation steps.
        gradient_accumulation_steps=max(1, int(32 / batch_size_global)),
        warmup_steps=5,
        eval_steps=0.1,       # float < 1: given as a fraction rather than a step count
        do_eval=True,
        eval_strategy="steps",
        num_train_epochs=2,
        # max_steps=3,
        learning_rate=1e-4,
        logging_steps=0.05,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="constant",
        seed=3407,
        report_to="tensorboard",
        logging_dir=f"./logs/{run_name}",
        remove_unused_columns=True,
        save_strategy="steps",
        save_steps=0.2,
        save_total_limit=3,
        prediction_loss_only=False   # predictions are needed for compute_metrics
    ),
    compute_metrics=compute_arc_metrics,
    # callbacks=[VllmMemoryCallback()], # best not to mess with vllm
)

### Evaluation Metrics and Functions Testing
Now that the trainer is set up, let's test all evaluation components to ensure they work correctly with the actual training setup.


In [None]:
from transformers.trainer_utils import EvalPrediction

# Import existing tests from utils/tests instead of duplicating logic
# Note: Assumes we're running from the llm_python directory structure

# Test Configuration
print("🧪 TESTING EVALUATION METRICS AND FUNCTIONS")
print("=" * 60)

def run_existing_tests():
    """Run existing tests from utils/tests to validate core functionality.

    Four independent checks are executed; each one prints PASSED / FAILED /
    ERROR and never raises, because any exception inside a check is caught and
    recorded as a failure.

    Returns:
        dict mapping 'extract_python_code', 'calculate_task_metrics',
        'grid_scorer' and 'program_executor' to True (passed) or False.
    """
    print("\n📋 Running existing tests from utils/tests...")

    test_results = {}

    # Test extract_python_code using existing tests
    print("\n🐍 Testing extract_python_code (from utils/tests)...")
    try:
        from utils.tests.test_prompt_utils import TestPromptUtils
        import unittest
        import os

        # Create test suite and run extract_python_code tests
        suite = unittest.TestLoader().loadTestsFromTestCase(TestPromptUtils)
        # Suppress test output by redirecting to the null device; the handle is
        # closed again once the run finishes (it used to be leaked).
        with open(os.devnull, 'w') as devnull:
            result = unittest.TextTestRunner(stream=devnull).run(suite)

        if result.wasSuccessful():
            print("  ✅ extract_python_code tests - PASSED")
            test_results['extract_python_code'] = True
        else:
            print("  ❌ extract_python_code tests - FAILED")
            print(f"     Failures: {len(result.failures)}, Errors: {len(result.errors)}")
            test_results['extract_python_code'] = False

    except Exception as e:
        print(f"  ❌ extract_python_code tests - ERROR: {e}")
        test_results['extract_python_code'] = False

    # Test calculate_task_metrics using existing tests
    print("\n📈 Testing calculate_task_metrics (from utils/tests)...")
    try:
        from utils.tests.test_metrics_voting_integration import TestMetricsVotingIntegration

        # Run specific tests to validate metrics calculation.
        # NOTE(review): the test methods are called directly on a bare
        # instance, so any setUp/tearDown the test class defines is not run —
        # confirm the class needs none.
        test_instance = TestMetricsVotingIntegration()
        test_instance.test_single_test_case_voting_success()
        test_instance.test_edge_cases()

        print("  ✅ calculate_task_metrics tests - PASSED")
        test_results['calculate_task_metrics'] = True

    except Exception as e:
        print(f"  ❌ calculate_task_metrics tests - ERROR: {e}")
        test_results['calculate_task_metrics'] = False

    # Test basic grid scoring logic (simple inline test since no dedicated test exists)
    print("\n📊 Testing GridScorer (basic validation)...")
    try:
        scorer = GridScorer()

        # Test perfect match
        result1 = scorer.score_grid([[1, 2], [3, 4]], [[1, 2], [3, 4]])
        assert result1["correct"] == True, "Perfect match should be correct"

        # Test mismatch
        result2 = scorer.score_grid([[1, 2], [3, 4]], [[1, 2], [3, 5]])
        assert result2["correct"] == False, "Mismatch should be incorrect"

        print("  ✅ GridScorer basic tests - PASSED")
        test_results['grid_scorer'] = True

    except Exception as e:
        print(f"  ❌ GridScorer basic tests - ERROR: {e}")
        test_results['grid_scorer'] = False

    # Test basic program execution (simple inline test)
    print("\n⚙️ Testing ProgramExecutor (basic validation)...")
    try:
        executor = ProgramExecutor(timeout=1.0, executor_type="unrestricted")

        # Test working code
        working_code = "def transform(grid): return [[c+1 for c in r] for r in grid]"
        result, error, timed_out = executor.execute_program_with_timeout(working_code, [[1, 2]])
        assert result == [[2, 3]], f"Expected [[2, 3]], got {result}"

        # Test broken code
        broken_code = "def transform(grid): return grid[999]"  # Index error
        result, error, timed_out = executor.execute_program_with_timeout(broken_code, [[1, 2]])
        assert result is None, "Broken code should return None"

        print("  ✅ ProgramExecutor basic tests - PASSED")
        test_results['program_executor'] = True

    except Exception as e:
        print(f"  ❌ ProgramExecutor basic tests - ERROR: {e}")
        test_results['program_executor'] = False

    return test_results

# Run the existing tests
existing_test_results = run_existing_tests()

In [None]:
# Sanity check: show the metric function attached to the trainer and the
# prediction_loss_only setting it was configured with
print("metrics fn:", trainer.compute_metrics)
print("prediction_loss_only:", trainer.args.prediction_loss_only)

In [None]:
# @title Show current memory stats
# Baseline taken here; start_gpu_memory and max_memory are reused by the
# final memory report after training. Values are converted from bytes to GB
# (three divisions by 1024) and rounded to 3 decimals.
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
# Train on completions only, i.e. compute the cross-entropy loss on the response part only (applied in the next cell).

In [None]:
from unsloth.chat_templates import train_on_responses_only # or run the code above if not using unsloth

# TODO: to support reasoning, the masking would need to be chosen dynamically
# per example; this is not implemented yet.
# Wrap the trainer with Unsloth's train_on_responses_only, using the
# instruction/response tags selected earlier from the model slug.
# NOTE(review): presumably this masks the labels of everything outside the
# response part so only response tokens contribute to the loss — confirm
# against the Unsloth documentation.
trainer = train_on_responses_only(
    trainer,
    instruction_part = instruction_tag,
    response_part = response_tag,
    # force_match=False # comment out to set true for a cleaner masking
)

In [None]:
# Decode the first training example's input_ids to inspect the full tokenized text
tokenizer.decode(trainer.train_dataset[0]["input_ids"])

In [None]:
# Same example, labels view: positions labelled -100 are swapped for the pad
# token and then rendered as spaces, so only the remaining label tokens are readable
tokenizer.decode([tokenizer.pad_token_id if x == -100 else x for x in trainer.train_dataset[0]["labels"]]).replace(tokenizer.pad_token, " ")

In [None]:
# test_toks=tokenizer.encode('test')
# print(test_toks)

# messages = [{"role": "user", "content": "test"}]
# m_toks = tokenizer.apply_chat_template(messages, tokenize=False)
# print(m_toks)

# m_tokenized = tokenizer.encode(m_toks)

# print("\nDecoded")
# prompt = tokenizer.decode(m_tokenized, skip_special_tokens=False).lstrip()
# print(prompt)

# print(tokenizer.chat_template)

# print(tokenizer.bos_token_id)

In [None]:
  # # # Pick a task ID that was failing and load it directly
  # # task_id = "007bbfb7"  # One of the ones from your error logs
  # # task_data = task_loader.load_task(task_id, dataset="arc-agi-1")
  # # print(f"Direct TaskLoader for {task_id}:")
  # # print(f"  Training examples: {len(task_data['train'])}")
  # # print(f"  Test examples: {len(task_data['test'])}")
  # # print(f"  First train input shape: {np.array(task_data['train'][0]['input']).shape if task_data['train'] else 'NO TRAIN DATA'}")

  # # Look at a few examples from the validation dataset
  # val_ds = data["validation"]
  # print("Validation dataset length:", len(val_ds))
  # print("\nFirst example keys:", list(val_ds[0].keys()))
  # print("\nFirst few examples:")
  # for i in range(min(3, len(val_ds))):
  #     ex = val_ds[i]
  #     print(f"\nExample {i}:")
  #     print(f"  task_id: {ex.get('task_id', 'MISSING')}")
  #     print(f"  train_input length: {len(ex.get('train_input', []))}")
  #     print(f"  train_output length: {len(ex.get('train_output', []))}")
  #     print(f"  test_input length: {len(ex.get('test_input', []))}")
  #     print(f"  test_output length: {len(ex.get('test_output', []))}")

  # # Test create_chat_messages on a single example from the HF dataset
  # from datasets import load_dataset
  # raw_ds = load_dataset("Trelis/grids_only_arc-agi-1_shortest_evaluation_10_20250806_201429", split="train")
  # print("Raw dataset example keys:", list(raw_ds[0].keys()))
  # print("Raw dataset task_id:", raw_ds[0].get("task_id"))

  # # Try the create_chat_messages function on just one example
  # single_result = create_chat_messages(raw_ds[0])
  # if single_result:
  #     print("Processed example train_input length:", len(single_result.get("train_input", [])))
  # else:
  #     print("create_chat_messages returned None!")

In [None]:
# Run one evaluation pass before training starts and print the resulting
# metrics; "dev" is passed as the metric key prefix
metrics = trainer.evaluate(metric_key_prefix="dev")
print(metrics)

In [None]:
# Compare the eval batch size with the number of evaluation examples
print("eval batch size:", trainer.args.eval_batch_size)
print("dataset length :", len(trainer.eval_dataset))

In [None]:
# # idx = 0
# # one_prompt = prompts[idx]
# # print(one_prompt)

# idx=1

# one_prompt = """<|im_start|>system
# You are an AI assistant specialized in solving Abstract Reasoning Corpus (ARC-AGI) tasks by reasoning and generating Python code.<|im_end|>
# <|im_start|>user
# You are an AI assistant specialized in solving Abstract Reasoning Corpus (ARC-AGI) tasks by generating Python code.
# Your goal is to analyze input-output grid pairs. The outputs were produced by applying a transformation rule to the inputs. Implement the transformation rules as a Python function.
# You should only write the implemented the transformation in code.
# You must write code in triple backticks (```python and then ```). You must write a function called 'transform' which takes a single argument, the input grid as 'list[list[int]]', and returns the transformed grid (also as 'list[list[int]]').
# You should make sure that you implement a version of the transformation which works in general (at least for all given input-output pairs and test input pairs).
# The number in the input grid can be mapped to the following colors: 0:Black; 1:Blue; 2:Red; 3:Green; 4:Yellow; 5:Grey; 6:Pink; 7:Orange; 8:Purple; 9:Brown
# Now, solve the following ARC-AGI task:
# # Task to solve:
# ## Input 1 (grid shape: 5 by 5):
# [[1 1 0 1 1] [1 0 0 0 1] [0 0 0 0 0] [0 1 0 2 2] [1 1 0 2 2]]
# ## Output 1 (grid shape: 2 by 2):
# [[1 0] [1 1]]

# ## Input 2 (grid shape: 5 by 5):
# [[1 0 0 1 1] [1 1 0 1 0] [0 0 0 0 0] [1 1 0 2 2] [0 1 0 2 2]]
# ## Output 2 (grid shape: 2 by 2):
# [[0 1] [1 1]]

# ## Input 3 (grid shape: 5 by 5):
# [[1 1 0 0 1] [0 0 0 0 1] [0 0 0 0 0] [0 0 0 2 2] [1 1 0 2 2]]
# ## Output 3 (grid shape: 2 by 2):
# [[1 0] [1 0]]

# ## Test Input 1 (grid shape: 5 by 5):
# [[1 1 0 0 1] [0 1 0 1 1] [0 0 0 0 0] [1 0 0 2 2] [1 1 0 2 2]]
# ## Expected Test Output 1 (grid shape: 2 by 2):
# [[1 1] [1 0]]
# <|im_end|>
# <|im_start|>assistant"""

# executor = ProgramExecutor(timeout=0.5, executor_type="unrestricted")
# scorer   = GridScorer()

# out = model.fast_generate([one_prompt], sampling_params=SAMPLING_PARAMS)
# answer = tokenizer.decode(out[0].outputs[0].token_ids, skip_special_tokens=True)
# print(answer)

# code = extract_python_code(answer)
# print("code extracted:", bool(code))

# if code:
#     inp  = trainer.eval_dataset[idx]["train_input"][0]
#     gold = trainer.eval_dataset[idx]["train_output"][0]

#     pred, err, _ = executor.execute_program_with_timeout(code, inp)
#     print("pred:", pred)
#     print("gold:", gold)
#     print("match:", scorer.score_grid(pred, gold)["correct"])
#     if err:
#         print("runtime error:", err)

Let's train the model! To resume a training run, call `trainer.train(resume_from_checkpoint = True)` instead.

In [None]:
# Start training; the returned object is kept for the time/memory report below
trainer_stats = trainer.train()

In [None]:
# @title Show final memory and time stats
# Relies on start_gpu_memory and max_memory from the earlier memory-stats cell
# and on trainer_stats from the training cell.
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)  # peak reserved, GB
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)               # growth since the baseline
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

In [None]:
# Dump the full object returned by trainer.train()
print(trainer_stats)

In [None]:
# Check a few examples: print the text length and the last 200 characters of
# the first two training examples
for i in range(2):
    example = data["train"][i]
    print(f"\nExample {i} text length: {len(example['text'])}")
    print(f"Last 200 chars: {example['text'][-200:]}")

<a name="Inference"></a>
### Inference
Let's run the model via Unsloth native inference! According to the `Qwen-3` team, the recommended settings for reasoning inference are `temperature = 0.6, top_p = 0.95, top_k = 20`

For normal chat based inference, `temperature = 0.7, top_p = 0.8, top_k = 20`

In [None]:
# Legacy inference testing example - replaced by actual test in cell 61

In [None]:
# print(tokenizer.chat_template)

In [None]:
# Display the 'prompt' field of the first validation example
data["validation"]['prompt'][0]

In [None]:
messages = [
    {"role" : "system", "content" : "You are an expert at solving abstract reasoning puzzles. Write clean, efficient Python code."},
    {"role" : "user", "content" : "You are solving an ARC (Abstraction and Reasoning Corpus) task. \nI will show you training examples with input and output grids, plus a test input grid. Your task is to:\n\n1. **Analyze the training examples** to discover patterns that map input grids to output grids\n2. **Write a Python program** that implements your best understanding of the transformation  \n3. **DO NOT predict or generate the test output** - your job is only to write the transformation program\n4. **Attempt a solution** - even if the pattern isn't completely clear, provide your best hypothesis\n5. **Do not repeat the same transformation** - if you have already tried a transformation, do not repeat it.\n\n**IMPORTANT: Your transformation must always produce a 10\u00d710 output grid.**\n\nThe test input is shown for context so you understand what type of grid your program will eventually process. Focus on learning patterns from training examples and writing code that captures your understanding.\n\nTraining Examples:\n\nExample 1:\nInput:\n5 0 0 5 0 0 0 5 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\nOutput:\n5 0 0 5 0 0 0 5 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n2 0 0 2 0 0 0 2 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n2 0 0 2 0 0 0 2 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n\nExample 2:\nInput:\n0 5 0 5 5 0 0 5 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\nOutput:\n0 5 0 5 5 0 0 5 0 0\n0 0 0 0 0 0 0 0 0 0\n0 2 0 2 2 0 0 2 0 5\n0 0 0 0 0 0 0 0 0 0\n0 2 0 2 2 0 0 2 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 2 0 2 2 0 0 2 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n\nExample 3:\nInput:\n0 0 5 5 0 5 0 5 5 0\n0 0 0 0 0 0 0 0 0 
0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\nOutput:\n0 0 5 5 0 5 0 5 5 0\n0 0 0 0 0 0 0 0 0 0\n0 0 2 2 0 2 0 2 2 5\n0 0 2 2 0 2 0 2 2 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 2 2 0 2 0 2 2 5\n0 0 0 0 0 0 0 0 0 0\n0 0 2 2 0 2 0 2 2 5\n0 0 0 0 0 0 0 0 0 0\n\nTest Input:\n5 0 5 5 0 0 5 0 5 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n\nAnalyze the patterns in the training examples and write a Python function that performs this transformation.\n\n**Approach Guidelines:**\n- Look for patterns in shapes, colors, positions, sizes, rotations, reflections, etc.\n- Even if you can't solve all training examples perfectly, implement what patterns you do observe\n- A partial solution that captures some aspects is better than returning the input unchanged\n- If the pattern is unclear, make your best educated guess based on what you can see\n\nRequirements:\n- The function takes a 2D list (grid) where grid[row][col] gives the value at that position\n- Values are integers from 0-9\n- Return a new grid (2D list) with the transformation applied\n- You can use numpy if needed - just add 'import numpy as np' at the start of your function\n- Aim to handle the training examples as well as possible, even if not perfectly\n- Your function should attempt some meaningful transformation based on the patterns you observe\n\nYou MUST end your response with the following exact format:\n\nFinal answer:\n```python\ndef transform(grid):\n    # Your transformation logic here (implement your best understanding)\n    return transformed_grid\n```\n"}
]
# Render the chat messages into a single prompt string (tokenization happens later).
template_options = dict(
    tokenize=False,
    add_generation_prompt=True,  # Must add for generation
    enable_thinking=False,  # Disable thinking
)
text = tokenizer.apply_chat_template(messages, **template_options)

# from transformers import TextStreamer
# _ = model.generate(
#     **tokenizer(text, return_tensors = "pt").to("cuda"),
#     max_new_tokens = 8000, # Increase for longer outputs!
#     # temperature = 0.6, top_p = 0.95, top_k = 20, # For thinking
#     temperature = 0.7, top_p = 0.8, top_k = 20, # For non thinking
#     # temperature = 0.01,
#     streamer = TextStreamer(tokenizer, skip_prompt = True),
# )

# text = data["validation"]['prompt'][0]

from transformers import TextStreamer

# Tokenize the rendered prompt and move the tensors onto the GPU.
inputs = tokenizer(text, return_tensors="pt").to("cuda")
input_ids = inputs["input_ids"]  # Extract for convenience

# Generation settings are collected in one place so they are easy to swap.
sampling_options = dict(
    max_new_tokens=2000,
    # temperature = 0.6, top_p = 0.95, top_k = 20, # For thinking
    # temperature = 0.7, top_p = 0.8, top_k = 20, # For non thinking
    temperature=0.1,  # BEST FOR SINGLE ATTEMPTS
)
output_ids = model.generate(**inputs, **sampling_options)

# The output sequence starts with the prompt tokens; keep only what follows them.
prompt_length = input_ids.shape[-1]
generated_tokens = output_ids[0][prompt_length:]
generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)

print(generated_text)

In [None]:
# Use extract_python_code from utils (SOAR approach)
code = extract_python_code(generated_text)

if not code:
    raise ValueError("Could not extract Python code from generated text")

print(code)

# Drop any `transform` left over from an earlier run of this cell; otherwise a
# generation that fails to define one would silently be tested with stale code.
globals().pop("transform", None)

# NOTE(security): exec() runs model-generated code in this notebook's global
# namespace with no sandboxing. Only do this in a disposable environment.
exec(code, globals())  # Defines `transform()` in global scope

# Fail here, next to the generated code, rather than later inside the test cell.
if not callable(globals().get("transform")):
    raise ValueError("Generated code did not define a callable `transform`")

In [None]:
# Alternative transform implementations commented out - using model generated version above


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# -------------------- helper --------------------
def safe_transform(grid):
    """Run the generated ``transform`` on a private copy of *grid*.

    The prompt used for generation promises that ``transform`` receives a
    2D list, so the grid is passed as freshly built nested Python lists
    rather than as an ndarray. This keeps list-only operations in the
    generated code (``row.count``, ``row.index``, ...) working and leaves
    the caller's grid untouched.

    Args:
        grid: 2D grid of integers (ndarray or nested lists).

    Returns:
        Whatever ``transform`` returns. If ``transform`` raises (or is not
        defined), the error is printed and an all-zero grid with the same
        shape as *grid* is returned instead.
    """
    grid_as_lists = np.asarray(grid).tolist()  # new lists: original stays unchanged
    try:
        return transform(grid_as_lists)
    except Exception as err:
        # Deliberately broad: generated code can fail in arbitrary ways and the
        # notebook should still be able to plot something.
        print(f"[safe_transform] transform() failed – {err}")
        return np.zeros_like(grid)

# -------------------- test case -----------------
def _grid_from_text(rows):
    """Parse whitespace-separated integer rows (one grid row per line) into an array."""
    return np.array(
        [[int(cell) for cell in line.split()] for line in rows.strip().splitlines()]
    )


# Held-out pair for the task in the prompt above, written in the same
# space-separated layout the prompt uses for its grids.
test_case = {
    "input": _grid_from_text(
        """
        5 0 5 5 0 0 5 0 5 0
        0 0 0 0 0 0 0 0 0 0
        0 0 0 0 0 0 0 0 0 5
        0 0 0 0 0 0 0 0 0 5
        0 0 0 0 0 0 0 0 0 0
        0 0 0 0 0 0 0 0 0 5
        0 0 0 0 0 0 0 0 0 0
        0 0 0 0 0 0 0 0 0 5
        0 0 0 0 0 0 0 0 0 0
        0 0 0 0 0 0 0 0 0 5
        """
    ),
    "output": _grid_from_text(
        """
        5 0 5 5 0 0 5 0 5 0
        0 0 0 0 0 0 0 0 0 0
        2 0 2 2 0 0 2 0 2 5
        2 0 2 2 0 0 2 0 2 5
        0 0 0 0 0 0 0 0 0 0
        2 0 2 2 0 0 2 0 2 5
        0 0 0 0 0 0 0 0 0 0
        2 0 2 2 0 0 2 0 2 5
        0 0 0 0 0 0 0 0 0 0
        2 0 2 2 0 0 2 0 2 5
        """
    ),
}

# -------------------- run & plot ----------------
# Run the generated transform on the held-out input and show it next to the
# input and the expected output.
predicted_output = safe_transform(test_case["input"])

fig, axs = plt.subplots(1, 3, figsize=(12, 4))
titles = ["Input", "Predicted Output", "Ground Truth Output"]
grids  = [test_case["input"], predicted_output, test_case["output"]]

for ax, grid, title in zip(axs, grids, titles):
    # Grid values range over 0-9 (see the prompt), so colours are normalised
    # over that full range. With vmax=5 any predicted 6-9 would be clipped to
    # the colour of 5 and a wrong prediction could look correct.
    ax.imshow(grid, cmap="viridis", vmin=0, vmax=9)
    ax.set_title(title)
    ax.axis("off")

plt.tight_layout()
plt.show()

In [None]:
# Inference testing section ends here

<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, use Hugging Face's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [None]:
# !pip show transformers

In [None]:
# Local saving: write the model and the tokenizer into the same directory.
lora_output_dir = "lora_model"
for saveable in (model, tokenizer):
    saveable.save_pretrained(lora_output_dir)
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving

You can also use the loading cell below for an intermediate training checkpoint (e.g. `trainer_output/checkpoint-604`), so that you can then push it to the Hub.

To load the LoRA adapters we just saved (or a checkpoint) for inference, change `False` to `True` in the cell below:

In [None]:
import os
import unsloth
from unsloth import FastLanguageModel
import torch

# Change to True to reload adapters / a checkpoint from disk instead of using
# the model that is already in memory.
load_from_disk = False

if load_from_disk:
    # FastLanguageModel is already imported at the top of this cell.
    model, tokenizer = FastLanguageModel.from_pretrained(
        # model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        model_name="trainer_output/checkpoint-604",
        # max_seq_length = 30000,
        load_in_4bit=False,
    )
    # Name used for the Hub repo when this checkpoint is pushed.
    run_name = "Qwen3-4B_dsarc-programs-correct-50_20250806-233716-c604"

### Saving to float16 for VLLM

We also support saving to `float16` directly. Select `merged_16bit` for float16 or `merged_4bit` for int4. We also allow `lora` adapters as a fallback. Use `push_to_hub_merged` to upload to your Hugging Face account! You can go to https://huggingface.co/settings/tokens for your personal tokens.

In [None]:
from huggingface_hub import get_token, login

# Call this at the top of your script / notebook
# `HfFolder` is a legacy class that recent huggingface_hub releases deprecate /
# remove; `get_token()` is the supported way to look up the current token.
if get_token() is None:   # no token cached or in $HF_TOKEN
    login()               # interactive prompt

In [None]:
# print(model)

In [None]:
# run_name = "Qwen2.5-Coder-7B-Instruct-gemini_synth_50_random_split_1_training-20250723-113848"
# Sanity check: print the `Trelis/{run_name}` repo id before anything is uploaded.
print(f"Pushing to Trelis/{run_name}")

In [None]:
# Merge the adapters into the model, then upload the model and tokenizer to
# the same Hub repo.
hub_repo_id = f"Trelis/{run_name}"
model = model.merge_and_unload()
for uploadable in (model, tokenizer):
    uploadable.push_to_hub(hub_repo_id)

In [None]:
# # Merge to 16bit
# if False:
#     model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
# if True: # Pushing to HF Hub
#     model.push_to_hub_merged(f"Trelis/{run_name}", tokenizer, save_method = "merged_16bit")

# # Merge to 4bit
# if False:
#     model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit",)
# if False: # Pushing to HF Hub
#     model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = "")

# # Just LoRA adapters
# if False:
#     model.save_pretrained("model")
#     tokenizer.save_pretrained("model")
# if False: # Pushing to HF Hub
#     model.push_to_hub("hf/model", token = "")
#     tokenizer.push_to_hub("hf/model", token = "")


In [None]:
# print(tokenizer.chat_template)