# Qwen3 Tuning
for ARC AGI 2

In [1]:
# ---------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------
test_run = False

model_slug = "Qwen/Qwen3-4B"
# model_slug = "julien31/Soar-qwen-7b"
# model_slug = "Qwen/Qwen2.5-Coder-7B-Instruct"
# model_slug = "Qwen/Qwen3-30B-A3B"

model_max_length = 32768 # max eval set length is around 19,400 tokens. Add some headroom for responses and arc-agi-2 test lengths. This doesn't matter a whole lot because the rows of data are padded to max length.
lora_rank = 128

# Training AND validation batch size (incl. for autoregressive train/test example metrics calculations)
batch_size_global = 4 # Can use 32 for Qwen3-4b

# Set max_rows flag to limit train size. None for all
if test_run:
    max_rows = 32*4
else:
    max_rows = None # None for all rows

# Set the training dataset
train_slug = "Trelis/arc-agi-2-perfect-50"

# Thinking for qwen models
enable_thinking = False # required for training a qwen base model.

In [2]:
import scipy

In [3]:
# !git pull

In [4]:
## Now automatically set via Runpod Secrets
# !git config --global user.name ‚ÄúRonanMcGovern‚Äù
# !git config --global user.email "78278410+RonanKMcGovern@users.noreply.github.com"

### Installation

In [5]:
# !uv pip install flash_attn --no-build-isolation -qU --system
# !uv pip uninstall flash_attn --system

!uv pip install unsloth -qU

#To run with vllm.
# !uv pip install vllm -qU --system

In [6]:
# INSTALLED IN THE CONTAINER IF USING the [arc-agi-2025 container on runpod](https://console.runpod.io/deploy?template=bh0rvngapk&ref=jmfkcdio)
# %%capture
# import os
# !pip install uv -qU
# !uv pip install unsloth matplotlib tensorboard -qU --system
# !export HF_HUB_ENABLE_HF_TRANSFER=1

In [7]:
# # # if you face model download issues
# import os
# os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"

In [8]:
from huggingface_hub import HfFolder, login

# Call this at the top of your script / notebook
if HfFolder.get_token() is None:   # no token cached or in $HF_TOKEN
    login()                        # interactive prompt

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
from IPython import get_ipython
ip = get_ipython()

ip.system_raw(
    "tensorboard "
    "--logdir ./logs "
    "--port 6006 "
    "--bind_all "
    "--reload_interval 5 "
    "> tb.out 2>&1 &"
)

print("TensorBoard is now running in the background on port 6006.")

TensorBoard is now running in the background on port 6006.


### Unsloth

In [10]:
!nvidia-smi

Wed Aug 20 10:33:39 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.144.03             Driver Version: 550.144.03     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA H200                    On  |   00000000:18:00.0 Off |                    0 |
| N/A   24C    P0             75W /  700W |       1MiB / 143771MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [11]:
import os
os.environ["HF_HOME"] = "/workspace"
os.environ["HF_HUB_CACHE"] = "/workspace/hub" # (recommended) override just the repo cache
print(os.environ["HF_HOME"])

import unsloth
import os
from unsloth import FastLanguageModel
import torch

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_slug,
    max_seq_length = model_max_length,   # Context length - can be longer, but uses more memory
    load_in_4bit = False,     # 4bit uses much less memory
    load_in_8bit = False,    # A bit more accurate, uses 2x memory
    full_finetuning = False, # We have full finetuning now!
    # cache_dir = '/workspace',
    # token = "hf_...",      # use one if using gated models
    
    # for using fast_inference
    fast_inference = False, # removing this as it takes up VRAM
    max_lora_rank=lora_rank,
    # gpu_memory_utilization=0.3, # not needed if not using vLLM for inference
)

/workspace
ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.
ü¶• Unsloth Zoo will now patch everything to make training faster!


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


==((====))==  Unsloth 2025.8.8: Fast Qwen3 patching. Transformers: 4.55.2.
   \\   /|    NVIDIA H200. Num GPUs = 1. Max memory: 139.719 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 9.0. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:01<00:00,  1.31it/s]


In [12]:
!nvidia-smi

Wed Aug 20 10:38:03 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.144.03             Driver Version: 550.144.03     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA H200                    On  |   00000000:18:00.0 Off |                    0 |
| N/A   26C    P0            112W /  700W |    8390MiB / 143771MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [13]:
print(model.max_seq_length)

32768


In [14]:
# # Print a summary of the transformer layers and key dimensions
# for i, block in enumerate(model.model.layers):
#     attn = block.self_attn
#     mlp = block.mlp

#     print(f"Layer {i}:")
#     print(f"  Attention:")
#     print(f"    q_proj: {attn.q_proj.weight.shape}")
#     print(f"    k_proj: {attn.k_proj.weight.shape}")
#     print(f"    v_proj: {attn.v_proj.weight.shape}")
#     print(f"    out_proj: {attn.o_proj.weight.shape}")
#     print(f"  MLP:")
#     print(f"    fc1: {mlp.gate_proj.weight.shape}")
#     print(f"    fc2: {mlp.up_proj.weight.shape}")
#     print(f"    fc3: {mlp.down_proj.weight.shape}")
#     print()

In [15]:
# print(model)

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [16]:
model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank,           # Choose any number > 0! Suggested 8, 16, 32, 64, 128. could consider 128.
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",
                     ],
    lora_alpha = 64,  # Best to choose alpha = rank or rank*2. EXCEPT if using rslora, in which case set it as sqrt(max matrix dimension). 64 is good for Qwen 4B
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    # use_gradient_checkpointing = False, # Hard to know if this really turns it off.
    random_state = 3407,
    use_rslora = True,   # We support rank stabilized LoRA
    # init_lora_weights="pissa",          # <- enable PiSSA here. Not allowed by unsloth! Can add later when going multi-gpu. Also, actually performs worse than rslora...
)

Unsloth 2025.8.8 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.


In [17]:
print(tokenizer.padding_side)

right


In [18]:
# Import utils using standard project root detection
from pathlib import Path
import sys

# Find project root by looking for pyproject.toml
project_root = next(
  (parent for parent in [Path.cwd()] + list(Path.cwd().parents)
   if (parent / "pyproject.toml").exists()),
  Path.cwd()
)

# Add project root to path for consistent imports
sys.path.insert(0, str(project_root))

print(f"üìÅ Project root: {project_root}")

# Import from llm_python with consistent root-level imports
from llm_python.utils.task_loader import TaskLoader
from llm_python.utils.scoring import GridScorer
from llm_python.utils.arc_tester import ArcTester
from llm_python.utils.prompt_utils import create_arc_prompt, extract_python_code
from llm_python.utils.metrics_utils import calculate_task_metrics, format_metrics_display, metrics_to_percentages
from llm_python.utils.timeout_utils import execute_with_timeout
from llm_python.utils.prompt_loader import PromptLoader

# Initialize utility instances
prompt_loader = PromptLoader()
scorer = GridScorer()
print("‚úÖ Utils imported and initialized successfully")


üìÅ Project root: /workspace/arc-agi-2025
‚úÖ Utils imported and initialized successfully


<a name="Data"></a>
### Data Prep

In [19]:
import re

def clean_multiple_newlines(code: str) -> str:
    """Remove multiple consecutive newlines and replace with at most one empty line."""
    # Pattern to match multiple consecutive newlines with optional whitespace
    # This handles cases like \n\n\n, \n  \n\n, \n\t\n\n\n etc.
    pattern = r'\n(\s*\n)+'
    # Replace with at most one empty line (two newlines)
    cleaned = re.sub(pattern, '\n\n', code)
    return cleaned

def count_tokens(text: str, tokenizer) -> int:
    """Count tokens in text using the provided tokenizer."""
    return len(tokenizer.encode(text))

def should_filter_code(code: str, tokenizer, max_tokens: int = 1000) -> bool:
    """Check if code should be filtered based on token count."""
    return count_tokens(code, tokenizer) > max_tokens

print("‚úÖ Added code cleaning and filtering functions")


‚úÖ Added code cleaning and filtering functions


In [20]:
# Test cases
test_code = """def solve(grid):
  # First comment


  # Second comment after multiple empty lines
  rows = len(grid)
  cols = len(grid[0])
  
  # Another comment
    
    
  return grid"""

print("ORIGINAL CODE:")
print(repr(test_code))
print("\nORIGINAL CODE (formatted):")
print(test_code)

cleaned = clean_multiple_newlines(test_code)
print("\n" + "="*50)
print("CLEANED CODE:")
print(repr(cleaned))
print("\nCLEANED CODE (formatted):")
print(cleaned)

print("\n" + "="*50)
print("CHANGES SUMMARY:")
print(f"Original length: {len(test_code)} chars")
print(f"Cleaned length: {len(cleaned)} chars")
print(f"Characters removed: {len(test_code) - len(cleaned)}")

ORIGINAL CODE:
'def solve(grid):\n  # First comment\n\n\n  # Second comment after multiple empty lines\n  rows = len(grid)\n  cols = len(grid[0])\n\n  # Another comment\n\n\n  return grid'

ORIGINAL CODE (formatted):
def solve(grid):
  # First comment


  # Second comment after multiple empty lines
  rows = len(grid)
  cols = len(grid[0])

  # Another comment


  return grid

CLEANED CODE:
'def solve(grid):\n  # First comment\n\n  # Second comment after multiple empty lines\n  rows = len(grid)\n  cols = len(grid[0])\n\n  # Another comment\n\n  return grid'

CLEANED CODE (formatted):
def solve(grid):
  # First comment

  # Second comment after multiple empty lines
  rows = len(grid)
  cols = len(grid[0])

  # Another comment

  return grid

CHANGES SUMMARY:
Original length: 160 chars
Cleaned length: 158 chars
Characters removed: 2


In [21]:
from pathlib import Path
import json
from typing import Optional
from datasets import load_dataset, DatasetDict

# ---------------------------------------------------------------------
# Prompt management using utils (replacing hard-coded prompts)
# ---------------------------------------------------------------------

# Use prompt_loader to get SOAR prompts from utils
SYSTEM_PROMPT = prompt_loader.get_system_message("soar")
INITIAL_TURN_PROMPT = prompt_loader.get_initial_turn_prompt("soar")

print(f"‚úÖ Using SOAR prompts from utils:")
print(f"   System prompt: {len(SYSTEM_PROMPT)} chars")
print(f"   Initial turn prompt: {len(INITIAL_TURN_PROMPT)} chars")

def hf_train_dataset_to_chat_dataset(dataset_slug, split="train", max_rows=None):
  """
  Faster path:
    1) Server-side slice to avoid downloading full split.
    2) Pre-filter cheap/invalid rows BEFORE expensive prompt/tokenizer work.
    3) Map to build chat fields.
  """
  effective_split = f"{split}[:{max_rows}]" if max_rows else split
  ds_raw = load_dataset(dataset_slug, split=effective_split)

  from llm_python.utils.task_loader import TaskLoader

  # Create a single TaskLoader instance to reuse
  task_loader = TaskLoader()

  # ---- Pre-filter: keep only rows with valid grids and acceptable code length
  def keep_example(ex):
      # Guard: task must exist
      try:
          task_loader.get_task(ex["task_id"])
      except FileNotFoundError:
          return False

      # Guard: code length after cleaning
      cleaned = clean_multiple_newlines(ex["code"])
      if should_filter_code(cleaned, tokenizer, max_tokens=1000):
          return False

      return True

  ds_kept = ds_raw.filter(keep_example, desc=f"pre-filter ({effective_split})", load_from_cache_file=False)

  # ---- Build chat fields
  def to_chat(example):
      task_id = example["task_id"]
      task_data = task_loader.get_task(task_id)

      original_code = example["code"]
      cleaned_code = clean_multiple_newlines(original_code)
      cleaned_flag = int(cleaned_code != original_code)

      # Use predicted outputs if present; else fall back to ground-truth grids
      train_outputs = example.get(
          "predicted_train_output",
          [ex["output"] for ex in task_data["train"]],
      )
      test_outputs = example.get(
          "predicted_test_output",
          [ex["output"] for ex in task_data["test"]],
      )

      task_data_for_prompt = {
          "train": [
              {"input": ex["input"], "output": out}
              for ex, out in zip(task_data["train"], train_outputs)
          ],
          "test": [
              {"input": ex["input"], "output": out}
              for ex, out in zip(task_data["test"], test_outputs)
          ],
      }

      # Use create_arc_prompt from utils
      system_content, user_content = create_arc_prompt(task_data_for_prompt, prompt_loader, "soar", include_test_outputs=True)

      messages = [
          {"role": "system", "content": system_content},
          {"role": "user", "content": user_content},
          {"role": "assistant", "content": f"```python\n{cleaned_code}\n```"},
      ]

      # Apply chat template
      text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
      prompt_text = tokenizer.apply_chat_template(
          messages[:-1],
          tokenize=False,
          add_generation_prompt=True,
          enable_thinking=enable_thinking,
      )

      return {
          "messages": messages,
          "text": text,
          "prompt": prompt_text,
          "train_input": [ex["input"] for ex in task_data_for_prompt["train"]],
          "train_output": train_outputs,
          "test_input": [ex["input"] for ex in task_data_for_prompt["test"]],
          "test_output": test_outputs,
          "task_id": task_id,
          "cleaned_newlines": cleaned_flag,  # for stats
      }

  ds = ds_kept.map(to_chat, desc="build train chat fields", load_from_cache_file=False)

  # ---- Stats (robust; no reliance on closures/caching)
  total_raw = ds_raw.num_rows
  kept = ds_kept.num_rows
  retained = ds.num_rows
  cleaned_count = sum(ds["cleaned_newlines"]) if "cleaned_newlines" in ds.column_names else 0

  print(f"\nüìä Training data cleaning statistics:")
  print(f"   Total examples (raw slice): {total_raw}")
  print(f"   Removed in pre-filter: {total_raw - kept}")
  print(f"   Examples retained: {retained}")
  print(f"   Examples with cleaned newlines (retained): {cleaned_count}")

  # Optionally drop the helper stats column:
  # ds = ds.remove_columns(["cleaned_newlines"])

  return ds

def build_dataset(train_slug: str) -> DatasetDict:
    train_ds = hf_train_dataset_to_chat_dataset(train_slug, split="train", max_rows=max_rows)
    return DatasetDict(train=train_ds)


# ---------------------------------------------------------------------
# Build the dataset
# ---------------------------------------------------------------------
data = build_dataset(train_slug)


‚úÖ Using SOAR prompts from utils:
   System prompt: 129 chars
   Initial turn prompt: 990 chars
Loading arc-prize-2024...
  Training: 400 tasks
  Evaluation: 400 tasks
  Test: 100 tasks
Loading arc-prize-2025...
  Training: 1000 tasks
  Evaluation: 120 tasks
  Test: 240 tasks


pre-filter (train): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 15519/15519 [00:08<00:00, 1842.55 examples/s]
build train chat fields: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 15506/15506 [00:11<00:00, 1403.71 examples/s]


üìä Training data cleaning statistics:
   Total examples (raw slice): 15519
   Removed in pre-filter: 13
   Examples retained: 15506
   Examples with cleaned newlines (retained): 0





In [22]:
# print(data["train"][0]['text'])

Let's see the structure of the dataset:

In [23]:
import numpy as np
from statistics import median

def length_stats(dataset, name=""):
    """
    Return min / median / max tokenised length for a ü§ó Dataset split that has a
    single 'text' column. Uses the same tokenizer already in memory.
    """
    # Tokenise in batches ‚Üí list of list[int] ‚Üí list[int] lengths
    lengths = dataset.map(
        lambda batch: {
            "len": [len(ids) for ids in tokenizer(batch["text"],
                                                  add_special_tokens=False
                                                 )["input_ids"]]
        },
        batched=True,
        remove_columns=dataset.column_names,   # drop 'text'
        keep_in_memory=True,
    )["len"]

    print(f"{name:>11}:  min={min(lengths):>4}  "
          f"median={int(median(lengths)):>4}  max={max(lengths):>4}")

# ‚îÄ‚îÄ run for both splits ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
length_stats(data["train"],       "train")

Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 15506/15506 [00:14<00:00, 1061.84 examples/s]

      train:  min= 492  median=1983  max=19060





### Pre-Training Data Integrity Tests
Before training, let's test the ground-truth code on a random sample of training examples to validate dataset quality and establish baseline performance.


In [24]:
import random

# Configuration for pre-training tests
NUM_TEST_EXAMPLES = 128  # Number of random examples to test
RANDOM_SEED = 42  # For reproducible results

def run_pre_training_data_integrity_tests(dataset_split="train", num_examples=NUM_TEST_EXAMPLES):
    """
    Test ground-truth code from dataset on random examples to validate data quality.
    
    Args:
        dataset_split: Which split to test (should be "train" since validation has no ground-truth code)
        num_examples: Number of random examples to test
    """
    print(f"üß™ Running Pre-Training Data Integrity Tests")
    print(f"üìä Testing {num_examples} random examples from {dataset_split} split")
    print("=" * 60)
    
    # Set seed for reproducible sampling
    random.seed(RANDOM_SEED)
    
    # Get the dataset split
    dataset = data[dataset_split]
    
    # Randomly sample examples
    total_examples = len(dataset)
    if num_examples > total_examples:
        print(f"‚ö†Ô∏è  Requested {num_examples} examples but only {total_examples} available. Testing all.")
        sample_indices = list(range(total_examples))
    else:
        sample_indices = random.sample(range(total_examples), num_examples)
    
    # Initialize tracking variables
    results = []
    executor = ArcTester(timeout=2.0, executor_type="unrestricted")
    
    print(f"\nüîç Testing {len(sample_indices)} examples...\n")
    
    for i, idx in enumerate(sample_indices):
        example = dataset[idx]
        task_id = example.get("task_id", f"idx_{idx}")
        code = example["code"]
        
        print(f"[{i+1}/{len(sample_indices)}] Testing {task_id}")
        
        # Initialize results for this example
        example_result = {
            "task_id": task_id,
            "index": idx,
            "code": code,
            "train_results": [],
            "test_results": [],
            "train_success": 0,
            "test_success": 0,
            "code_executed": False,
            "errors": []
        }
        
        # Test on training examples
        train_correct = 0
        for t_idx, (train_in, train_out) in enumerate(zip(example["train_input"], example["train_output"])):
            try:
                predicted_output, error, timed_out = executor.execute_program_with_timeout(code, train_in)
                
                if predicted_output is not None:
                    example_result["code_executed"] = True
                    score_result = scorer.score_grid(predicted_output, train_out)
                    is_correct = score_result["correct"]
                    
                    if is_correct:
                        train_correct += 1
                    
                    example_result["train_results"].append({
                        "index": t_idx,
                        "correct": is_correct,
                        "predicted": predicted_output,
                        "expected": train_out,
                        "timed_out": timed_out
                    })
                else:
                    example_result["train_results"].append({
                        "index": t_idx,
                        "correct": False,
                        "error": error,
                        "timed_out": timed_out
                    })
                    if error:
                        example_result["errors"].append(f"Train {t_idx}: {error}")
                        
            except Exception as e:
                example_result["train_results"].append({
                    "index": t_idx,
                    "correct": False,
                    "error": str(e)
                })
                example_result["errors"].append(f"Train {t_idx}: {str(e)}")
        
        # Test on test examples
        test_correct = 0
        for t_idx, (test_in, test_out) in enumerate(zip(example["test_input"], example["test_output"])):
            try:
                predicted_output, error, timed_out = executor.execute_program_with_timeout(code, test_in)
                
                if predicted_output is not None:
                    example_result["code_executed"] = True
                    score_result = scorer.score_grid(predicted_output, test_out)
                    is_correct = score_result["correct"]
                    
                    if is_correct:
                        test_correct += 1
                    
                    example_result["test_results"].append({
                        "index": t_idx,
                        "correct": is_correct,
                        "predicted": predicted_output,
                        "expected": test_out,
                        "timed_out": timed_out
                    })
                else:
                    example_result["test_results"].append({
                        "index": t_idx,
                        "correct": False,
                        "error": error,
                        "timed_out": timed_out
                    })
                    if error:
                        example_result["errors"].append(f"Test {t_idx}: {error}")
                        
            except Exception as e:
                example_result["test_results"].append({
                    "index": t_idx,
                    "correct": False,
                    "error": str(e)
                })
                example_result["errors"].append(f"Test {t_idx}: {str(e)}")
        
        # Calculate success rates for this example
        example_result["train_success"] = train_correct / len(example["train_input"]) if example["train_input"] else 0
        example_result["test_success"] = test_correct / len(example["test_input"]) if example["test_input"] else 0
        
        # Print summary for this example
        total_train = len(example["train_input"])
        total_test = len(example["test_input"])
        
        print(f"  ‚úÖ Train: {train_correct}/{total_train} ({example_result['train_success']:.1%})")
        print(f"  ‚úÖ Test:  {test_correct}/{total_test} ({example_result['test_success']:.1%})")
        
        if example_result["errors"]:
            print(f"  ‚ùå Errors: {len(example_result['errors'])}")
        if not example_result["code_executed"]:
            print(f"  ‚ö†Ô∏è  Code never executed successfully")
        print()
        
        results.append(example_result)
    
    return results

# Run the tests
data_integrity_results = run_pre_training_data_integrity_tests("train", NUM_TEST_EXAMPLES)

üß™ Running Pre-Training Data Integrity Tests
üìä Testing 128 random examples from train split

üîç Testing 128 examples...

[1/128] Testing b60334d2
  ‚úÖ Train: 0/2 (0.0%)
  ‚úÖ Test:  1/1 (100.0%)

[2/128] Testing 21f83797
  ‚úÖ Train: 0/2 (0.0%)
  ‚úÖ Test:  1/1 (100.0%)

[3/128] Testing 0962bcdd
  ‚úÖ Train: 0/2 (0.0%)
  ‚úÖ Test:  1/1 (100.0%)

[4/128] Testing d037b0a7
  ‚úÖ Train: 1/3 (33.3%)
  ‚úÖ Test:  1/1 (100.0%)

[5/128] Testing 4c5c2cf0
  ‚úÖ Train: 1/3 (33.3%)
  ‚úÖ Test:  1/1 (100.0%)

[6/128] Testing 44f52bb0
  ‚úÖ Train: 2/6 (33.3%)
  ‚úÖ Test:  0/2 (0.0%)

[7/128] Testing 40853293
  ‚úÖ Train: 2/2 (100.0%)
  ‚úÖ Test:  1/1 (100.0%)

[8/128] Testing 25d8a9c8
  ‚úÖ Train: 1/4 (25.0%)
  ‚úÖ Test:  1/1 (100.0%)

[9/128] Testing ce8d95cc
  ‚úÖ Train: 1/4 (25.0%)
  ‚úÖ Test:  1/1 (100.0%)

[10/128] Testing 1fad071e
  ‚úÖ Train: 1/3 (33.3%)
  ‚úÖ Test:  1/1 (100.0%)

[11/128] Testing bdad9b1f
  ‚úÖ Train: 0/2 (0.0%)
  ‚úÖ Test:  1/1 (100.0%)

[12/128] Testing d037b0a7
  

In [25]:
def analyze_data_integrity_results(results):
    """
    Analyze and display comprehensive statistics from the data integrity tests.
    """
    print("=" * 60)
    print("üìà PRE-TRAINING DATA INTEGRITY RESULTS ANALYSIS")
    print("=" * 60)
    
    if not results:
        print("‚ùå No results to analyze!")
        return
    
    # Overall statistics
    total_examples = len(results)
    examples_with_executable_code = sum(1 for r in results if r["code_executed"])
    examples_with_errors = sum(1 for r in results if r["errors"])
    
    # Training performance statistics
    train_success_rates = [r["train_success"] for r in results]
    perfect_train = sum(1 for rate in train_success_rates if rate == 1.0)
    partial_train = sum(1 for rate in train_success_rates if 0 < rate < 1.0)
    failed_train = sum(1 for rate in train_success_rates if rate == 0.0)
    
    # Test performance statistics  
    test_success_rates = [r["test_success"] for r in results]
    perfect_test = sum(1 for rate in test_success_rates if rate == 1.0)
    partial_test = sum(1 for rate in test_success_rates if 0 < rate < 1.0)
    failed_test = sum(1 for rate in test_success_rates if rate == 0.0)
    
    # Calculate overall metrics
    avg_train_success = sum(train_success_rates) / len(train_success_rates) if train_success_rates else 0
    avg_test_success = sum(test_success_rates) / len(test_success_rates) if test_success_rates else 0
    
    # Count total grids tested
    total_train_grids = sum(len(r["train_results"]) for r in results)
    total_test_grids = sum(len(r["test_results"]) for r in results)
    correct_train_grids = sum(sum(tr["correct"] for tr in r["train_results"]) for r in results)
    correct_test_grids = sum(sum(tr["correct"] for tr in r["test_results"]) for r in results)
    
    print(f"\nüéØ OVERALL PERFORMANCE:")
    print(f"   Examples tested: {total_examples}")
    print(f"   Code executable: {examples_with_executable_code}/{total_examples} ({examples_with_executable_code/total_examples:.1%})")
    print(f"   Examples with errors: {examples_with_errors}/{total_examples} ({examples_with_errors/total_examples:.1%})")
    
    print(f"\nüìä TRAINING GRIDS PERFORMANCE:")
    print(f"   Average success rate: {avg_train_success:.1%}")
    print(f"   Perfect examples (100%): {perfect_train}/{total_examples} ({perfect_train/total_examples:.1%})")
    print(f"   Partial examples (>0% <100%): {partial_train}/{total_examples} ({partial_train/total_examples:.1%})")
    print(f"   Failed examples (0%): {failed_train}/{total_examples} ({failed_train/total_examples:.1%})")
    print(f"   Grid-level accuracy: {correct_train_grids}/{total_train_grids} ({correct_train_grids/total_train_grids:.1%})")
    
    print(f"\nüéØ TEST GRIDS PERFORMANCE:")
    print(f"   Average success rate: {avg_test_success:.1%}")
    print(f"   Perfect examples (100%): {perfect_test}/{total_examples} ({perfect_test/total_examples:.1%})")
    print(f"   Partial examples (>0% <100%): {partial_test}/{total_examples} ({partial_test/total_examples:.1%})")
    print(f"   Failed examples (0%): {failed_test}/{total_examples} ({failed_test/total_examples:.1%})")
    print(f"   Grid-level accuracy: {correct_test_grids}/{total_test_grids} ({correct_test_grids/total_test_grids:.1%})")
    
    # Detailed breakdown by example
    print(f"\nüìã DETAILED BREAKDOWN BY EXAMPLE:")
    print("-" * 60)
    
    for i, result in enumerate(results):
        task_id = result["task_id"]
        train_rate = result["train_success"]
        test_rate = result["test_success"]
        executed = "‚úÖ" if result["code_executed"] else "‚ùå"
        error_count = len(result["errors"])
        
        print(f"[{i+1:2d}] {task_id}")
        print(f"     Train: {train_rate:5.1%} | Test: {test_rate:5.1%} | Executed: {executed} | Errors: {error_count}")
        
        if result["errors"] and len(result["errors"]) <= 3:  # Show first few errors
            for error in result["errors"][:3]:
                print(f"     Error: {error}")
        elif len(result["errors"]) > 3:
            print(f"     Errors: {result['errors'][0]} ... (+{len(result['errors'])-1} more)")
    
    # Quality assessment
    print(f"\nüîç DATASET QUALITY ASSESSMENT:")
    print("-" * 60)
    
    if avg_train_success > 0.9:
        print("‚úÖ EXCELLENT: Ground-truth code performs very well on training examples")
    elif avg_train_success > 0.7:
        print("‚úÖ GOOD: Ground-truth code performs well on training examples")
    elif avg_train_success > 0.5:
        print("‚ö†Ô∏è  MODERATE: Ground-truth code has mixed performance on training examples")
    else:
        print("‚ùå POOR: Ground-truth code has low performance on training examples")
    
    if avg_test_success > 0.9:
        print("‚úÖ EXCELLENT: Ground-truth code generalizes very well to test examples")
    elif avg_test_success > 0.7:
        print("‚úÖ GOOD: Ground-truth code generalizes well to test examples")
    elif avg_test_success > 0.5:
        print("‚ö†Ô∏è  MODERATE: Ground-truth code has mixed generalization to test examples")
    else:
        print("‚ùå POOR: Ground-truth code has poor generalization to test examples")
    
    if examples_with_executable_code == total_examples:
        print("‚úÖ EXCELLENT: All ground-truth code is executable")
    elif examples_with_executable_code / total_examples > 0.9:
        print("‚úÖ GOOD: Most ground-truth code is executable")
    else:
        print("‚ö†Ô∏è  ISSUE: Some ground-truth code is not executable")
    
    print("\n" + "=" * 60)
    
    return {
        "total_examples": total_examples,
        "executable_rate": examples_with_executable_code / total_examples,
        "avg_train_success": avg_train_success,
        "avg_test_success": avg_test_success,
        "perfect_train_rate": perfect_train / total_examples,
        "perfect_test_rate": perfect_test / total_examples,
        "train_grid_accuracy": correct_train_grids / total_train_grids if total_train_grids > 0 else 0,
        "test_grid_accuracy": correct_test_grids / total_test_grids if total_test_grids > 0 else 0
    }

# Analyze the results
summary_stats = analyze_data_integrity_results(data_integrity_results)


üìà PRE-TRAINING DATA INTEGRITY RESULTS ANALYSIS

üéØ OVERALL PERFORMANCE:
   Examples tested: 128
   Code executable: 126/128 (98.4%)
   Examples with errors: 5/128 (3.9%)

üìä TRAINING GRIDS PERFORMANCE:
   Average success rate: 29.3%
   Perfect examples (100%): 8/128 (6.2%)
   Partial examples (>0% <100%): 92/128 (71.9%)
   Failed examples (0%): 28/128 (21.9%)
   Grid-level accuracy: 127/408 (31.1%)

üéØ TEST GRIDS PERFORMANCE:
   Average success rate: 93.8%
   Perfect examples (100%): 120/128 (93.8%)
   Partial examples (>0% <100%): 0/128 (0.0%)
   Failed examples (0%): 8/128 (6.2%)
   Grid-level accuracy: 120/132 (90.9%)

üìã DETAILED BREAKDOWN BY EXAMPLE:
------------------------------------------------------------
[ 1] b60334d2
     Train:  0.0% | Test: 100.0% | Executed: ‚úÖ | Errors: 0
[ 2] 21f83797
     Train:  0.0% | Test: 100.0% | Executed: ‚úÖ | Errors: 0
[ 3] 0962bcdd
     Train:  0.0% | Test: 100.0% | Executed: ‚úÖ | Errors: 0
[ 4] d037b0a7
     Train: 33.3% | Test:

In [26]:
def examine_failure(results, example_index):
  """Examine a specific failing example in detail with grid visualization."""
  if example_index >= len(results):
      print(f"‚ùå Invalid index {example_index}. Only {len(results)} examples available.")
      return

  result = results[example_index]
  dataset_example = data["train"][result['index']]  # Get the original dataset example

  print(f"\nüîç DETAILED EXAMINATION: Example {example_index + 1}")
  print(f"Task ID: {result['task_id']}")
  print(f"Dataset Index: {result['index']}")
  print("=" * 70)

  print(f"\nüìù GROUND TRUTH CODE:")
  print("-" * 30)
  print(result['code'])

  print(f"\nüìä EXECUTION SUMMARY:")
  print(f"Code executed successfully: {result['code_executed']}")
  print(f"Train success rate: {result['train_success']:.1%}")
  print(f"Test success rate: {result['test_success']:.1%}")
  print(f"Number of errors: {len(result['errors'])}")

  if result['errors']:
      print(f"\n‚ùå ERRORS:")
      for i, error in enumerate(result['errors']):
          print(f"  {i+1}. {error}")

  # Load original task for ground truth comparison
  task_loader = TaskLoader()
  try:
      original_task = task_loader.get_task(result['task_id'])
  except Exception as e:
      print(f"‚ùå Could not load original task: {e}")
      return

  def print_grid(grid, title):
      """Helper to print a grid nicely."""
      print(f"\n{title}:")
      if grid is None:
          print("  None")
          return
      for row in grid:
          print("  " + " ".join(f"{cell:2d}" for cell in row))

  # Examine training examples
  print(f"\nüèãÔ∏è TRAINING EXAMPLES:")
  print("=" * 50)

  for i, train_result in enumerate(result['train_results']):
      print(f"\nTrain Example {i + 1}: {'‚úÖ CORRECT' if train_result['correct'] else '‚ùå INCORRECT'}")
      print("-" * 40)

      # Input (should be same from dataset and original)
      dataset_input = dataset_example["train_input"][i]
      original_input = original_task["train"][i]["input"]
      print_grid(dataset_input, "Input (from dataset)")
      if dataset_input != original_input:
          print_grid(original_input, "Input (from original) - MISMATCH!")

      # Expected output (from dataset - might be predicted)
      dataset_expected = dataset_example["train_output"][i]
      original_expected = original_task["train"][i]["output"]
      print_grid(dataset_expected, "Expected (from dataset)")
      if dataset_expected != original_expected:
          print_grid(original_expected, "Expected (ground truth) - DIFFERENT!")

      # Predicted output (from code execution)
      if 'predicted' in train_result:
          print_grid(train_result['predicted'], "Predicted (from code)")

      if 'error' in train_result:
          print(f"\n‚ùå Execution Error: {train_result['error']}")

  # Examine test examples
  print(f"\nüß™ TEST EXAMPLES:")
  print("=" * 50)

  for i, test_result in enumerate(result['test_results']):
      print(f"\nTest Example {i + 1}: {'‚úÖ CORRECT' if test_result['correct'] else '‚ùå INCORRECT'}")
      print("-" * 40)

      # Input
      dataset_input = dataset_example["test_input"][i]
      original_input = original_task["test"][i]["input"]
      print_grid(dataset_input, "Input (from dataset)")
      if dataset_input != original_input:
          print_grid(original_input, "Input (from original) - MISMATCH!")

      # Expected output
      dataset_expected = dataset_example["test_output"][i]
      original_expected = original_task["test"][i].get("output")  # Might be None
      print_grid(dataset_expected, "Expected (from dataset)")
      if original_expected and dataset_expected != original_expected:
          print_grid(original_expected, "Expected (ground truth) - DIFFERENT!")
      elif not original_expected:
          print("Expected (ground truth): No ground truth available")

      # Predicted output
      if 'predicted' in test_result:
          print_grid(test_result['predicted'], "Predicted (from code)")

      if 'error' in test_result:
          print(f"\n‚ùå Execution Error: {test_result['error']}")

# Check for failing examples
failed_examples = [i for i, r in enumerate(data_integrity_results)
                if r['train_success'] < 1.0 or r['test_success'] < 1.0 or not r['code_executed']]

print(f"\nüîç FAILING EXAMPLES SUMMARY:")
if failed_examples:
  print(f"Found {len(failed_examples)} examples with issues: {failed_examples}")
  print("To examine a specific failure, run: examine_failure(data_integrity_results, index)")
  print("Example: examine_failure(data_integrity_results, 0)")
else:
  print("üéâ No failing examples found! All ground-truth code works perfectly.")

print(f"\n‚úÖ Pre-training data integrity tests complete!")
print(f"üìã Summary stats saved in 'summary_stats' variable")
print(f"üìä Detailed results saved in 'data_integrity_results' variable")


üîç FAILING EXAMPLES SUMMARY:
Found 120 examples with issues: [0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 57, 58, 59, 60, 61, 62, 63, 64, 66, 67, 68, 70, 71, 72, 73, 74, 75, 76, 77, 78, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127]
To examine a specific failure, run: examine_failure(data_integrity_results, index)
Example: examine_failure(data_integrity_results, 0)

‚úÖ Pre-training data integrity tests complete!
üìã Summary stats saved in 'summary_stats' variable
üìä Detailed results saved in 'data_integrity_results' variable


In [27]:
examine_failure(data_integrity_results, 4)


üîç DETAILED EXAMINATION: Example 5
Task ID: 4c5c2cf0
Dataset Index: 4506

üìù GROUND TRUTH CODE:
------------------------------
def transform(grid: list[list[int]]) -> list[list[int]]:

    h = len(grid)
    w = len(grid[0]) if h else 0
    new_grid = [row[:] for row in grid]

    from collections import defaultdict
    coords = defaultdict(list)
    for r in range(h):
        for c in range(w):
            v = grid[r][c]
            if v != 0:
                coords[v].append((r, c))

    cross_color = None
    for color, pts in coords.items():
        if len(pts) == 5:
            cross_color = color
            break
    if cross_color is None:

        return new_grid

    aux_colors = [col for col in coords if col != cross_color]

    cross_pts = coords[cross_color]
    minr = min(r for r, _ in cross_pts)
    maxr = max(r for r, _ in cross_pts)
    minc = min(c for _, c in cross_pts)
    maxc = max(c for _, c in cross_pts)
    center_r = (minr + maxr) // 2
    center_c = (minc

### Training Setup
Now we'll set up the trainer and then validate all evaluation components.

In [28]:
from datetime import datetime
import re

# Extract date and time using regex
print("Extract training set date and time as dataset identifiers")

# Try pattern 1: timestamp at end (original pattern)
match = re.search(r'(\d{8}_\d{6})', train_slug)
if match:
  timestamp = match.group(1)
  date_str = timestamp[:8]
  time_str = timestamp[9:]
  print(f"Date: {date_str} (YYYYMMDD)")
  print(f"Time: {time_str} (HHMMSS)")
else:
  # Try pattern 2: SOAR dataset format (soar-YYYYMMDD_HHMMSS-rows)
  match = re.search(r'soar-(\d{8}_\d{6})-\d+', train_slug)
  if match:
      timestamp = match.group(1)
      date_str = timestamp[:8]
      time_str = timestamp[9:]
      print(f"Date: {date_str} (YYYYMMDD)")
      print(f"Time: {time_str} (HHMMSS)")
  else:
      # No timestamp found - use dataset name
      print("No timestamp found, using dataset name.")
      dataset_name = train_slug.split('/')[-1]  # Get name part after last slash
      date_str = dataset_name
      time_str = ""

run_name = f"{model_slug.split('/')[-1]}_ds-{dataset_name}"

if test_run:
    run_name = run_name + "_test"   # or "-test"

    
print(f"Run name will be {run_name}")

Extract training set date and time as dataset identifiers
No timestamp found, using dataset name.
Run name will be Qwen3-4B_ds-arc-agi-2-perfect-50


In [29]:
# Legacy code extraction imports - now using utils.prompt_utils

In [30]:
import torch, subprocess, os, gc, time

def _print_gpu(prefix=""):
    alloc = torch.cuda.memory_allocated() / 2**20  # MiB
    reserved = torch.cuda.memory_reserved() / 2**20
    print(f"{prefix}CUDA‚Äëalloc={alloc:.0f} MiB | reserved={reserved:.0f} MiB")

def _nvidia_smi():
    try:
        smi = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=memory.used,memory.free",
             "--format=csv,noheader,nounits"]).decode().strip()
        print("nvidia-smi (used/free MiB):", smi)
    except Exception:
        pass  # nvidia-smi not always available


TEMPLATES = {
    "llama": (
        "<|start_header_id|>user<|end_header_id|>\n\n",
        "<|start_header_id|>assistant<|end_header_id|>\n\n",
    ),
    "gemma": (
        "<start_of_turn>user\n",
        "<start_of_turn>model\n",
    ),
    "qwen-coder": (
        "<|im_start|>user\n",
        "<|im_start|>assistant\n", # this is actually how you properly allow the model to keep reasoning!
    ),
    "qwen": (
        "<|im_start|>user\n",
        "<|im_start|>assistant\n<think>\n\n</think>\n\n", # this is actually how you properly allow the model to keep reasoning!
    ),
    "mistral": (
        "[INST]",
        "[/INST]",
    )
}

# instruction_tag, response_tag = TEMPLATES["qwen-coder"]   # ‚Üê change if needed and comment out below

model_slug_lower = model_slug.lower()

if "qwen" in model_slug_lower:
    if "coder" in model_slug_lower:
        instruction_tag, response_tag = TEMPLATES["qwen-coder"]
    elif "soar-qwen" in model_slug_lower:
        instruction_tag, response_tag = TEMPLATES["qwen-coder"]
    else:
        instruction_tag, response_tag = TEMPLATES["qwen"]
else:
    raise ValueError(f"Unsupported model slug for Qwen template: {model_slug}")

In [31]:
print(f"Response tag selected: {response_tag}")

Response tag selected: <|im_start|>assistant
<think>

</think>




In [32]:
from trl import SFTTrainer, SFTConfig
import math
from torch.optim.lr_scheduler import LambdaLR

setattr(model, "_flag_for_generation", True)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=data["train"],
    args=SFTConfig(
        dataset_text_field="text",
        per_device_train_batch_size=batch_size_global,
        gradient_accumulation_steps=int(32 / batch_size_global),
        num_train_epochs=2,
        learning_rate=1e-4,
        logging_strategy="steps",
        logging_steps=0.0125,         # keep as FRACTION of an epoch
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="constant", # ignored after we inject
        seed=3407,
        report_to="tensorboard",
        logging_dir=f"./logs/{run_name}",
        remove_unused_columns=True,
        save_strategy="steps",
        save_steps=0.5,              # keep as FRACTION of an epoch
        save_total_limit=4,
        prediction_loss_only=False,
        hub_model_id=f"Trelis/{run_name}-trainer",  # ‚Üê this sets the repo to push to
        hub_strategy="all_checkpoints",         # when to push (end, every_save, checkpoint, all_checkpoints)
        hub_private_repo=True,             # optional: make it private
        push_to_hub=True
    )
)

# --- derive counts (without mutating args) ---
train_dl = trainer.get_train_dataloader()
ga = trainer.args.gradient_accumulation_steps
updates_per_epoch = max(1, math.ceil(len(train_dl) / ga))
total_updates = updates_per_epoch * trainer.args.num_train_epochs

def _effective_interval(val):
    """Use fractions (<1) as fraction-of-epoch; ints as-is. Do NOT mutate args."""
    if isinstance(val, float) and 0.0 < val < 1.0:
        return max(1, int(round(val * total_updates)))

# Internal interval used ONLY for LR dips (trainer keeps fractions)
effective_save_interval = _effective_interval(trainer.args.save_steps)

# Build save marks in optimizer-step indices
save_marks = list(range(effective_save_interval, total_updates + 1, effective_save_interval))

# 10% of ONE epoch for warmup/dip windows
window = max(1, int(round(0.1 * updates_per_epoch)))
min_frac = 0.1

def lr_multiplier(step_idx: int) -> float:
    # initial warmup
    if step_idx < window:
        return (step_idx + 1) / float(window)
    # dip before save, recover after
    for s in save_marks:
        if (s - window) <= step_idx < s:      # down-ramp
            pos = step_idx - (s - window)
            return 1.0 - (1.0 - min_frac) * ((pos + 1) / float(window))
        if s <= step_idx < (s + window):      # up-ramp
            pos = step_idx - s
            return min_frac + (1.0 - min_frac) * ((pos + 1) / float(window))
    return 1.0

# Inject optimizer & custom scheduler (Unsloth-safe)
trainer.create_optimizer()
optimizer = trainer.optimizer
scheduler = LambdaLR(optimizer, lr_lambda=lambda step: lr_multiplier(step))

trainer.optimizer = optimizer
trainer.lr_scheduler = scheduler
trainer.create_optimizer = lambda *a, **k: trainer.optimizer
trainer.create_scheduler = lambda *a, **k: trainer.lr_scheduler

print(f"[setup] updates/epoch={updates_per_epoch} total_updates={total_updates} "
      f"save_steps(raw)={trainer.args.save_steps} "
      f"effective_save_interval(steps)={effective_save_interval} "
      f"output_dir={trainer.args.output_dir}")

[setup] updates/epoch=485 total_updates=970 save_steps(raw)=0.5 effective_save_interval(steps)=485 output_dir=trainer_output


In [33]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA H200. Max memory = 139.719 GB.
8.719 GB of memory reserved.


In [34]:
# We should consider training on completions only!!! which means the response part for the xentropy.

In [35]:
from unsloth.chat_templates import train_on_responses_only # or run the code above if not using unsloth

# TO SUPPORT REASONING, WE NEED TO DYNAMICALLY APPLY THE RIGHT MASKING, NOT YET IMPLEMENTED
# masks everything between the instruction_part and response_part
trainer = train_on_responses_only(
    trainer,
    instruction_part = instruction_tag,
    response_part = response_tag,
    # force_match=False # comment out to set true for a cleaner masking
)

Map (num_proc=160): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 15506/15506 [00:03<00:00, 4043.02 examples/s] 


In [36]:
tokenizer.decode(trainer.train_dataset[0]["input_ids"])

"<|im_start|>system\nYou are an AI assistant specialized in solving Abstract Reasoning Corpus (ARC-AGI) tasks by reasoning and generating Python code.<|im_end|>\n<|im_start|>user\nYou are an AI assistant specialized in solving Abstract Reasoning Corpus (ARC-AGI) tasks by generating Python code.\nYour goal is to analyze input-output grid pairs. The outputs were produced by applying a transformation rule to the inputs. Implement the transformation rules as a Python function.\nYou should only write the implemented the transformation in code.\nYou must write code in triple backticks (```python and then ```). You must write a function called 'transform' which takes a single argument, the input grid as 'list[list[int]]', and returns the transformed grid (also as 'list[list[int]]').\nYou should make sure that you implement a version of the transformation which works in general (at least for all given input-output pairs and test input pairs).\nThe number in the input grid can be mapped to the 

In [37]:
tokenizer.decode([tokenizer.pad_token_id if x == -100 else x for x in trainer.train_dataset[0]["labels"]]).replace(tokenizer.pad_token, " ")

'                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 ```python\ndef transform(grid):\n\n    top = [row * 3 for row in grid]                     \n    middle = [list(reversed(row)) * 3 for row in grid] \n\n    return top + middle + top\n```<|im_end|>\n'

Let's train the model! To resume a training run, set `trainer.train(resume_from_checkpoint = True)`

In [None]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 15,506 | Num Epochs = 2 | Total steps = 970
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 8 x 1) = 32
 "-____-"     Trainable parameters = 264,241,152 of 4,286,709,248 (6.16% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
13,0.5808


In [None]:
# @title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

In [None]:
print(trainer_stats)

In [None]:
print(f"Current working directory: {os.getcwd()}")
print(f"Trainer output dir: {trainer.args.output_dir}")
print(f"Checkpoints exist: {os.listdir(trainer.args.output_dir)}")
trainer.push_to_hub()

In [None]:
# Way to push the final model (lora)
# trainer.push_to_hub(dataset_name=train_slug)
# stop

<a name="Inference"></a>
### Inference
Let's run the model via Unsloth native inference! According to the `Qwen-3` team, the recommended settings for reasoning inference are `temperature = 0.6, top_p = 0.95, top_k = 20`

For normal chat based inference, `temperature = 0.7, top_p = 0.8, top_k = 20`

In [None]:
# print(tokenizer.chat_template)

In [None]:
# data["validation"]['prompt'][0]

In [None]:
# messages = [
#     {"role" : "system", "content" : "You are an expert at solving abstract reasoning puzzles. Write clean, efficient Python code."},
#     {"role" : "user", "content" : "You are solving an ARC (Abstraction and Reasoning Corpus) task. \nI will show you training examples with input and output grids, plus a test input grid. Your task is to:\n\n1. **Analyze the training examples** to discover patterns that map input grids to output grids\n2. **Write a Python program** that implements your best understanding of the transformation  \n3. **DO NOT predict or generate the test output** - your job is only to write the transformation program\n4. **Attempt a solution** - even if the pattern isn't completely clear, provide your best hypothesis\n5. **Do not repeat the same transformation** - if you have already tried a transformation, do not repeat it.\n\n**IMPORTANT: Your transformation must always produce a 10\u00d710 output grid.**\n\nThe test input is shown for context so you understand what type of grid your program will eventually process. Focus on learning patterns from training examples and writing code that captures your understanding.\n\nTraining Examples:\n\nExample 1:\nInput:\n5 0 0 5 0 0 0 5 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\nOutput:\n5 0 0 5 0 0 0 5 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n2 0 0 2 0 0 0 2 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n2 0 0 2 0 0 0 2 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n\nExample 2:\nInput:\n0 5 0 5 5 0 0 5 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\nOutput:\n0 5 0 5 5 0 0 5 0 0\n0 0 0 0 0 0 0 0 0 0\n0 2 0 2 2 0 0 2 0 5\n0 0 0 0 0 0 0 0 0 0\n0 2 0 2 2 0 0 2 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 2 0 2 2 0 0 2 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n\nExample 3:\nInput:\n0 0 5 5 0 5 0 5 5 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\nOutput:\n0 0 5 5 0 5 0 5 5 0\n0 0 0 0 0 0 0 0 0 0\n0 0 2 2 0 2 0 2 2 5\n0 0 2 2 0 2 0 2 2 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0\n0 0 2 2 0 2 0 2 2 5\n0 0 0 0 0 0 0 0 0 0\n0 0 2 2 0 2 0 2 2 5\n0 0 0 0 0 0 0 0 0 0\n\nTest Input:\n5 0 5 5 0 0 5 0 5 0\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 5\n\nAnalyze the patterns in the training examples and write a Python function that performs this transformation.\n\n**Approach Guidelines:**\n- Look for patterns in shapes, colors, positions, sizes, rotations, reflections, etc.\n- Even if you can't solve all training examples perfectly, implement what patterns you do observe\n- A partial solution that captures some aspects is better than returning the input unchanged\n- If the pattern is unclear, make your best educated guess based on what you can see\n\nRequirements:\n- The function takes a 2D list (grid) where grid[row][col] gives the value at that position\n- Values are integers from 0-9\n- Return a new grid (2D list) with the transformation applied\n- You can use numpy if needed - just add 'import numpy as np' at the start of your function\n- Aim to handle the training examples as well as possible, even if not perfectly\n- Your function should attempt some meaningful transformation based on the patterns you observe\n\nYou MUST end your response with the following exact format:\n\nFinal answer:\n```python\ndef transform(grid):\n    # Your transformation logic here (implement your best understanding)\n    return transformed_grid\n```\n"}
# ]
# text = tokenizer.apply_chat_template(
#     messages,
#     tokenize = False,
#     add_generation_prompt = True, # Must add for generation
#     enable_thinking = False, # Disable thinking
# )

# # from transformers import TextStreamer
# # _ = model.generate(
# #     **tokenizer(text, return_tensors = "pt").to("cuda"),
# #     max_new_tokens = 8000, # Increase for longer outputs!
# #     # temperature = 0.6, top_p = 0.95, top_k = 20, # For thinking
# #     temperature = 0.7, top_p = 0.8, top_k = 20, # For non thinking
# #     # temperature = 0.01,
# #     streamer = TextStreamer(tokenizer, skip_prompt = True),
# # )

# # text = data["validation"]['prompt'][0]

# from transformers import TextStreamer

# inputs = tokenizer(text, return_tensors="pt").to("cuda")
# input_ids = inputs["input_ids"]  # Extract for convenience

# output_ids = model.generate(
#     **inputs,
#     max_new_tokens=2000,
#     # temperature = 0.6, top_p = 0.95, top_k = 20, # For thinking
#     # temperature = 0.7, top_p = 0.8, top_k = 20, # For non thinking
#     temperature=0.1, # BEST FOR SINGLE ATTEMPTS
# )

# # Slice to skip the prompt portion in output
# generated_tokens = output_ids[0][input_ids.shape[-1]:]
# generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)

# print(generated_text)

In [None]:
# # Use extract_python_code from utils (SOAR approach)
# code = extract_python_code(generated_text)

# if code:
#     print(code)
#     exec(code, globals())  # Defines `transform()` in global scope
# else:
#     raise ValueError("Could not extract Python code from generated text")

In [None]:
# Alternative transform implementations commented out - using model generated version above


In [None]:
# import matplotlib.pyplot as plt
# import numpy as np

# # -------------------- helper --------------------
# def safe_transform(grid):
#     grid = grid.copy()        # <‚Äë‚Äë clone so the original stays unchanged
#     try:
#         return transform(grid)
#     except Exception as err:
#         print(f"[safe_transform] transform() failed ‚Äì {err}")
#         return np.zeros_like(grid)

# # -------------------- test case -----------------
# test_case = {
#     "input": np.array([  # convert to np.array for convenience
#         [5, 0, 5, 5, 0, 0, 5, 0, 5, 0],
#         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#         [0, 0, 0, 0, 0, 0, 0, 0, 0, 5],
#         [0, 0, 0, 0, 0, 0, 0, 0, 0, 5],
#         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#         [0, 0, 0, 0, 0, 0, 0, 0, 0, 5],
#         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#         [0, 0, 0, 0, 0, 0, 0, 0, 0, 5],
#         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#         [0, 0, 0, 0, 0, 0, 0, 0, 0, 5]
#     ]),
#     "output": np.array([
#         [5, 0, 5, 5, 0, 0, 5, 0, 5, 0],
#         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#         [2, 0, 2, 2, 0, 0, 2, 0, 2, 5],
#         [2, 0, 2, 2, 0, 0, 2, 0, 2, 5],
#         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#         [2, 0, 2, 2, 0, 0, 2, 0, 2, 5],
#         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#         [2, 0, 2, 2, 0, 0, 2, 0, 2, 5],
#         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#         [2, 0, 2, 2, 0, 0, 2, 0, 2, 5]
#     ])
# }

# # -------------------- run & plot ----------------
# predicted_output = safe_transform(test_case["input"])

# fig, axs = plt.subplots(1, 3, figsize=(12, 4))
# titles = ["Input", "Predicted Output", "Ground Truth Output"]
# grids  = [test_case["input"], predicted_output, test_case["output"]]

# for ax, grid, title in zip(axs, grids, titles):
#     im = ax.imshow(grid, cmap="viridis", vmin=0, vmax=5)
#     ax.set_title(title)
#     ax.axis("off")

# plt.tight_layout()
# plt.show()

In [None]:
# Inference testing section ends here

<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [None]:
# !pip show transformers

In [None]:
# model.save_pretrained("lora_model")  # Local saving
# tokenizer.save_pretrained("lora_model")
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving

You can use this also to load a checkpoint!!! i.e. an intermediate checkpoint from training, so you can then push it to hub.

Now if you want to load the LoRA adapters we just saved for inference, set `False` to `True`:

In [None]:
## TO MANUALLY LOAD A LORA AND THEN MERGE AND PUSH
# import os
# import unsloth
# from unsloth import FastLanguageModel
# import torch

# checkpoint = 

# if True:
#     from unsloth import FastLanguageModel
#     model, tokenizer = FastLanguageModel.from_pretrained(
#         # model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
#         model_name = f"trainer_output/checkpoint-{checkpoint}",
#         # max_seq_length = 30000,
#         load_in_4bit = False,
#     )
#     run_name = "Qwen3-4B_dsarc-programs-50-full-200-partial_20250807-211749"
#     lora_run_name = run_name + f"-c{checkpoint}"
#     print(f"Pushing to Trelis/{lora_run_name}")
#     model = model.merge_and_unload()
#     model.push_to_hub(f"Trelis/{lora_run_name}")
#     tokenizer.push_to_hub(f"Trelis/{lora_run_name}")

In [None]:
# To merge and push each checkpoint after the run [DEFAULT BEHAVIOUR]
import os, re, torch
from unsloth import FastLanguageModel

ROOT = "trainer_output"
RUN_NAME = run_name
# RUN_NAME = "Qwen3-4B_dsarc-programs-50-full-200-partial_20250807-211749"

# collect checkpoint dirs like checkpoint-12345 and sort by step
ckpts = []
for d in os.listdir(ROOT):
    m = re.fullmatch(r"checkpoint-(\d+)", d)
    if m and os.path.isdir(os.path.join(ROOT, d)):
        ckpts.append((int(m.group(1)), os.path.join(ROOT, d)))
ckpts.sort(key=lambda x: x[0])  # ascending; use reverse=True for newest first

print(f"Found {len(ckpts)} checkpoints:", [s for s, _ in ckpts])

for step, path in ckpts:
    try:
        print(f"\n=== STEP {step} === Loading {path}")
        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name = path,
            load_in_4bit = False,
            # device_map = "auto",   # uncomment if you have GPU available
        )
        repo_id = f"Trelis/{RUN_NAME}-c{step}"
        print(f"Pushing to {repo_id} ‚Ä¶")
        # If you trained with LoRA, keep merge_and_unload(); if full-finetune, drop this line.
        model = model.merge_and_unload()
        model.push_to_hub(repo_id)
        tokenizer.push_to_hub(repo_id)

        # tidy up between checkpoints
        del model
        del tokenizer
        try:
            torch.cuda.empty_cache()
        except Exception:
            pass

    except Exception as e:
        print(f"[WARN] Skipping checkpoint {step}: {e}")

In [None]:
## MANUAL MERGE AFTER THE FACT.
# # pip install -U huggingface_hub transformers peft
# import re, gc, torch
# from huggingface_hub import HfApi
# from transformers import AutoModelForCausalLM, AutoTokenizer
# from peft import PeftModel

# BASE_ID  = "Qwen/Qwen3-4B"
# SRC_REPO = "Trelis/Qwen3-4B_dsarc-programs-50-full-200-incorrect_20250808-134330-trainer"
# REVISION = "main"
# OUT_NS   = "Trelis"
# PRIVATE  = True

# api = HfApi()  # uses HF_TOKEN if set

# def find_checkpoints(repo_id: str, revision: str = "main"):
#     files = api.list_repo_files(repo_id=repo_id, repo_type="model", revision=revision)
#     steps = sorted({int(m.group(1)) for p in files if (m := re.match(r"^checkpoint-(\d+)/", p))})
#     return [f"checkpoint-{s}" for s in steps], files

# def load_tokenizer_for_checkpoint(repo_id: str, subfolder: str):
#     # Try subfolder tokenizer -> repo root tokenizer -> base tokenizer
#     try:
#         return AutoTokenizer.from_pretrained(
#             repo_id, subfolder=subfolder, revision=REVISION, trust_remote_code=True, use_fast=True
#         )
#     except Exception:
#         try:
#             return AutoTokenizer.from_pretrained(
#                 repo_id, revision=REVISION, trust_remote_code=True, use_fast=True
#             )
#         except Exception:
#             return AutoTokenizer.from_pretrained(BASE_ID, trust_remote_code=True, use_fast=True)

# ckpt_dirs, files = find_checkpoints(SRC_REPO, REVISION)
# print("Found checkpoints:", ckpt_dirs)
# if not ckpt_dirs:
#     print("[DEBUG] sample files:", files[:20])

# for sub in ckpt_dirs:
#     step = int(sub.split("-")[1])
#     out_repo = f"{OUT_NS}/{SRC_REPO.split('/',1)[1].replace('-trainer','')}-c{step}"
#     print(f"\n=== {SRC_REPO}/{sub} -> {out_repo} ===")

#     # Load base each time (avoid stacking adapters)
#     base = AutoModelForCausalLM.from_pretrained(
#         BASE_ID, torch_dtype="auto", device_map="auto", trust_remote_code=True, low_cpu_mem_usage=True
#     )

#     # Attach LoRA from checkpoint subfolder
#     peft = PeftModel.from_pretrained(
#         base, model_id=SRC_REPO, subfolder=sub, adapter_name=f"ckpt{step}"
#     )
#     peft.set_adapter(f"ckpt{step}")

#     # Load tokenizer *for this checkpoint*
#     tok = load_tokenizer_for_checkpoint(SRC_REPO, sub)

#     # Optional safety: warn if tokenizer size > embeddings size
#     try:
#         vocab = len(tok)
#         emb = peft.base_model.get_input_embeddings().weight.shape[0]
#         if vocab > emb:
#             print(f"[WARN] tokenizer {vocab} > embeddings {emb} ‚Äî consider resize or base tokenizer")
#         else:
#             print(f"[OK] tokenizer {vocab} <= embeddings {emb} (padding: {emb - vocab})")
#     except Exception:
#         pass

#     # Bake LoRA into base
#     merged = peft.merge_and_unload()  # returns plain Transformers model

#     # Push baked model + that checkpoint's tokenizer
#     merged.push_to_hub(out_repo, private=PRIVATE)
#     tok.push_to_hub(out_repo)
#     print(f"Pushed {out_repo}")

#     del merged, peft, base, tok
#     gc.collect()
#     try: torch.cuda.empty_cache()
#     except: pass


In [None]:
# # Multi-LoRA merge
# # --- 1) Merge the two LoRA adapters with TIES in plain PEFT -------------------
# import torch
# from transformers import AutoModelForCausalLM, AutoTokenizer
# from peft import PeftModel

# base_id = "Qwen/Qwen3-4B"

# # Your two adapters (repos + checkpoint subfolders)
# a1_repo = "Trelis/Qwen3-4B_dsarc-programs-50-full-200-incorrect_20250808-134330-trainer"
# a1_sub  = "checkpoint-2874"
# a1_name = "incorrect2874"

# a2_repo = "Trelis/Qwen3-4B_dsarc-programs-50-full-200-partial_20250807-211749-trainer"
# a2_sub  = "checkpoint-2114"
# a2_name = "partial2114"

# merged_name = f"{a1_name}__{a2_name}_linear"
# out_dir     = f"adapters/qwen3-4b_{merged_name}"
# baked_dir   = f"models/qwen3-4b_{merged_name}_baked"

# # Load base in vanilla Transformers (keeps PEFT ops simple/robust)
# base = AutoModelForCausalLM.from_pretrained(
#     base_id,
#     torch_dtype=torch.bfloat16,   # or "auto"
#     device_map="auto",
#     trust_remote_code=True,       # Qwen models often expect this
# )

# # Attach both adapters
# model = PeftModel.from_pretrained(
#     base,
#     model_id=a1_repo,
#     subfolder=a1_sub,
#     adapter_name=a1_name,
# )
# model.load_adapter(
#     model_id=a2_repo,
#     subfolder=a2_sub,
#     adapter_name=a2_name,
# )

# # TIES merge
# model.add_weighted_adapter(
#     adapters=[a1_name, a2_name],
#     weights=[1.0, 1.0],             # bias later if desired (e.g., [0.7, 0.3])
#     adapter_name=merged_name,
#     combination_type="linear",
#     # density=0.5,                    # good starting point; try 0.2‚Äì0.5
#     # majority_sign_method="total",   # preserves stronger side on conflicts
# )
# model.set_adapter(merged_name)

# # # Save ONLY the merged adapter (compact; ~LoRA size)
# # model.save_pretrained(out_dir, selected_adapters=[merged_name])
# # print("Saved merged adapter to:", out_dir)

# # (Optional) Bake into base weights to get a standalone merged model
# baked = model.merge_and_unload()    # merges the *active* adapter into base
# # baked.save_pretrained(baked_dir)
# # print("Saved baked model to:", baked_dir)

# baked = model.merge_and_unload()
# baked.push_to_hub(f"Trelis/{merged_name}")

# tokenizer = AutoTokenizer.from_pretrained(
#             a1_repo, subfolder=a1_sub
#         )
# tokenizer.push_to_hub(f"Trelis/{merged_name}")


### Saving to float16 for VLLM

We also support saving to `float16` directly. Select `merged_16bit` for float16 or `merged_4bit` for int4. We also allow `lora` adapters as a fallback. Use `push_to_hub_merged` to upload to your Hugging Face account! You can go to https://huggingface.co/settings/tokens for your personal tokens.

In [None]:
from huggingface_hub import HfFolder, login

# Call this at the top of your script / notebook
if HfFolder.get_token() is None:   # no token cached or in $HF_TOKEN
    login()                        # interactive prompt

In [None]:
# print(model)

In [None]:
# # run_name = "Qwen2.5-Coder-7B-Instruct-gemini_synth_50_random_split_1_training-20250723-113848"
# print(f"Pushing to Trelis/{run_name}")

In [None]:
# model = model.merge_and_unload()
# model.push_to_hub(f"Trelis/{run_name}")
# tokenizer.push_to_hub(f"Trelis/{run_name}")

In [None]:
# # Merge to 16bit
# if False:
#     model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
# if True: # Pushing to HF Hub
#     model.push_to_hub_merged(f"Trelis/{run_name}", tokenizer, save_method = "merged_16bit")

# # Merge to 4bit
# if False:
#     model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit",)
# if False: # Pushing to HF Hub
#     model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = "")

# # Just LoRA adapters
# if False:
#     model.save_pretrained("model")
#     tokenizer.save_pretrained("model")
# if False: # Pushing to HF Hub
#     model.push_to_hub("hf/model", token = "")
#     tokenizer.push_to_hub("hf/model", token = "")


In [None]:
# print(tokenizer.chat_template)