<a href="https://colab.research.google.com/github/ProfSynapse/Toolset-Training/blob/main/kto_colab_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Package Installation and Imports
Install required packages including unsloth and flash-attention, and import necessary libraries for the KTO finetuning process.

In [None]:
# Install required packages
%%capture
!pip install unsloth
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

# Install Flash Attention 2 for softcapping support
import torch
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install --no-deps packaging ninja einops "flash-attn>=2.6.3"

# Import necessary libraries
from unsloth import FastLanguageModel, is_bfloat16_supported
import torch
import os
import re
from typing import List, Literal, Optional
from datasets import load_dataset
from trl import KTOConfig, KTOTrainer
from transformers import TrainingArguments

# Model Loading and Configuration
Load the pre-trained model and tokenizer using FastLanguageModel, and configure basic parameters like sequence length and quantization settings.

In [None]:
# Model Loading and Configuration

# Loading options
load_in_4bit = True  # 4-bit quantization to reduce memory usage; can be False
dtype = None  # None = auto-detect (float16 for Tesla T4/V100, bfloat16 for Ampere+)
max_seq_length = 4096  # any value may be chosen; RoPE scaling is handled by the loader


# Keyword arguments for the loader, gathered in one place
pretrained_options = dict(
    model_name="unsloth/gpt-oss-20b-unsloth-bnb-4bit",  # any supported checkpoint, e.g. mistralai/Mistral-7B-Instruct-v0.2
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token="hf_...",  # needed only for gated models like meta-llama/Llama-2-7b-hf
)

# Load the pre-trained model together with its tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(**pretrained_options)

# Fall back to a simple <|user|>/<|system|>/<|assistant|> template when the
# tokenizer does not define a chat template of its own
if tokenizer.chat_template is None:
    DEFAULT_CHAT_TEMPLATE = "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
    tokenizer.chat_template = DEFAULT_CHAT_TEMPLATE

==((====))==  Unsloth 2024.12.1: Fast Qwen2 patching. Transformers:4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


# Dataset Preparation and Processing
Load and prepare the Claudesidian synthetic dataset from Hugging Face Hub. The dataset contains 1,000 ChatML-formatted examples with boolean labels (true=desirable, false=undesirable). Each example is converted to KTO's unpaired `prompt`/`completion`/`label` format; KTO does not need chosen/rejected pairs, so no pairing or interleaving is performed.

In [None]:
# Dataset Preparation and Processing

# Fetch the Claudesidian synthetic dataset (a single JSONL file) from Hugging Face
DATASET_REPO = "professorsynapse/claudesidian-synthetic-dataset"
DATASET_FILE = "syngen_toolset_v1.0.0_claude.jsonl"

raw_datasets = load_dataset(DATASET_REPO, data_files=DATASET_FILE)
train_dataset = raw_datasets["train"]  # the split processed below

print("=== Dataset Info ===")
print(f"Total examples: {len(train_dataset)}")

# KTO trainer expects: prompt, completion, label format
def prepare_kto_format(example):
    """Flatten one ChatML record into a KTO prompt/completion/label row.

    Only the first ``user`` turn and the first ``assistant`` turn of
    ``example["conversations"]`` are used; turns with any other role, and
    any later user/assistant turns, are ignored.

    Returns:
        A dict with the keys ``prompt``, ``completion`` and ``label``, or
        ``None`` when the conversation has no user turn or no assistant turn.
    """
    turns = example["conversations"]
    label = example["label"]

    # Scan every turn once, remembering the first message seen for each role.
    first_user = None
    first_assistant = None
    for turn in turns:
        role = turn["role"]
        if role == "user":
            if first_user is None:
                first_user = turn
        elif role == "assistant":
            if first_assistant is None:
                first_assistant = turn

    # Without both sides of the exchange there is nothing to train on.
    if first_user is None or first_assistant is None:
        return None

    return {
        "prompt": first_user["content"],
        "completion": first_assistant["content"],
        "label": label,
    }

# Process all examples, dropping the ones prepare_kto_format could not convert
# (it returns None for those)
processed_dataset = [
    row
    for row in (prepare_kto_format(example) for example in train_dataset)
    if row is not None
]

print(f"Processed examples: {len(processed_dataset)}")

# Count distribution. Counter looks keys up by equality, so these tallies
# cover exactly the labels that compare equal to True / False.
from collections import Counter
label_counts = Counter(ex["label"] for ex in processed_dataset)
chosen_count = label_counts[True]
rejected_count = label_counts[False]

print("\nDataset distribution:")
print(f"  Desirable (True): {chosen_count}")
print(f"  Undesirable (False): {rejected_count}")
# Guard the ratio: with no undesirable examples the division would raise
# ZeroDivisionError and abort the cell.
if rejected_count:
    print(f"  Ratio: {chosen_count / rejected_count:.2f}:1")
else:
    print("  Ratio: n/a (no undesirable examples)")

# Create HuggingFace dataset - NO INTERLEAVING NEEDED
# KTO handles imbalanced data via desirable_weight and undesirable_weight
from datasets import Dataset as HFDataset
train_subset = HFDataset.from_dict({
    "prompt": [ex["prompt"] for ex in processed_dataset],
    "completion": [ex["completion"] for ex in processed_dataset],
    "label": [ex["label"] for ex in processed_dataset],
})

print(f"\nFinal KTO dataset: {len(train_subset)} examples")
print(f"Columns: {train_subset.column_names}")
# Show the first row as a sanity check; skipped when nothing survived processing.
if len(train_subset) > 0:
    sample = train_subset[0]
    print("\nSample:")
    print(f"  Prompt: {sample['prompt'][:50]}...")
    print(f"  Completion: {sample['completion'][:50]}...")
    print(f"  Label: {sample['label']}")
else:
    print("\nSample: (dataset is empty)")

# Model Training Setup
Configure the LoRA adapters and set up the KTO trainer with appropriate training arguments. The trainer uses the unpaired Claudesidian dataset (746 desirable / 254 undesirable examples) and applies weighted loss (`desirable_weight=1.0`, `undesirable_weight=2.94`) to handle the 2.94:1 imbalance, achieving ~1:1 effective loss contribution from each class.

In [None]:
# Model Training Setup

# Configure the LoRA adapters
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)

# Prepare model for training
model.config.use_cache = False

# Calculate weight balance for imbalanced dataset
# KTO natively supports imbalanced data via desirable_weight and undesirable_weight.
# The counts are taken from the dataset that is actually trained on rather than
# hard-coded, so the weight stays right if examples were dropped or the data
# changes. With desirable_weight = 1.0 and undesirable_weight = desirable/undesirable,
# both classes contribute about equally to the loss
# (e.g. 746 : 254 -> undesirable_weight ~= 2.94).
desirable_count = sum(1 for flag in train_subset["label"] if flag)
undesirable_count = len(train_subset) - desirable_count
desirable_weight = 1.0
if desirable_count and undesirable_count:
    balance_ratio = desirable_count / undesirable_count
else:
    # One class is missing: there is nothing to balance (and the ratio would
    # be 0 or a division by zero), so fall back to equal weights.
    balance_ratio = 1.0
    print("WARNING: dataset contains only one label class; using equal weights")
undesirable_weight = balance_ratio  # ratio that gives ~1:1 loss balance

print("Dataset Balance Calculation:")
print(f"  Desirable: {desirable_count}, Undesirable: {undesirable_count}")
print(f"  Imbalance ratio: {balance_ratio:.2f}:1")
print(f"  Setting undesirable_weight = {undesirable_weight:.2f} for balanced loss")

# Batch settings, named once so the config and the summary below cannot drift apart
per_device_batch_size = 2
grad_accum_steps = 2

# Initialize KTO trainer
# KTO handles imbalanced data via desirable_weight and undesirable_weight
# No dataset interleaving needed - trainer will properly weight examples
print("\nInitializing KTO trainer...")
kto_trainer = KTOTrainer(
    model=model,
    args=KTOConfig(
        per_device_train_batch_size=per_device_batch_size,
        gradient_accumulation_steps=grad_accum_steps,
        num_train_epochs=1,
        learning_rate=5e-7,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        output_dir="outputs",
        logging_steps=10,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="cosine",
        warmup_ratio=0.1,
        seed=42,
        report_to="none",
        remove_unused_columns=False,
        max_grad_norm=1.0,
        dataloader_num_workers=0,
        max_length=max_seq_length,  # same limit the model was loaded with
        beta=0.1,
        desirable_weight=desirable_weight,
        undesirable_weight=undesirable_weight,
    ),
    train_dataset=train_subset,
    processing_class=tokenizer,
)

print("✓ KTO trainer initialized")
print(f"Dataset: {len(train_subset)} examples")
print(f"  Desirable weight: {desirable_weight}")
print(f"  Undesirable weight: {undesirable_weight:.2f}")
print(f"Max length: {max_seq_length} tokens")
print(
    f"Batch config: size={per_device_batch_size}, accumulation={grad_accum_steps}, "
    f"effective={per_device_batch_size * grad_accum_steps}"
)

# Training Execution
Execute the training process with the configured trainer and monitor the training progress.

In [None]:
#@title Show current memory stats
BYTES_PER_GIB = 1024 ** 3  # divisor for converting byte counts to GB

# Properties of CUDA device 0 and the peak memory reserved so far, both in GB
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / BYTES_PER_GIB, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GIB, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.748 GB.
1.709 GB of memory reserved.


In [None]:
import os
# Request synchronous CUDA kernel launches so an error is reported at the
# call that caused it.
# NOTE(review): this runs after the model was loaded in an earlier cell; CUDA
# presumably reads the variable when it initialises, so setting it here may
# have no effect — confirm, or set it before the first CUDA call.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

print("=== KTO Batch Inspector ===\n")

# Build one batch by hand to see what it looks like
from torch.utils.data import DataLoader

# Create a simple collate function to see what the trainer receives
def debug_collate_fn(batch):
    """Collate raw KTO rows into tensors, printing the batch structure on the way.

    ``batch`` is a list of rows carrying ``prompt``, ``completion`` and
    ``label`` entries. Prompts and completions are tokenized separately with
    the module-level ``tokenizer`` (padded, truncated at 4096 tokens) and
    returned together with the labels as a dict of tensors.
    """
    import torch

    def _encode(texts):
        # Prompts and completions share the same tokenizer settings.
        return tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=4096,
            return_tensors="pt",
        )

    # Split the rows into three parallel columns.
    prompts = []
    completions = []
    labels = []
    for row in batch:
        prompts.append(row['prompt'])
    for row in batch:
        completions.append(row['completion'])
    for row in batch:
        labels.append(row['label'])

    # Character lengths (not token counts) of each text, plus the raw labels.
    prompt_lengths = list(map(len, prompts))
    completion_lengths = list(map(len, completions))
    print(f"\nDEBUG COLLATE:")
    print(f"  Batch size: {len(batch)}")
    print(f"  Prompts: {prompt_lengths}")
    print(f"  Completions: {completion_lengths}")
    print(f"  Labels: {labels}")

    # Tokenize each column
    prompt_encodings = _encode(prompts)
    completion_encodings = _encode(completions)

    print(f"  Prompt input_ids shape: {prompt_encodings['input_ids'].shape}")
    print(f"  Completion input_ids shape: {completion_encodings['input_ids'].shape}")

    # Assemble the output dict; key order matters for the printout below.
    result = {}
    result["prompt_input_ids"] = prompt_encodings["input_ids"]
    result["prompt_attention_mask"] = prompt_encodings["attention_mask"]
    result["completion_input_ids"] = completion_encodings["input_ids"]
    result["completion_attention_mask"] = completion_encodings["attention_mask"]
    result["label"] = torch.tensor(labels)

    print(f"  Returned keys: {result.keys()}")
    for key in result:
        value = result[key]
        if not isinstance(value, torch.Tensor):
            continue
        print(f"    {key}: {value.shape}")

    return result

import traceback

# Test with a single batch
print("Testing with batch size 2:")
dataloader = DataLoader(
    train_subset,
    batch_size=2,
    collate_fn=debug_collate_fn
)

# The batch inspection is a best-effort diagnostic: a failure here is reported
# but does not stop the cell, because the trainer builds its own batches.
try:
    batch = next(iter(dataloader))
except Exception as e:
    print(f"\n✗ Error creating batch: {e}")
    traceback.print_exc()
else:
    print("\n✓ Batch created successfully")

# Now try training
print("\n" + "="*50)
print("Attempting training with CUDA_LAUNCH_BLOCKING=1:")
print("="*50 + "\n")

try:
    kto_trainer.train()
except Exception as e:
    print(f"\n✗ ERROR during training: {type(e).__name__}")
    print(f"   Message: {e}")

    # Try to get more info
    print("\n   Checking trainer state:")
    print(f"   - Dataset size: {len(kto_trainer.train_dataset)}")
    print(f"   - Per-device batch size: {kto_trainer.args.per_device_train_batch_size}")
    print(f"   - Tokenizer pad token: {kto_trainer.processing_class.pad_token}")
    print(f"   - Model device: {next(kto_trainer.model.parameters()).device}")

    # Re-raise once the diagnostics are printed: swallowing the error would
    # let the following cells save and test a model that never finished training.
    raise
else:
    print("\n✓ Training completed successfully!")


# Model Saving and Export
Save the trained model in different formats including LoRA adapters and merged model.

In [None]:
# Model Saving and Export

# Local saving
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving

# Save merged model as float16 or int4
if False: # Set to True to save
    model.save_pretrained_merged("merged_model", tokenizer, save_method = "merged_16bit")
    # model.save_pretrained_merged("merged_model", tokenizer, save_method = "merged_4bit")
    # model.save_pretrained_merged("merged_model", tokenizer, save_method = "lora")

# Save to HuggingFace Hub
if False: # Set to True to save
    model.push_to_hub_merged("your_name/model", tokenizer, save_method = "merged_16bit", token = "...")
    # save_method can be "merged_16bit", "merged_4bit", or "lora"

# Save to GGUF format (for llama.cpp)
if False: # Set to True to save
    from transformers import AutoTokenizer
    model.save_pretrained_merged("merged_model", tokenizer, save_method = "merged_16bit")
    !git clone https://github.com/ggerganov/llama.cpp
    !cd llama.cpp && make
    !python3 llama.cpp/convert.py merged_model/ --outfile model-unsloth.gguf
    # Also supports quantization
    !./llama.cpp/quantize model-unsloth.gguf model-unsloth-Q4_K_M.gguf Q4_K_M

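The next cell applies the ChatML chat template, switches the in-memory model to inference mode, and runs a few sample prompts. (To run inference with the LoRA adapters saved in `lora_model` from a fresh session, load them first with `FastLanguageModel.from_pretrained`.)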

In [None]:
from unsloth.chat_templates import get_chat_template

# Re-wrap the tokenizer with the "chatml" template. The mapping leaves the
# message field names and role names exactly as they are.
chatml_mapping = dict(role="role", content="content", user="user", assistant="assistant")
tokenizer = get_chat_template(tokenizer, chat_template="chatml", mapping=chatml_mapping)

# Put the model into Unsloth's inference mode before generating
FastLanguageModel.for_inference(model)

def generate_response(message):
    """Print ``message`` and stream the model's reply to stdout.

    The message is wrapped as a single user turn, rendered with the
    tokenizer's chat template (with a generation prompt appended) and passed
    to the module-level ``model``.

    Args:
        message: The user prompt text.

    Returns:
        Whatever ``model.generate`` returns for the prompt.
    """
    print("\n" + "="*60 + "\nQUESTION:\n" + "="*60)
    print(message + "\n")
    print("-"*60 + "\nRESPONSE:\n" + "-"*60)

    messages = [{"content": message, "role": "user"}]
    # Place the prompt on the model's own device instead of assuming "cuda".
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize = True,
        add_generation_prompt = True,
        return_tensors = "pt"
    ).to(model.device)

    from transformers import TextStreamer
    text_streamer = TextStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True)
    outputs = model.generate(
        input_ids = inputs,
        streamer = text_streamer,
        # `temperature` only applies to sampling-based decoding; unless the
        # checkpoint's generation config already enables sampling it would be
        # ignored, so request sampling explicitly.
        do_sample = True,
        temperature = 0.1,
        max_new_tokens = 1024,
        use_cache = True
    )
    return outputs

# Sample prompts covering Claudesidian vault operations
questions = [
    # 1. Basic content reading
    "I need to review my meeting notes from yesterday. Can you help me find and read the notes?",

    # 2. Multi-step workflow with workspace context
    "I'm switching to my 'Q4-Planning' workspace. Once switched, create a summary document that lists all my project notes and their status.",

    # 3. Folder operations and organization
    "My notes are getting disorganized. Rename the 'old-drafts' folder to 'archive-2024' and then create a README.md file inside it explaining its purpose.",

    # 4. Search and cross-workspace coordination
    "Search across all my workspaces for notes containing 'roadmap' or 'strategy'. After finding them, create a unified index file that links to all results.",

    # 5. Error handling and recovery
    "I want to create a backup of an important note, but I'm not sure what the exact file path is. Help me find it and then create a backup copy.",
]

# Run every prompt through the model, each under a numbered banner
banner = "=" * 60
for case_number, prompt_text in enumerate(questions, start=1):
    print(f"\n\n{banner}\nTEST CASE {case_number}: Claudesidian Tool Use\n{banner}")
    generate_response(prompt_text)