<a href="https://colab.research.google.com/github/ProfSynapse/Toolset-Training/blob/main/kto_colab_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Package Installation and Imports
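Install the pinned packages (including Unsloth and Flash Attention) and import the libraries needed for KTO fine-tuning. On a fresh runtime this cell installs everything and then restarts the runtime automatically; re-run the cell after the restart to perform the imports.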

In [None]:
# Install required packages - automatic restart on fresh runtime
import importlib.metadata
import sys
import os

def check_version(package, required_version):
    """Return True if *package* is installed at *required_version*.

    Local build tags (anything after ``+``, e.g. ``+cu121``) are ignored on
    both the installed and the required version. ``required_version`` may be
    a full version or a leading prefix of one; a prefix only matches on a
    component boundary, so ``"4.45.2"`` accepts ``4.45.2`` and
    ``4.45.2.post1`` but not ``4.45.20``.

    Args:
        package: Distribution name as known to ``importlib.metadata``.
        required_version: Version (or version prefix) that must be installed.

    Returns:
        bool: False when the package is missing or its version differs.
    """
    try:
        installed = importlib.metadata.version(package)
    except importlib.metadata.PackageNotFoundError:
        return False

    wanted = required_version.split('+')[0]
    found = installed.split('+')[0]
    if not wanted:
        # An empty requirement matched any installed version before; keep that.
        return True
    # Exact match, or a prefix that ends exactly at a "." separator.
    return found == wanted or found.startswith(wanted + ".")

# Probe the runtime: is the pinned stack already in place?
# packages_ok drives the import-vs-install decision made right after this.
try:
    import torch
    import numpy as np

    if not torch.__version__.startswith("2.4.1"):
        packages_ok = False
    elif not np.__version__.startswith("1.26"):
        packages_ok = False
    else:
        # Remaining pins are looked up through package metadata.
        packages_ok = all(
            check_version(dist_name, pinned)
            for dist_name, pinned in (
                ("transformers", "4.45.2"),
                ("datasets", "2.14.0"),
                ("trl", "0.11.4"),
            )
        )
except ImportError:
    # torch or numpy cannot be imported at all.
    packages_ok = False

if packages_ok:
    print("=" * 60)
    print("âœ“ All packages already installed correctly!")
    print("=" * 60)
    
    # Import everything
    from unsloth import FastLanguageModel, is_bfloat16_supported
    import os
    import re
    from typing import List, Literal, Optional
    from datasets import load_dataset
    from trl import KTOConfig, KTOTrainer
    from transformers import TrainingArguments
    import transformers
    import datasets as ds
    
    print(f"âœ“ PyTorch: {torch.__version__}")
    print(f"âœ“ NumPy: {np.__version__}")
    print(f"âœ“ Transformers: {transformers.__version__}")
    print(f"âœ“ TRL: {importlib.metadata.version('trl')}")
    print(f"âœ“ Datasets: {ds.__version__}")
    print("\n" + "=" * 60)
    print("READY TO PROCEED!")
    print("=" * 60)

else:
    print("=" * 60)
    print("INSTALLING PACKAGES")
    print("=" * 60)
    print("Note: Runtime will auto-restart after installation")
    print("Just re-run this cell after restart - it will skip installation\n")
    
    # Install PyTorch with numpy constraint
    print("[1/6] Installing PyTorch 2.4.1 + CUDA 12.1 with numpy<2.0...")
    !pip install -q torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 "numpy>=1.24.0,<2.0" --index-url https://download.pytorch.org/whl/cu121
    
    # Install core ML packages
    print("[2/6] Installing core ML libraries...")
    !pip install -q transformers==4.45.2 datasets==2.14.0 accelerate==0.27.0 bitsandbytes==0.43.0 peft==0.7.0 trl==0.11.4
    
    # Install utilities
    print("[3/6] Installing utilities...")
    !pip install -q pandas==2.0.0 tqdm==4.65.0 huggingface-hub==0.20.0
    
    # Install unsloth
    print("[4/6] Installing unsloth...")
    !pip install -q "unsloth[cu121-ampere-torch240] @ git+https://github.com/unslothai/unsloth.git"
    
    # Force numpy back to 1.26.x
    print("[5/6] Ensuring numpy compatibility...")
    !pip install -q --force-reinstall "numpy>=1.24.0,<2.0"
    
    # Install Flash Attention
    print("[6/6] Installing Flash Attention 2 (this may take 2-5 minutes)...")
    !pip install -q ninja packaging
    
    # Try to import torch to check GPU capability
    try:
        import torch
        device_capability = torch.cuda.get_device_capability()[0]
        if device_capability >= 8:
            !pip install -q flash-attn==2.5.9 --no-build-isolation
    except:
        print("Note: Flash Attention will be installed after restart")
    
    print("\n" + "=" * 60)
    print("âœ“ INSTALLATION COMPLETE")
    print("=" * 60)
    print("\nðŸ”„ Auto-restarting runtime in 3 seconds...")
    print("After restart, just re-run this cell to continue!\n")
    
    import time
    time.sleep(3)
    
    # Auto-restart the runtime
    os.kill(os.getpid(), 9)

# Model Loading and Configuration
Load the pre-trained model and tokenizer using FastLanguageModel, and configure basic parameters like sequence length and quantization settings.

In [None]:
# Model Loading and Configuration
# Loads the base checkpoint and its tokenizer, then installs a fallback chat
# template when the tokenizer does not define one.

# Set basic parameters (all three are forwarded to from_pretrained below)
max_seq_length = 4096  # Choose any! We auto support RoPE Scaling internally!
dtype = None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.


# Load the pre-trained model and tokenizer
# NOTE(review): the install cell pins transformers==4.45.2 - confirm that
# release can load this checkpoint's architecture before running.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/gpt-oss-20b-unsloth-bnb-4bit",  # Choose ANY! eg mistralai/Mistral-7B-Instruct-v0.3
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token="hf_...",  # use one if using gated models like meta-llama/Llama-2-7b-hf
)

# Add proper chat template if missing
# The fallback renders each user/system/assistant message as "<|role|>\n" +
# content + eos_token, and appends a bare "<|assistant|>" after the last
# message when add_generation_prompt is set. Other roles produce no output.
if tokenizer.chat_template is None:
    DEFAULT_CHAT_TEMPLATE = "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
    tokenizer.chat_template = DEFAULT_CHAT_TEMPLATE

# Dataset Preparation and Processing
Load the combined Claude + Copilot synthetic dataset from Hugging Face Hub. This dataset file (`syngen_tools_11.14.25.jsonl`) contains 4,652 examples combining both Claude (3,214) and Copilot (1,438) tool use conversations. The dataset maintains balanced True/False examples for effective KTO training.

In [None]:
# Dataset Preparation and Processing

# Load the combined Claude + Copilot dataset from HuggingFace.
# Only the single JSONL file named in data_files is requested from the repo.
raw_datasets = load_dataset(
    "professorsynapse/claudesidian-synthetic-dataset",
    data_files="syngen_tools_11.14.25.jsonl"
)
# All rows used below come from the "train" split of that result.
train_dataset = raw_datasets["train"]

# Flatten one ChatML conversation into a KTO (prompt, completion, label) record
def prepare_kto_format(example):
    """Build a KTO training record from a ChatML-style example.

    Only the first user turn and the first assistant turn are used; every
    other message (system turns, later exchanges) is ignored.

    Args:
        example: Mapping with a "conversations" list of messages, each
            carrying "role" and "content", plus a "label" value.

    Returns:
        A dict with "prompt", "completion" and "label" keys, or None when
        the conversation has no user turn or no assistant turn.
    """
    first_user = None
    first_assistant = None

    # One pass over the conversation, remembering the first turn of each role.
    for message in example["conversations"]:
        role = message["role"]
        if role == "user":
            if first_user is None:
                first_user = message
        elif role == "assistant":
            if first_assistant is None:
                first_assistant = message

    if first_user is None or first_assistant is None:
        return None

    record = {
        "prompt": first_user["content"],
        "completion": first_assistant["content"],
        "label": example["label"],
    }
    return record

# Process dataset
# Convert each example exactly once and drop the ones prepare_kto_format
# rejects (it returns None when a user or assistant turn is missing).
processed_dataset = [
    record
    for record in map(prepare_kto_format, train_dataset)
    if record
]

# Verify distribution
desirable = sum(1 for ex in processed_dataset if ex["label"])
undesirable = len(processed_dataset) - desirable

print(f"Dataset: {len(processed_dataset)} examples ({desirable} desirable, {undesirable} undesirable)")
if undesirable:
    print(f"Ratio: {desirable/undesirable:.2f}:1 (desirable:undesirable)")
else:
    # Avoid ZeroDivisionError when every kept example is labelled desirable
    # (or when nothing survived the conversion).
    print("Ratio: n/a (no undesirable examples found)")

# Create HuggingFace dataset with the three columns built above
from datasets import Dataset as HFDataset
train_subset = HFDataset.from_dict({
    "prompt": [ex["prompt"] for ex in processed_dataset],
    "completion": [ex["completion"] for ex in processed_dataset],
    "label": [ex["label"] for ex in processed_dataset],
})

print(f"Ready for training: {len(train_subset)} examples")

# Model Training Setup
Configure LoRA adapters and initialize the KTO trainer with optimized hyperparameters for GPT-OSS 20B.

In [None]:
# Model Training Setup - Configure LoRA and KTO Trainer

# For Mistral-7B: Use r=64, alpha=128
# For GPT-OSS-20B: Use r=128, alpha=256

# Apply LoRA adapters
# `model` is rebound to the adapter-wrapped model returned by get_peft_model.
model = FastLanguageModel.get_peft_model(
    model,
    r=128,  # LoRA rank - GPT-OSS 20B configuration (48 -> 128)
    # Adapters are attached to the modules with these names
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=256,  # LoRA alpha - GPT-OSS 20B configuration (96 -> 256); 2x the rank
    lora_dropout=0.05,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,  # fixed seed value passed to get_peft_model
)

# KTO Training Configuration
from trl import KTOConfig, KTOTrainer

# Hyperparameters for the KTO run; this object is passed to KTOTrainer as `args`.
training_args = KTOConfig(
    output_dir="./kto_output_gpt_oss_20b",
    
    # Batch size optimization
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,  # Effective batch size = 32 (4 x 8 per device)
    
    # KTO-specific parameters
    beta=0.05,  # KTO beta parameter for GPT-OSS 20B
    desirable_weight=1.0,
    undesirable_weight=1.0,
    
    # Learning rate
    learning_rate=5.0e-7,
    max_grad_norm=1.0,
    
    # Sequence lengths
    max_length=4096,
    max_prompt_length=2048,
    
    # Memory optimizations
    gradient_checkpointing=True,
    optim="adamw_8bit",
    # Exactly one of fp16 / bf16 is enabled, depending on bf16 support
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    
    # Training schedule
    # NOTE(review): 1250 steps x effective batch 32 = 40,000 samples. The
    # notebook text cites 4,652 examples (about 145 steps per pass), so this
    # is well beyond the "2 epochs" stated originally - confirm the intent.
    max_steps=1250,  # 2 epochs for balanced dataset
    warmup_steps=125,  # 10% of max_steps (0.10 warmup ratio)
    # NOTE(review): redundant with warmup_steps above; Transformers documents
    # warmup_steps as overriding warmup_ratio - confirm and drop one.
    warmup_ratio=0.10,  # Changed from 0.06 to 0.10
    
    # Logging and saving
    logging_steps=10,
    save_steps=250,
    save_total_limit=2,
    
    # Performance
    dataloader_num_workers=2,
    group_by_length=False,
)

# Initialize KTO Trainer
# TRL renamed the trainer's `tokenizer` argument to `processing_class` in
# later releases, while the install cell pins trl==0.11.4, which predates the
# rename. Pass the tokenizer under whichever keyword the installed KTOTrainer
# actually declares, so the cell works with either.
import inspect

_kto_init_params = inspect.signature(KTOTrainer.__init__).parameters
_tokenizer_kwarg = (
    "processing_class" if "processing_class" in _kto_init_params else "tokenizer"
)

kto_trainer = KTOTrainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    **{_tokenizer_kwarg: tokenizer},
)

In [None]:
# Recap of the run configuration. Apart from the dataset size, the values are
# written out literally here rather than read back from training_args.
summary_lines = [
    "âœ“ KTO trainer initialized",
    f"Dataset: {len(train_subset)} examples",
    "Max length: 4096 tokens",
    "Batch config: size=4, accumulation=8, effective=32",
    "\nGPT-OSS 20B Parameters:",
    "  Learning rate: 5.0e-7",
    "  Warmup steps: 125 (warmup_ratio: 0.10)",
    "  Max steps: 1250 (2 epochs)",
    "  LoRA: r=128, alpha=256",
    "  Beta: 0.05",
]
for summary_line in summary_lines:
    print(summary_line)

# Training Execution
Execute the training process with the configured trainer and monitor the training progress.

In [None]:
# Training Execution

# Enable CUDA error debugging
# NOTE(review): this variable is normally read when CUDA initialises, and the
# model was already loaded onto the GPU in an earlier cell - confirm it still
# takes effect here, or set it at the very top of the notebook instead.
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Start training
print("Starting KTO training...")
print("="*50)

try:
    trainer_output = kto_trainer.train()
except Exception as e:
    print(f"\nâœ— Training failed: {type(e).__name__}")
    print(f"Error: {e}")
    print("\nIf CUDA error persists, check:")
    print("  1. Dataset has mixed True/False labels")
    print("  2. Batch size is compatible with dataset size")
    print("  3. GPU memory is sufficient")
    # Re-raise (as the upload cell does) so "Run all" stops here instead of
    # going on to save and upload a model that never finished training.
    raise
else:
    # Success output lives outside the try so a problem while reporting is
    # not mislabelled as a training failure.
    print("\nâœ“ Training completed successfully!")
    print(f"Final loss: {trainer_output.training_loss:.4f}")

In [None]:
#@title Show current memory stats
# Report device 0's name and total memory, plus the peak amount of memory
# PyTorch has reserved so far, both in GB rounded to three decimals.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 2**30, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 2**30, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

# Model Saving to Hugging Face
Save your trained model directly to Hugging Face Hub in both standard and GGUF formats. Simply provide your account name, model name, and HuggingFace token.

In [None]:
# Model Saving to Hugging Face
import os  # imported here so this cell also runs on its own after a restart

# ============================================================================
# CONFIGURATION: Update these values with your HuggingFace details
# ============================================================================
HF_USERNAME = "your_username"  # e.g., "professorsynapse"
MODEL_NAME = "your_model_name"  # e.g., "claudesidian-gpt-oss-20b-kto"
HF_TOKEN = "hf_..."  # Your HuggingFace write token from https://huggingface.co/settings/tokens
if HF_TOKEN == "hf_...":
    # Placeholder left untouched: fall back to the HF_TOKEN environment
    # variable, so a real write token never has to be saved in the notebook.
    # A token typed into the line above always takes precedence.
    HF_TOKEN = os.environ.get("HF_TOKEN", HF_TOKEN)

# ============================================================================
# Choose save method (recommended: "merged_16bit" for GGUF conversion)
# ============================================================================
# Options:
#   - "merged_16bit": Full precision merged model (required for GGUF)
#   - "merged_4bit": Quantized 4-bit merged model (smaller size)
#   - "lora": Save only LoRA adapters (smallest size, requires base model to use)
SAVE_METHOD = "merged_16bit"

# GGUF Quantization options
CREATE_GGUF = True  # Set to True to create GGUF versions
# One quantized file is produced and uploaded per entry in this list
GGUF_QUANTIZATIONS = ["Q4_K_M", "Q5_K_M", "Q8_0"]  # Recommended quantization levels

# ============================================================================
# Step 1: Upload standard model to HuggingFace
# ============================================================================
# Announce the target repository, push the model with the chosen save method,
# and on any failure print troubleshooting hints before re-raising.
print(f"Uploading model to: {HF_USERNAME}/{MODEL_NAME}")
print(f"Save method: {SAVE_METHOD}")
print("=" * 60)

try:
    upload_target = f"{HF_USERNAME}/{MODEL_NAME}"
    model.push_to_hub_merged(
        upload_target,
        tokenizer,
        save_method=SAVE_METHOD,
        token=HF_TOKEN,
    )
    print("\nâœ“ Model successfully uploaded to Hugging Face!")
    print(f"\nView your model at: https://huggingface.co/{HF_USERNAME}/{MODEL_NAME}")
except Exception as e:
    print(f"\nâœ— Upload failed: {e}")
    for hint in (
        "\nTroubleshooting:",
        "  1. Verify your HF_TOKEN has write permissions",
        "  2. Check that HF_USERNAME is correct",
        "  3. Ensure the model name is valid (alphanumeric and hyphens only)",
    ):
        print(hint)
    raise

# ============================================================================
# Step 2: Create and upload GGUF versions (for llama.cpp)
# ============================================================================
if CREATE_GGUF:
    print("\n" + "=" * 60)
    print("Creating GGUF versions for llama.cpp")
    print("=" * 60)
    
    # Save merged model locally for GGUF conversion
    print("\n[1/4] Saving merged model locally...")
    model.save_pretrained_merged("merged_model", tokenizer, save_method="merged_16bit")
    
    # Clone llama.cpp if not already present
    print("\n[2/4] Setting up llama.cpp...")
    !git clone https://github.com/ggerganov/llama.cpp 2>/dev/null || echo "llama.cpp already exists"
    !cd llama.cpp && make -j 2>/dev/null || echo "llama.cpp already built"
    
    # Convert to GGUF base format
    print("\n[3/4] Converting to GGUF base format...")
    !python llama.cpp/convert_hf_to_gguf.py merged_model/ --outfile model-unsloth.gguf --outtype f16
    
    # Create quantized versions
    print("\n[4/4] Creating quantized versions...")
    for quant in GGUF_QUANTIZATIONS:
        output_file = f"model-unsloth-{quant}.gguf"
        print(f"  - Creating {quant} quantization...")
        !./llama.cpp/llama-quantize model-unsloth.gguf {output_file} {quant}
    
    # Upload GGUF files to HuggingFace
    print("\n" + "=" * 60)
    print("Uploading GGUF files to Hugging Face...")
    print("=" * 60)
    
    from huggingface_hub import HfApi
    api = HfApi()
    
    # Upload base GGUF
    print("\nUploading base GGUF (f16)...")
    api.upload_file(
        path_or_fileobj="model-unsloth.gguf",
        path_in_repo="model-unsloth-f16.gguf",
        repo_id=f"{HF_USERNAME}/{MODEL_NAME}",
        repo_type="model",
        token=HF_TOKEN
    )
    
    # Upload quantized versions
    for quant in GGUF_QUANTIZATIONS:
        output_file = f"model-unsloth-{quant}.gguf"
        print(f"Uploading {quant} quantization...")
        api.upload_file(
            path_or_fileobj=output_file,
            path_in_repo=output_file,
            repo_id=f"{HF_USERNAME}/{MODEL_NAME}",
            repo_type="model",
            token=HF_TOKEN
        )
    
    print("\nâœ“ All GGUF files uploaded successfully!")
    print(f"\nGGUF files available at: https://huggingface.co/{HF_USERNAME}/{MODEL_NAME}/tree/main")
    print(f"\nQuantization levels uploaded:")
    print(f"  - f16 (base, highest quality)")
    for quant in GGUF_QUANTIZATIONS:
        print(f"  - {quant}")
    
    # Cleanup
    print("\nCleaning up temporary files...")
    !rm -rf merged_model model-unsloth*.gguf
    
print("\n" + "=" * 60)
print("âœ“ All uploads complete!")
print("=" * 60)

The cell below switches the fine-tuned model that is still in memory to inference mode and runs a few sample Claudesidian tool-use prompts as a quick sanity check:

In [None]:
from unsloth.chat_templates import get_chat_template

# Rebind the tokenizer to the "chatml" template for the test prompts below.
# NOTE(review): this replaces whatever chat template the tokenizer carried
# since the model-loading cell - confirm chatml is the format this model
# should be prompted with.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "chatml",
    # Identity mapping: message dicts already use "role"/"content" keys and
    # "user"/"assistant" role names
    mapping = {"role": "role", "content": "content", "user": "user", "assistant": "assistant"},
)

# Put the model into Unsloth's inference mode before calling generate
FastLanguageModel.for_inference(model)

def generate_response(message, temperature=0.1, max_new_tokens=1024):
    """Print *message*, stream the model's reply to stdout, and return it.

    The message is sent as a single user turn through the tokenizer's chat
    template with a generation prompt appended. Uses the notebook-level
    ``model`` and ``tokenizer``.

    Args:
        message: The user prompt text.
        temperature: Sampling temperature. Values above 0 enable sampling so
            the temperature actually takes effect; 0 or None selects greedy
            decoding.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The value returned by ``model.generate``.
    """
    # Imported here so the function works regardless of which cells ran first.
    from transformers import TextStreamer

    print("\n" + "="*60 + "\nQUESTION:\n" + "="*60)
    print(message + "\n")
    print("-"*60 + "\nRESPONSE:\n" + "-"*60)

    messages = [{"content": message, "role": "user"}]
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda")

    # Stream only the newly generated text, without special tokens.
    text_streamer = TextStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True)

    # `temperature` is ignored unless sampling is switched on, so enable
    # do_sample whenever a positive temperature is requested.
    if temperature is not None and temperature > 0:
        sampling_kwargs = {"do_sample": True, "temperature": temperature}
    else:
        sampling_kwargs = {"do_sample": False}

    outputs = model.generate(
        input_ids=inputs,
        streamer=text_streamer,
        max_new_tokens=max_new_tokens,
        use_cache=True,
        **sampling_kwargs,
    )
    return outputs

# Sample Claudesidian vault prompts used to spot-check the model
questions = [
    # 1. Find and read existing notes
    "I need to review my meeting notes from yesterday. Can you help me find and read the notes?",

    # 2. Workspace switch followed by document creation
    "I'm switching to my 'Q4-Planning' workspace. Once switched, create a summary document that lists all my project notes and their status.",

    # 3. Folder rename plus a new file inside it
    "My notes are getting disorganized. Rename the 'old-drafts' folder to 'archive-2024' and then create a README.md file inside it explaining its purpose.",

    # 4. Cross-workspace search feeding an index file
    "Search across all my workspaces for notes containing 'roadmap' or 'strategy'. After finding them, create a unified index file that links to all results.",

    # 5. Unknown file path: locate first, then back up
    "I want to create a backup of an important note, but I'm not sure what the exact file path is. Help me find it and then create a backup copy.",
]

# Run every prompt through generate_response under a numbered banner
banner = "=" * 60
for case_number, case_prompt in enumerate(questions, start=1):
    print(f"\n\n{banner}\nTEST CASE {case_number}: Claudesidian Tool Use\n{banner}")
    generate_response(case_prompt)