# Tool-Calling Fine-Tuning with SFT

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ProfSynapse/Toolset-Training/blob/main/Trainers/notebooks/sft_colab_tool_calling.ipynb)

Train models to use the **Claudesidian-MCP toolset** for Obsidian vault operations.

**Method:** SFT (Supervised Fine-Tuning) - Direct supervision for learning tool-calling behavior

**Recommended GPU:**
- 7B models: T4 (15GB) - Free Colab tier
- 13B models: A100 (40GB) - Colab Pro
- 70B models: A100 (80GB) - Colab Pro+

## 1. Installation

Install Unsloth and dependencies. This takes ~2 minutes.

In [None]:
# Install Unsloth for faster training
%%capture
!pip install unsloth
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [None]:
# Install training dependencies
%%capture
!pip install -U "transformers>=4.45.0"
!pip install "datasets==4.3.0"
!pip install -U accelerate bitsandbytes
!pip install -U trl peft xformers triton

## 2. Mount Google Drive (Optional)

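Save checkpoints to Google Drive so they persist if the runtime disconnects. Section 8 builds the checkpoint path from `DRIVE_OUTPUT_DIR`, so if you skip this step, set that variable to a local folder (for example `/content/SFT_Training`) before continuing.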

In [None]:
import os

from google.colab import drive

# Attach Google Drive to the Colab filesystem.
drive.mount("/content/drive")

# Folder on Drive that receives the training outputs; created if it is missing.
DRIVE_OUTPUT_DIR = "/content/drive/MyDrive/SFT_Training"
os.makedirs(DRIVE_OUTPUT_DIR, exist_ok=True)

# Report what was set up, one line per message.
for status_line in (
    "‚úì Google Drive mounted",
    f"‚úì Checkpoints will be saved to: {DRIVE_OUTPUT_DIR}",
):
    print(status_line)

## 3. HuggingFace Credentials

Add your HF token to Colab secrets:
1. Click the 🔑 key icon in the left sidebar
2. Add new secret: `HF_TOKEN`
3. Get token from https://huggingface.co/settings/tokens

In [None]:
import os

from google.colab import userdata
from huggingface_hub import HfApi

# Read the token stored under the Colab secret "HF_TOKEN" and mirror it into
# the process environment under the same name.
HF_TOKEN = userdata.get("HF_TOKEN")
os.environ["HF_TOKEN"] = HF_TOKEN

# Ask the Hub which account the token belongs to; the "name" field is used
# later to build the upload repository ids.
api = HfApi()
account_info = api.whoami(token=HF_TOKEN)
hf_user = account_info["name"]

print("‚úì HuggingFace token loaded", f"‚úì Username: {hf_user}", sep="\n")

## 4. Model Configuration

In [None]:
# Base checkpoints that can be selected below:
#   7B:  "unsloth/mistral-7b-v0.3-bnb-4bit" (recommended for T4)
#   8B:  "unsloth/llama-3.1-8b-instruct-bnb-4bit"
#   13B: "unsloth/llama-2-13b-bnb-4bit" (requires A100)
MODEL_NAME = "unsloth/mistral-7b-v0.3-bnb-4bit"
MAX_SEQ_LENGTH = 2048

# Hub dataset repository and the JSONL file to read from it.
DATASET_NAME = "professorsynapse/claudesidian-synthetic-dataset"
DATASET_FILE = "syngen_tools_sft_pingpong_11.18.25.jsonl"

# Repository name used for the uploads (prefixed with the HF username).
OUTPUT_MODEL_NAME = "nexus-tools-sft"

# Summarize the selection and the two upload targets.
lora_repo_id = f"{hf_user}/{OUTPUT_MODEL_NAME}"
print(
    f"Model: {MODEL_NAME}",
    f"Dataset: {DATASET_NAME}/{DATASET_FILE}",
    "\nOutput will be uploaded to:",
    f"  - LoRA: {lora_repo_id}",
    f"  - Merged: {lora_repo_id}-merged",
    sep="\n",
)

## 5. Load Model and Tokenizer

In [None]:
from unsloth import FastLanguageModel
import torch

# Report the first CUDA device (index 0): its name, the CUDA version torch
# reports, and its total memory in GiB.
gpu_index = 0

gpu_name = torch.cuda.get_device_name(gpu_index)
print(f"Using GPU: {gpu_name}")

print(f"CUDA version: {torch.version.cuda}")

total_vram_gib = torch.cuda.get_device_properties(gpu_index).total_memory / 1024**3
print(f"Available VRAM: {total_vram_gib:.1f} GB")
print()

In [None]:
# Gather the loader arguments first, then load the 4-bit model and its tokenizer.
load_options = {
    "model_name": MODEL_NAME,
    "max_seq_length": MAX_SEQ_LENGTH,
    "dtype": None,  # NOTE(review): presumably lets Unsloth choose the dtype - confirm
    "load_in_4bit": True,
    "token": HF_TOKEN,
}
model, tokenizer = FastLanguageModel.from_pretrained(**load_options)

print("‚úì Model loaded successfully")

## 6. Apply LoRA Adapters

In [None]:
# Projection layers that receive LoRA adapters.
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

# Wrap the loaded model with LoRA adapters; `model` is rebound to the result.
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=lora_target_modules,
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)

print("‚úì LoRA adapters applied")

## 7. Load and Prepare Dataset

In [None]:
from datasets import load_dataset

# Fetch the configured file from the dataset repository as a single "train" split.
print(f"Loading dataset: {DATASET_NAME}")
dataset = load_dataset(DATASET_NAME, data_files=DATASET_FILE, split="train")

# Show how many rows were loaded and what the first row looks like.
example_count = len(dataset)
print(f"‚úì Loaded {example_count} examples")
print("\nSample:")
first_example = dataset[0]
print(first_example)

# Set a fallback chat template if the tokenizer does not ship one.
# The fallback is ChatML: every message is rendered as
#   <|im_start|>{role}\n{content}<|im_end|>\n
# The role is taken from the message itself, so roles other than user/assistant
# (for example a system prompt) are kept instead of being silently dropped.
# Output for user and assistant messages is unchanged.
if tokenizer.chat_template is None:
    print("\n‚ö†Ô∏è  Tokenizer has no chat template, setting ChatML template...")
    tokenizer.chat_template = (
        "{% for message in messages %}"
        "{{ '<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>\\n' }}"
        "{% endfor %}"
        # Optionally open an assistant turn for generation.
        "{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}"
    )
    print("‚úì Chat template set to ChatML format")
else:
    print("\n‚úì Tokenizer already has chat template")

In [None]:
# Format dataset for SFT training
def format_chat_template(example):
    """Render one example's conversation with the tokenizer's chat template.

    Args:
        example: A dataset row; its ``"conversations"`` value is handed to
            ``tokenizer.apply_chat_template``.

    Returns:
        A dict with a single ``"text"`` key holding the rendered string
        (not tokenized, no generation prompt appended).
    """
    conversation = example["conversations"]
    rendered = tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=False,
        tokenize=False,
    )
    return {"text": rendered}

# Replace every original column with the single rendered "text" column.
original_columns = dataset.column_names
dataset = dataset.map(
    format_chat_template,
    desc="Formatting dataset",
    remove_columns=original_columns,
)

print("‚úì Dataset formatted for training")
print("\nFormatted example (first 500 chars):")
# Preview only the beginning of the first rendered example.
preview = dataset[0]["text"][:500]
print(preview)

## 8. Training Configuration

In [None]:
from trl import SFTTrainer, SFTConfig
from unsloth import is_bfloat16_supported
from datetime import datetime

# Mounting Google Drive is optional, so DRIVE_OUTPUT_DIR may not exist.
# Fall back to a local folder instead of failing with a NameError.
try:
    base_output_dir = DRIVE_OUTPUT_DIR
except NameError:
    base_output_dir = "/content/SFT_Training"
    print(f"DRIVE_OUTPUT_DIR is not defined; saving checkpoints locally under {base_output_dir}")

# Create timestamped output directory so each run writes to its own folder.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = f"{base_output_dir}/{timestamp}"

# Use bf16 when is_bfloat16_supported() says so, otherwise fp16;
# exactly one of the two flags is enabled.
use_bf16 = is_bfloat16_supported()

# Training configuration
training_args = SFTConfig(
    output_dir=output_dir,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    max_grad_norm=1.0,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    num_train_epochs=3,
    max_seq_length=MAX_SEQ_LENGTH,
    packing=False,
    dataset_text_field="text",  # column produced by the dataset formatting step
    fp16=not use_bf16,
    bf16=use_bf16,
    optim="adamw_8bit",
    gradient_checkpointing=True,
    logging_steps=5,
    save_steps=100,
    save_total_limit=3,
    seed=42,
    report_to="none",
)

print("‚úì Training configuration ready")
print(f"  Output directory: {output_dir}")
# Effective batch size = per-device batch size x gradient accumulation steps.
print(f"  Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
print(f"  Learning rate: {training_args.learning_rate}")
print(f"  Epochs: {training_args.num_train_epochs}")

## 9. Initialize Trainer

In [None]:
# Build the SFT trainer from the LoRA-wrapped model, its tokenizer, the
# formatted dataset and the training configuration defined earlier.
trainer_options = {
    "model": model,
    "tokenizer": tokenizer,
    "train_dataset": dataset,
    "args": training_args,
}
trainer = SFTTrainer(**trainer_options)

print("‚úì Trainer initialized")

## 10. Train!

This will take ~45 minutes for 7B models on a T4 GPU.

In [None]:
# Snapshot GPU memory before training: total device memory and the peak
# amount reserved so far, both converted from bytes to GiB.
bytes_per_gib = 1024 ** 3
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / bytes_per_gib, 3)
max_memory = round(gpu_stats.total_memory / bytes_per_gib, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")
print()

# Banner line shared by the start and end markers.
banner = "=" * 60

print(banner, "STARTING TRAINING", banner, sep="\n")
print()

# Run the training loop and keep the value returned by train().
trainer_stats = trainer.train()

print()
print(banner, "TRAINING COMPLETED", banner, sep="\n")

## 11. Upload to HuggingFace

Upload LoRA adapters, merged model, and GGUF quantizations.

In [None]:
# Push the LoRA adapters and then the tokenizer to the same public Hub repository.
lora_repo_id = f"{hf_user}/{OUTPUT_MODEL_NAME}"
for artifact in (model, tokenizer):
    artifact.push_to_hub(lora_repo_id, token=HF_TOKEN, private=False)

print("‚úì LoRA adapters uploaded to HuggingFace")
print(f"  View at: https://huggingface.co/{lora_repo_id}")

In [None]:
# Merge the adapters into the base weights (16-bit) and push the result to a
# separate public repository with the "-merged" suffix.
print(
    "Merging LoRA weights into base model (16-bit)...",
    "This will take ~5 minutes...",
    sep="\n",
)

merged_repo_id = f"{hf_user}/{OUTPUT_MODEL_NAME}-merged"
model.push_to_hub_merged(
    merged_repo_id,
    tokenizer,
    private=False,
    token=HF_TOKEN,
    save_method="merged_16bit",
)

print("‚úì Merged model uploaded to HuggingFace")
print(f"  View at: https://huggingface.co/{merged_repo_id}")

In [None]:
# GGUF quantization variants to produce; all of them are pushed to the same
# repository id that the LoRA adapters use.
quantization_methods = ["q4_k_m", "q5_k_m", "q8_0"]
gguf_repo_id = f"{hf_user}/{OUTPUT_MODEL_NAME}"
variant_count = len(quantization_methods)

print("Creating GGUF quantizations...")
print(f"This will create {variant_count} versions")
print()

# A single call handles every requested quantization method.
model.push_to_hub_gguf(
    gguf_repo_id,
    tokenizer,
    token=HF_TOKEN,
    quantization_method=quantization_methods,
)

print()
print("‚úì GGUF quantizations created and uploaded!")
print(f"  View at: https://huggingface.co/{gguf_repo_id}")

## Done!

Your model has been trained and uploaded to HuggingFace.

**Next steps:**
1. Download GGUF files for Ollama/LM Studio
2. Run the Evaluator to test tool-calling accuracy
3. Deploy to production

**Using your model in LM Studio:**
1. Go to "Discover" tab
2. Search for your username
3. Download Q4_K_M or Q5_K_M version
4. Load and test!