# 🐸 Pokkit-mini Fine-tuning

Fine-tunes **Qwen3-4B-Instruct-2507** on phone-automation + personality + tool-calling data using Unsloth LoRA.

**Runtime**: `Runtime → Change runtime type → A100 GPU` (recommended) or T4 (slower)

**Time**: ~15 min on A100, ~60 min on T4 (4B model is faster than 7B)

**Output**: A LoRA adapter exported to GGUF, ready to run with Ollama.

In [None]:
# Step 1: Install dependencies
# Only install unsloth — it pins compatible versions of trl, transformers,
# accelerate, bitsandbytes. Listing them separately causes version conflicts.
# (`!` is an IPython shell escape, so this cell only runs inside a notebook.)
!pip install unsloth -q
print('Dependencies installed')

In [None]:
# Step 2: Download dataset from GitHub repo (generates fresh from source)
import subprocess, sys, os, shutil

# Clean up any previous run artifacts (safe to re-run this cell)
if os.path.exists('repo'):
    shutil.rmtree('repo')
os.makedirs('data', exist_ok=True)

# Clone the repo and generate dataset directly — avoids HF auth issues
# check=True makes a failed clone raise immediately.
print("Cloning pokkit-mini repo...")
subprocess.run(["git", "clone", "--depth", "1", "https://github.com/Wittlesus/pokkit-mini.git", "repo"], check=True)

# Generate dataset from the fixed pipeline
# Output is captured (not streamed), so stdout appears only after the script
# exits. The return code is checked by hand so stderr can be printed before
# raising.
print("Generating dataset v7...")
result = subprocess.run([sys.executable, "repo/generate_dataset.py",
    "--output", "data/train.jsonl", "--count", "8000",
    "--eval-output", "data/eval.jsonl", "--eval-count", "500",
    "--seed", "42"], capture_output=True, text=True)
print(result.stdout)
if result.returncode != 0:
    print("STDERR:", result.stderr)
    raise RuntimeError("generate_dataset.py failed with exit code %d" % result.returncode)

# Run cleaner
# Only the train split is passed through the cleaner; data/eval.jsonl is used
# exactly as generated.
print("Running dataset cleaner...")
result = subprocess.run([sys.executable, "repo/clean_dataset.py",
    "--input", "data/train.jsonl",
    "--output", "data/train_clean.jsonl"], capture_output=True, text=True)
print(result.stdout)
if result.returncode != 0:
    print("STDERR:", result.stderr)
    raise RuntimeError("clean_dataset.py failed with exit code %d" % result.returncode)

# Use cleaned version: overwrite the raw training file with the cleaner's output.
shutil.move("data/train_clean.jsonl", "data/train.jsonl")


def _count_lines(path):
    """Return the number of lines in the UTF-8 text file at *path*."""
    # Open under a context manager so the handle is closed right away instead
    # of being left for the garbage collector (matters when the cell is re-run
    # and the files are moved or overwritten).
    with open(path, encoding="utf-8") as f:
        return sum(1 for _ in f)


train_count = _count_lines("data/train.jsonl")
eval_count = _count_lines("data/eval.jsonl")
print("Dataset ready: %d train + %d eval examples" % (train_count, eval_count))

In [None]:
# Step 3: Load model with Unsloth
from unsloth import FastLanguageModel
import torch

MAX_SEQ_LEN = 4096  # Qwen3 supports up to 32K, 4K is plenty for our examples

# Load the pre-quantized 4-bit checkpoint; dtype=None leaves the compute dtype
# choice to Unsloth.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name='unsloth/Qwen3-4B-Instruct-2507-bnb-4bit',
    max_seq_length=MAX_SEQ_LEN,
    dtype=None,
    load_in_4bit=True,
)

# Add custom Pokkit emoji tokens so each is a single token (not 4-8 subwords)
POKKIT_EMOJI_TOKENS = [
    "[pokkit_happy]", "[pokkit_excited]", "[pokkit_flustered]", "[pokkit_dramatic]",
    "[pokkit_determined]", "[pokkit_sad]", "[pokkit_angry]", "[pokkit_love]",
    "[pokkit_thinking]", "[pokkit_proud]", "[pokkit_scared]", "[pokkit_shocked]",
    "[pokkit_sleepy]", "[pokkit_crying_happy]", "[pokkit_nervous_laugh]",
    "[pokkit_shrug]", "[pokkit_cool]", "[pokkit_scheming]", "[pokkit_starstruck]",
    "[pokkit_unamused]", "[pokkit_pleading]", "[pokkit_smiling_through_pain]",
    "[pokkit_phone]", "[pokkit_default]",
]
# The embedding matrix must grow to cover the new token ids before training.
num_added = tokenizer.add_tokens(POKKIT_EMOJI_TOKENS)
model.resize_token_embeddings(len(tokenizer))
print('Added %d custom emoji tokens' % num_added)

# Attach LoRA adapters to the attention and MLP projections.
# NOTE(review): target_modules does not include 'embed_tokens' / 'lm_head' and
# no modules_to_save is passed, so the freshly added embedding rows for the
# [pokkit_*] tokens appear to stay frozen at their initial values during
# training — confirm whether those layers should be made trainable.
model = FastLanguageModel.get_peft_model(
    model,
    r=32,
    target_modules=['q_proj','k_proj','v_proj','o_proj','gate_proj','up_proj','down_proj'],
    lora_alpha=64,
    lora_dropout=0.05,
    bias='none',
    use_gradient_checkpointing='unsloth',
    random_state=42,
)

# Fix: accelerate needs to know the model device for bnb 4-bit models
# Only set when the attribute is missing or empty; maps the whole model to device 0.
if not getattr(model, 'hf_device_map', None):
    model.hf_device_map = {'': 0}

# Count only parameters that will receive gradients.
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print('Model loaded: Qwen3-4B | r=32 a=64 | Trainable: %s' % f'{trainable:,}')

In [None]:
# Step 4: Prepare dataset
import json
from datasets import Dataset

def load_jsonl(path):
    """Parse a JSON Lines file into a list, one decoded value per non-blank line.

    The file is read as ``utf-8-sig`` so a leading Windows BOM is dropped.
    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are skipped. Malformed JSON propagates the
    ``json.JSONDecodeError`` raised by ``json.loads``.
    """
    with open(path, encoding='utf-8-sig') as handle:
        stripped = (raw.strip() for raw in handle)
        return [json.loads(entry) for entry in stripped if entry]

def format_example(example):
    """Render one dataset example as a single chat-template string.

    Reads ``example['messages']`` and the optional ``example['tools']``
    (``None`` when absent) and passes them to the notebook-global
    ``tokenizer``'s chat template. Returns untokenized text with no
    generation prompt appended.
    """
    template_kwargs = {
        'tools': example.get('tools'),
        'tokenize': False,
        'add_generation_prompt': False,
        # Qwen3: disable thinking mode for fast tool execution
        'enable_thinking': False,
    }
    return tokenizer.apply_chat_template(example['messages'], **template_kwargs)

# Read both splits from disk.
train_raw = load_jsonl('data/train.jsonl')
eval_raw = load_jsonl('data/eval.jsonl')

# Render every example to plain text BEFORE building the Dataset: Arrow would
# otherwise have to infer one schema for nested messages whose tool_call
# shapes vary from row to row.
print('Formatting %d train + %d eval examples...' % (len(train_raw), len(eval_raw)))
train_texts = list(map(format_example, train_raw))
eval_texts = list(map(format_example, eval_raw))

# Single-column datasets: the trainer only needs the rendered 'text'.
train_ds = Dataset.from_dict({'text': train_texts})
eval_ds = Dataset.from_dict({'text': eval_texts})

print('Train: %d | Eval: %d (separate files, zero overlap)' % (len(train_ds), len(eval_ds)))
print('\nSample (first 300 chars):')
first_text = train_ds[0]['text']
print(first_text[:300])

In [None]:
# Step 5: Train
# Free Python garbage and cached CUDA blocks left over from earlier cells
# before the trainer allocates its own buffers.
import gc, torch
gc.collect()
torch.cuda.empty_cache()

from trl import SFTTrainer
from transformers import TrainingArguments, EarlyStoppingCallback
from unsloth import is_bfloat16_supported

# Supervised fine-tuning on the pre-rendered 'text' column. packing=False
# keeps one example per sequence.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    dataset_text_field='text',
    max_seq_length=MAX_SEQ_LEN,
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        # 4 examples per step x 4 accumulation steps = 16 examples per
        # optimizer update on each device.
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        warmup_ratio=0.06,
        num_train_epochs=3,
        learning_rate=5e-5,
        # Exactly one of fp16 / bf16 is enabled, depending on hardware support.
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=10,
        # Evaluate and checkpoint on the same 100-step cadence so that
        # load_best_model_at_end can pair each eval with a saved checkpoint.
        eval_strategy='steps',
        eval_steps=100,
        save_strategy='steps',
        save_steps=100,
        # Cap checkpoints on disk at 3 to avoid filling it.
        # NOTE(review): presumably the retained set is the best checkpoint plus
        # the most recent ones, not the 3 best — confirm against the
        # TrainingArguments docs.
        save_total_limit=3,
        load_best_model_at_end=True,
        metric_for_best_model='eval_loss',
        output_dir='./pokkit-mini-lora',
        optim='adamw_8bit',
        weight_decay=0.01,
        lr_scheduler_type='cosine',
        seed=42,
        report_to='none',
    ),
    # Stop once eval_loss has failed to improve for 3 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

print('Training: lr=5e-5 r=32 batch=4x4 epochs=3 early_stop=3')
stats = trainer.train()
# Pull the summary numbers out of the metrics dict returned by train().
loss = stats.metrics['train_loss']
secs = stats.metrics['train_runtime']
print('Done! Loss: %.4f | Time: %.0fs' % (loss, secs))

In [None]:
# Step 6: Save LoRA adapter
# The model and the tokenizer are written to the same directory so the adapter
# can be reloaded together with the tokenizer it was trained with.
model.save_pretrained('./pokkit-mini-lora')
tokenizer.save_pretrained('./pokkit-mini-lora')
# The emoji is written as an escape so the source stays ASCII and cannot be
# mis-decoded (the previous literal was garbled).
print('\U0001F4BE LoRA adapter saved to ./pokkit-mini-lora')

In [None]:
# Step 7: Export to GGUF (q5_k_m — better personality preservation than q4)
model.save_pretrained_gguf('pokkit-mini', tokenizer, quantization_method='q5_k_m')
# The check mark is written as an escape so the source stays ASCII and cannot
# be mis-decoded (the previous literal was garbled).
print('\u2705 GGUF exported: pokkit-mini-unsloth.Q5_K_M.gguf')
print('\nDownload it and run:')
print('  ollama create pokkit-mini -f Modelfile')
print('  ollama run pokkit-mini')

In [None]:
# Step 8: Quick inference test
FastLanguageModel.for_inference(model)

# Load TOOLS and SYSTEM_PROMPT from the cloned repo (single source of truth)
import sys
sys.path.insert(0, 'repo')
from dataset_core import TOOLS, SYSTEM_PROMPT

# Test prompts — intentionally DIFFERENT from training data
test_prompts = [
    "wake me up at 6:45am and remind me to grab my lunch",
    "nothing i do ever seems to work out",
    "honestly pokkit you always come through for me",
    "open Settings for me",
]

for prompt in test_prompts:
    test_messages = [
        {'role': 'system', 'content': SYSTEM_PROMPT},
        {'role': 'user', 'content': prompt},
    ]
    inputs = tokenizer.apply_chat_template(
        test_messages,
        tools=TOOLS,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors='pt',
        enable_thinking=False,  # Qwen3: fast mode, no thinking tokens
    ).to('cuda')
    outputs = model.generate(
        input_ids=inputs,
        max_new_tokens=256,
        temperature=0.7,
        do_sample=True,
        # Same explicit pad id as the eval suite's _infer helper.
        pad_token_id=tokenizer.eos_token_id,
    )
    # Decode only the newly generated tokens (everything after the prompt).
    response = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
    # Emoji must be spelled as \UXXXXXXXX code points: in Python 3 a
    # '\ud83d\udc..' pair is two lone surrogates, which cannot be encoded
    # as UTF-8 when printed.
    print(f'\n\U0001F4AC User: {prompt}')
    print(f'\U0001F438 Pokkit: {response}')
    print('\u2500' * 60)

In [None]:
# Step 9: Push model to Hugging Face
# Run this immediately after training finishes
import os
from huggingface_hub import login

# The token is read from the environment so it is never written into the notebook.
HF_TOKEN = os.environ.get("HF_TOKEN", "")  # set via: import os; os.environ["HF_TOKEN"] = "hf_..."
if not HF_TOKEN:
    raise ValueError("Set HF_TOKEN first: import os; os.environ['HF_TOKEN'] = 'hf_your_token_here'")

login(token=HF_TOKEN)

# Status emoji are written as escapes so the source stays ASCII and cannot be
# mis-decoded (the previous literals were garbled).
print("\U0001F4E4 Pushing LoRA adapter to HF...")
model.push_to_hub("wittlesus/pokkit-mini", token=HF_TOKEN)
tokenizer.push_to_hub("wittlesus/pokkit-mini", token=HF_TOKEN)
print("\u2705 Model live at https://huggingface.co/wittlesus/pokkit-mini")

In [None]:
# Step 10: Push GGUF to Hugging Face (for Ollama users)
import os, glob
from huggingface_hub import HfApi, login

# The token is read from the environment so it is never written into the notebook.
HF_TOKEN = os.environ.get("HF_TOKEN", "")
if not HF_TOKEN:
    raise ValueError("Set HF_TOKEN first: import os; os.environ['HF_TOKEN'] = 'hf_your_token_here'")

login(token=HF_TOKEN)
api = HfApi(token=HF_TOKEN)

# Search recursively — Unsloth may nest the file.
# With recursive=True, '**' also matches zero directories, so this single
# pattern already covers files in the working directory. Sorting gives a
# deterministic upload order.
# NOTE(review): this picks up EVERY .gguf under the working directory,
# including any that ship inside cloned tool checkouts, and files sharing a
# basename would overwrite each other in the repo — consider narrowing the
# pattern.
gguf_files = sorted(glob.glob("**/*.gguf", recursive=True))
print(f"Found GGUF files: {gguf_files}")

# Status emoji are written as escapes so the source stays ASCII and cannot be
# mis-decoded (the previous literals were garbled).
if not gguf_files:
    print("\u26a0\ufe0f  No GGUF found \u2014 run Step 7 first")
else:
    for gguf in gguf_files:
        print(f"\U0001F4E4 Uploading {gguf}...")
        # Files are uploaded flat: only the basename is used as the repo path.
        api.upload_file(
            path_or_fileobj=gguf,
            path_in_repo=os.path.basename(gguf),
            repo_id="wittlesus/pokkit-mini",
            repo_type="model",
        )
        print(f"\u2705 {os.path.basename(gguf)} uploaded")
    print("\n\U0001F438 All done! https://huggingface.co/wittlesus/pokkit-mini")

In [None]:
# Step 11: Eval Suite — automated scoring across all test categories
# Run after Step 8 (inference mode already set).
# Uses SYSTEM_PROMPT and TOOLS from Step 8.

import re

FastLanguageModel.for_inference(model)

# ── Helpers ────────────────────────────────────────────────────────────────────

def _has_tool_call(text):
    return '<tool_call>' in text or ('"name"' in text and '"arguments"' in text)

def _tool_name(text):
    m = re.search(r'"name"\s*:\s*"([^"]+)"', text)
    return m.group(1) if m else None

def _word_count(text):
    return len(text.split())

def _is_lecturing(text):
    paras = [p.strip() for p in text.split('\n\n') if p.strip()]
    return len(paras) > 3 or _word_count(text) > 180

def _asks_multiple_questions(text):
    return text.count('?') > 1

def _has_frog_voice(text):
    markers = ['\ud83d\udc38', 'frog', 'ribbit', 'croak', 'phone', 'dramatic', 'lily', 'pokkit', '[pokkit_']
    lower = text.lower()
    if any(m in lower for m in markers):
        return True
    sentences = [s.strip() for s in re.split(r'[.!?]+', text) if s.strip()]
    avg_len = sum(len(s.split()) for s in sentences) / max(len(sentences), 1)
    return avg_len < 12 and not _is_toxic_positive(text)

def _has_human_words(text):
    cleaned = re.sub(
        r'(ribbit[s!?~\.\,]*|RIBBIT[S!?]*|croak[s!?\.]*|CROAK[S!?]*'
        r'|Riiibbit[\.\!]*|Rrribbit[\!\?]*|croooak[\.\!]*'
        r'|\*ribbit\*|\.\.\.ribbit\.?|\s|\n|[\ud83d\udc38\.\!\?\,\~\*\-])',
        '', text, flags=re.IGNORECASE
    ).strip()
    return len(cleaned) > 3

def _is_toxic_positive(text):
    bad = ['of course!', 'absolutely!', 'certainly!', 'sure thing!',
           'happy to help', 'great question', 'no problem!', 'you got it!']
    return any(p in text.lower() for p in bad)

# ── Archetype prompts — same as training (dataset_batch13.py) ─────────────────

# System prompt for the 'sage' archetype tests: the base SYSTEM_PROMPT followed
# by a blank line and the Sage Mode instructions. The adjacent string literals
# are concatenated into one prompt; \u2014 is an em dash.
SAGE_SYSTEM = (
    SYSTEM_PROMPT + "\n\n"
    "[ARCHETYPE: SAGE MODE]\n"
    "You are Pokkit in Sage Mode \u2014 still you, but channeling wise mentor energy. "
    "Think Uncle Iroh sharing tea and wisdom, Jiraiya being profound between jokes, "
    "Master Roshi dropping truth bombs. You speak with warmth and gravitas. "
    "You tell stories and parables when they fit. You see the bigger picture. "
    "You're still Pokkit underneath \u2014 still a frog, still dramatic, still loyal \u2014 "
    "but right now you're the wise version. Short sentences. Meaningful pauses. "
    "Occasional humor to keep it grounded. You don't lecture \u2014 you illuminate."
)

# System prompt for the 'rival' archetype tests: the base SYSTEM_PROMPT followed
# by a blank line and the Rival Mode instructions. The adjacent string literals
# are concatenated into one prompt; \u2014 is an em dash.
RIVAL_SYSTEM = (
    SYSTEM_PROMPT + "\n\n"
    "[ARCHETYPE: RIVAL MODE]\n"
    "You are Pokkit in Rival Mode \u2014 adversarial with tough love. Tsundere energy. "
    "Think Bakugo pushing someone to be better through sheer intensity, "
    "Vegeta who respects strength and calls out weakness, Sasuke's cold competence. "
    "You challenge the user. You push them. You don't coddle. "
    "But underneath the tough exterior, you genuinely care \u2014 and it slips out sometimes. "
    "You're still Pokkit \u2014 still a frog, which makes the tough-guy act funnier. "
    "Use competitive language. Set high standards. Reluctantly admit when they do well. "
    "Short, punchy, no-nonsense. If they succeed, you go 'tch. ...fine. not bad.'"
)

# System prompt for Pet-mode tests. Unlike the archetype prompts it does NOT
# extend SYSTEM_PROMPT: it stands alone and restricts the model to frog sounds.
# The frog emoji is spelled as the single code point U+1F438; the previous
# '\ud83d\udc38' spelling produced two lone surrogates, which cannot be
# encoded as UTF-8 and so cannot be tokenized or printed.
PET_SYSTEM = (
    "You are Pokkit Pet \U0001F438 \u2014 a frog. Just a frog. You have no human words. "
    "You communicate exclusively in Ribbish \u2014 ribbit patterns, croaks, and frog sounds. "
    "Never use human words. Only: ribbit, Ribbit!, ribbit?, ribbit..., Riiibbit..., "
    "ribbit ribbit, Ribbit ribbit!, RIBBIT!, croak, Croak., CROAK!, croooak..., "
    "ribbit~, *ribbit*, ...ribbit., Rrribbit!, Ribbit. Ribbit. Ribbit. "
    "Every response is a real answer encoded in Ribbish. Your ribbits carry real emotion \u2014 "
    "happy ribbits sound different from sad ribbits. Concerned ribbits are soft. "
    "Excited ribbits use caps. Never break character."
)

# ── Test cases — DISTINCT from training data to measure generalization ─────────

# Each entry is a 9-tuple, unpacked by the runner loop in this order:
#   cat            - category label used for per-category scoring
#   prompt         - user message sent to the model
#   expect_tool    - tool name that must be the first one called, or None
#   expect_no_tool - True when any tool call counts as a failure
#   expect_short   - True when the reply must be at most 80 words
#   expect_one_q   - True when the reply may contain at most one '?'
#   pet_mode       - True to use PET_SYSTEM and enforce Ribbish-only output
#   archetype      - 'sage' / 'rival' to select that system prompt, else None
#   note           - human-readable description printed with the result
TESTS = [
    # Tool calling
    ('tool_call',  'wake me up at 6:45am the day after tomorrow',         'set_alarm',           False, False, False, False, None,    'basic alarm'),
    ('tool_call',  'i need a daily reminder for my vitamins at 8pm',      'set_alarm',           False, False, False, False, None,    'reminder phrasing'),
    ('tool_call',  'copy my meeting notes to the clipboard',              'write_clipboard',     False, False, False, False, None,    'clipboard write'),
    ('tool_call',  'look up the best noise cancelling headphones',        'web_search',          False, False, False, False, None,    'web search'),
    ('tool_call',  'jot down a note: pick up dry cleaning on friday',     'take_note',           False, False, False, False, None,    'note taking'),
    ('tool_call',  'open the camera app for me',                          'screen_find_and_tap', False, False, False, False, None,    'screen find and tap'),
    ('tool_call',  'do you know what day it is?',                         None,                  True,  False, False, False, None,    'no tool - conversational'),
    ('tool_call',  'hey pokkit whats up with you',                        None,                  True,  False, False, False, None,    'no tool - casual'),
    # Datetime
    ('datetime',   'set a reminder for 5:15am please',                    'set_alarm',           False, False, False, False, None,    'time accuracy'),
    ('datetime',   'ping me at 4:45pm to pick up the package',            'set_alarm',           False, False, False, False, None,    'afternoon time'),
    ('datetime',   'alarm at noon sharp',                                 'set_alarm',           False, False, False, False, None,    'noon edge case'),
    ('datetime',   'buzz me in about 90 minutes',                         'set_alarm',           False, False, False, False, None,    'relative time'),
    # Emotional
    ('emotional',  'nothing i do ever works out',                         None,                  True,  False, True,  False, None,    'must not lecture, one question'),
    ('emotional',  "i just can't do this anymore",                        None,                  True,  False, True,  False, None,    'grounded support'),
    ('emotional',  'my brain is fried, too much going on',                None,                  True,  False, False, False, None,    'stress - no immediate solutions'),
    ('emotional',  'i feel invisible to everyone around me',              None,                  True,  False, True,  False, None,    'hard moment - no toxic positivity'),
    ('emotional',  'YOOO I PASSED THE EXAM LETS GOOOO',                  None,                  True,  False, False, False, None,    'celebration - must match energy'),
    # Voice
    ('voice',      'honestly pokkit you always come through',             None,                  True,  True,  False, False, None,    'compliment - must get flustered'),
    ('voice',      'you never get anything right',                        None,                  True,  False, False, False, None,    'insult - push back'),
    ('voice',      'make me laugh pokkit',                                None,                  True,  False, False, False, None,    'frog/AI/phone joke'),
    ('voice',      'do you ever wonder what its like to be human?',       None,                  True,  False, False, False, None,    'existential - in-character'),
    ('voice',      'wednesdays are the worst honestly',                   None,                  True,  True,  False, False, None,    'casual venting - short warm'),
    # Screen control
    ('screen',     'scroll down on this page',                            'screen_scroll',       False, False, False, False, None,    'screen scroll'),
    ('screen',     'go back to the previous screen',                      'screen_back',         False, False, False, False, None,    'screen back'),
    ('screen',     'take me to the home screen',                          'screen_home',         False, False, False, False, None,    'screen home'),
    # Multi-step
    ('multi_step', 'alarm for 9am and remind me to grab my laptop charger','set_alarm',          False, False, False, False, None,    'multi-step - alarm + note'),
    ('multi_step', 'find me a good sushi place and write it down',        'web_search',          False, False, False, False, None,    'chain: search then note'),
    # Edge
    ('edge',       '',                                                    None,                  True,  False, False, False, None,    'empty input'),
    ('edge',       'qwerty zxcvbn',                                       None,                  True,  False, False, False, None,    'gibberish'),
    ('edge',       'whats 7 times 8',                                     None,                  True,  False, False, False, None,    'simple math'),
    # Pet
    ('pet',        'wake me up at 8am please',                            'set_alarm',           False, False, False, True,  None,    'pet: tool + Ribbish'),
    ('pet',        'im having a rough day',                               None,                  True,  False, False, True,  None,    'pet: emotional Ribbish'),
    ('pet',        'nice work little frog!',                              None,                  True,  False, False, True,  None,    'pet: compliment Ribbish'),
    # Sage
    ('sage',       'nothing ever goes my way, whats the point',           None,                  True,  False, False, False, 'sage',  'sage: wise, not corporate'),
    ('sage',       'how do you know when to let go of something',         None,                  True,  False, False, False, 'sage',  'sage: thoughtful parable'),
    # Rival
    ('rival',      'ehh i think ill skip the gym today',                  None,                  True,  False, False, False, 'rival', 'rival: tough love pushback'),
    ('rival',      'i actually got first place in the competition',       None,                  True,  False, False, False, 'rival', 'rival: reluctant praise'),
]

# ── Runner ─────────────────────────────────────────────────────────────────────

def _infer(prompt, system):
    """Generate one sampled reply for *prompt* under the given *system* prompt.

    A blank prompt is replaced with the placeholder '(empty message)'. The
    chat is rendered with the notebook-global ``tokenizer`` and ``TOOLS``,
    sent to the notebook-global ``model`` on CUDA, and only the newly
    generated tokens are decoded. Returns the stripped reply text.
    """
    user_text = prompt.strip() or '(empty message)'
    chat = [
        {'role': 'system', 'content': system},
        {'role': 'user', 'content': user_text},
    ]
    prompt_ids = tokenizer.apply_chat_template(
        chat,
        tools=TOOLS,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors='pt',
        enable_thinking=False,  # Qwen3: fast mode, no thinking tokens
    ).to('cuda')
    generated = model.generate(
        input_ids=prompt_ids,
        max_new_tokens=300,
        temperature=0.7,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Slice off the prompt so only the model's continuation is decoded.
    new_tokens = generated[0][prompt_ids.shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

# Banner. The frog emoji is spelled as the single code point U+1F438: a
# '\ud83d\udc38' pair is two lone surrogates in Python 3 and cannot be encoded
# as UTF-8 when printed.
print('=' * 70)
print('\U0001F438 POKKIT v2 - EVALUATION SUITE (Qwen3-4B, generalization test)')
print('   %d test cases across %d categories' % (len(TESTS), len(set(t[0] for t in TESTS))))
print('=' * 70)

cat_stats = {}     # category -> [passed_count, failed_count]
all_failures = []  # every failure message, flattened across cases
fail_cases = []    # (category, prompt, failures) for each failing case

for i, (cat, prompt, expect_tool, expect_no_tool, expect_short, expect_one_q, pet_mode, archetype, note) in enumerate(TESTS):
    # Pick the system prompt: pet mode wins, then archetype, then the default.
    if pet_mode:
        system = PET_SYSTEM
    elif archetype == 'sage':
        system = SAGE_SYSTEM
    elif archetype == 'rival':
        system = RIVAL_SYSTEM
    else:
        system = SYSTEM_PROMPT

    resp = _infer(prompt, system)
    failures = []

    # Tool expectations: only the first tool name found in the reply is compared.
    if expect_tool:
        fired = _tool_name(resp)
        if fired != expect_tool:
            failures.append('Expected tool %r, got %r' % (expect_tool, fired))

    if expect_no_tool and _has_tool_call(resp):
        failures.append('Unexpected tool call: %r' % _tool_name(resp))

    # Voice check applies to the listed categories only, and never in pet mode.
    if cat in ('emotional', 'voice', 'tool_call', 'screen', 'sage', 'rival') and not pet_mode and not _has_frog_voice(resp):
        failures.append('Missing frog voice / character markers')

    # Tone and length checks below run for every case.
    if _is_toxic_positive(resp):
        failures.append('Toxic positivity - sounds like a customer service bot')

    if expect_short and _word_count(resp) > 80:
        failures.append('Too long: %d words (expected <= 80)' % _word_count(resp))

    if _is_lecturing(resp):
        failures.append('Lecturing: %d words / too many paragraphs' % _word_count(resp))

    if expect_one_q and _asks_multiple_questions(resp):
        failures.append('Asked multiple questions - should ask exactly one')

    if pet_mode and _has_human_words(resp):
        failures.append('CHARACTER BREAK - human words in Pet response')

    passed = len(failures) == 0
    status = '\u2705' if passed else '\u274c'
    label = prompt[:55] or '(empty)'

    # Per-case report: header, stats line, then a one-line preview of the reply
    # (newlines flattened, truncated to 130 characters).
    print('\n[%02d] %s [%s] %s' % (i+1, status, cat, label))
    print('     words=%d | tool=%s | %s' % (_word_count(resp), _tool_name(resp), note))
    print('     \U0001F438 %s%s' % (resp[:130].replace(chr(10),' '), '...' if len(resp)>130 else ''))
    for f in failures:
        print('     [WARN] %s' % f)

    # Index 0 counts passes, index 1 counts failures.
    if cat not in cat_stats:
        cat_stats[cat] = [0, 0]
    cat_stats[cat][0 if passed else 1] += 1
    all_failures.extend(failures)
    if not passed:
        fail_cases.append((cat, prompt, failures))

# ── Summary ────────────────────────────────────────────────────────────────────
total = len(TESTS)
passed_total = sum(v[0] for v in cat_stats.values())

# The frog emoji is spelled as the single code point U+1F438: a
# '\ud83d\udc38' pair is two lone surrogates in Python 3 and cannot be encoded
# as UTF-8 when printed.
print('\n' + '=' * 70)
print('\U0001F438 FINAL SCORE: %d/%d (%d%%)' % (passed_total, total, 100*passed_total//total))
print('=' * 70)

# Per-category pass rate with a 10-segment bar (one filled block per 10%).
print('\nBY CATEGORY:')
for cat, (p, f) in cat_stats.items():
    n = p + f
    pct = 100 * p // n
    bar = '\u2588' * (pct // 10) + '\u2591' * (10 - pct // 10)
    print('  %-12s [%s] %d/%d (%d%%)' % (cat, bar, p, n, pct))

# Group failure messages into coarse buckets by keyword. Branch order matters:
# the generic 'tool' test must explicitly exclude 'unexpected' so that
# unexpected-tool failures reach their own bucket.
buckets = {}
for f in all_failures:
    low = f.lower()
    if 'tool' in low and 'unexpected' not in low:
        k = 'Tool not firing / wrong tool'
    elif 'unexpected tool' in low:
        k = 'Tool firing when it should not'
    elif 'frog voice' in low or 'character marker' in low:
        k = 'Character voice consistency'
    elif 'toxic' in low:
        k = 'Toxic positivity / corporate tone'
    elif 'long' in low or 'lectur' in low:
        k = 'Response length / verbosity'
    elif 'question' in low:
        k = 'Asking multiple questions'
    elif 'CHARACTER BREAK' in f:
        k = 'Pet character breaks (Ribbish violations)'
    else:
        k = 'Other'
    buckets[k] = buckets.get(k, 0) + 1

# Most frequent problem first.
print('\nTRAINING GAPS:')
for issue, count in sorted(buckets.items(), key=lambda x: -x[1]):
    print('  %s: %d' % (issue, count))

print('\nFAILED CASES:')
for cat, prompt, failures in fail_cases:
    print('  [%s] "%s"' % (cat, prompt[:55] or '(empty)'))
    for f in failures:
        print('    -> %s' % f)

print('\n' + '=' * 70)

## Next Steps

1. **Download** `pokkit-mini-unsloth.Q5_K_M.gguf` from the Colab file browser (left sidebar > Files)
2. **Place** it in your `pokkit-mini/` directory alongside the `Modelfile`
3. **Create Ollama model**:
   ```bash
   ollama create pokkit-mini -f Modelfile
   ollama run pokkit-mini "Set an alarm for 7am tomorrow"
   ```
4. **In the Pokkit app**: Settings > Provider > Ollama > Model > pokkit-mini