In [1]:
%%capture
import os, re

import torch; v = re.match(r"[0-9\.]{3,}", str(torch.__version__)).group(0)
xformers = "xformers==" + ("0.0.32.post2" if v == "2.8.0" else "0.0.29.post3")
!pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
!pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
!pip install --no-deps unsloth
!pip install transformers==4.55.4

In [1]:
from unsloth import FastLanguageModel
import torch

# Reference list of model ids only: the from_pretrained call below passes its own
# literal `model_name` and does not read this list.
fourbit_models = [
    "unsloth/Qwen3-1.7B-unsloth-bnb-4bit", # Smallest Qwen3 entry in this list
    "unsloth/Qwen3-4B-unsloth-bnb-4bit",
    "unsloth/Qwen3-8B-unsloth-bnb-4bit",
    "unsloth/Qwen3-14B-unsloth-bnb-4bit",
    "unsloth/Qwen3-32B-unsloth-bnb-4bit",

    # 4bit dynamic quants for superior accuracy and low memory use
    # NOTE(review): several ids below have no "-bnb-4bit" suffix, so they are
    # presumably not pre-quantized repos — confirm before relying on this heading.
    "unsloth/gemma-3-12b-it-unsloth-bnb-4bit",
    "unsloth/Phi-4",
    "unsloth/Llama-3.1-8B",
    "unsloth/Llama-3.2-3B",
    "unsloth/orpheus-3b-0.1-ft-unsloth-bnb-4bit" # [NEW] We support TTS models!
] # More models at https://huggingface.co/unsloth

# Load the base model and its tokenizer. The two returned objects are rebound by
# later cells (`model` is wrapped with LoRA adapters, then reloaded after saving).
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen3-8B-unsloth-bnb-4bit",
    max_seq_length = 2048,   # Context length - can be longer, but uses more memory
    load_in_4bit = True,     # 4bit uses much less memory
    load_in_8bit = False,    # A bit more accurate, uses 2x memory
    full_finetuning = False, # We have full finetuning now!
    # token = "hf_...",      # use one if using gated models
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.8.9: Fast Qwen3 patching. Transformers: 4.55.4.
   \\   /|    NVIDIA GeForce RTX 2080 Ti. Num GPUs = 1. Max memory: 11.0 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:02<00:00,  1.36s/it]


In [2]:
# Attach LoRA adapters to the loaded model. The result is assigned back to `model`,
# so every later cell (trainer, generation, saving) sees the adapter-wrapped model.
# NOTE(review): because the input and output share the name `model`, re-running this
# cell without reloading the base model passes the already-wrapped model back in —
# confirm that is acceptable before re-executing.
model = FastLanguageModel.get_peft_model(
    model,
    r = 32,           # Choose any number > 0! Suggested 8, 16, 32, 64, 128
    # Attention projections (q/k/v/o) plus the MLP projections (gate/up/down).
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 32,  # Best to choose alpha = rank or rank*2
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,  # Same seed value as the dataset shuffle and SFTConfig below
    use_rslora = False,   # We support rank stabilized LoRA
    loftq_config = None,  # And LoftQ
)

Unsloth 2025.8.9 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.


In [3]:
import json

# Training conversations, resolved relative to the notebook's working directory.
json_file_path = "data/conversations_output.json"

# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# locale's preferred encoding, which can mis-decode non-ASCII text on some hosts.
# The parsed object is indexed with the "conversations" key by the next cell.
with open(json_file_path, "r", encoding = "utf-8") as f:
    custom_dataset = json.load(f)

In [6]:
# Run every stored conversation through the tokenizer's chat template.
# tokenize = False asks for the rendered text instead of token ids; the result is
# later wrapped in a pandas Series named "text".
raw_conversations = custom_dataset["conversations"]
conversations = tokenizer.apply_chat_template(raw_conversations, tokenize = False)

In [10]:
# Sanity check: number of rendered conversations returned by apply_chat_template.
print(len(conversations))

100


In [18]:
import pandas as pd

# Wrap the rendered conversations in a Series called "text"; that name becomes the
# column name the trainer is pointed at via dataset_text_field.
conversations_series = pd.Series(conversations, name = "text")
print(conversations_series.size)

100


In [19]:
from datasets import Dataset

# Series -> DataFrame -> Hugging Face Dataset, followed by a seeded shuffle so the
# example order is the same on every run.
final_dataset = Dataset.from_pandas(
    pd.DataFrame(conversations_series)
).shuffle(seed = 3407)

In [20]:
from trl import SFTTrainer, SFTConfig

# Build the supervised fine-tuning trainer over the shuffled "text" dataset.
# No evaluation set is configured, so only training loss is reported.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = final_dataset,
    eval_dataset = None, # Can set up evaluation!
    args = SFTConfig(
        dataset_text_field = "text",     # Column produced from the Series named "text"
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, # Use GA to mimic batch size! (2 x 4 = 8 per optimizer step on one GPU)
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 30,                  # The run banner reports this as 3 epochs over the 100 examples
        learning_rate = 2e-4, # Reduce to 2e-5 for long training runs
        logging_steps = 1,               # Log the training loss at every step
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        report_to = "none", # Set to e.g. "wandb" to log to an experiment tracker
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=2): 100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 99.86 examples/s]


In [21]:
# Run the fine-tuning loop and keep whatever train() returns in `trainer_stats`.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 100 | Num Epochs = 3 | Total steps = 30
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 87,293,952 of 8,278,029,312 (1.05% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,3.8639
2,3.8493
3,3.6789
4,3.377
5,3.0733
6,2.7653
7,2.6068
8,2.3672
9,2.1997
10,2.0375


In [25]:
# Prompt used for a single manual generation check after training.
# - The literal is not a raw string, so each `\n\n` below is two real newline characters.
# - It opens with the `/no_think` marker; the chat-template call in this cell also
#   passes enable_thinking = False.
# NOTE(review): the ---CONTEXT--- section is empty even though the instructions say
# to answer only from the provided context — confirm that is intentional.
content = """
/no_think
You are an AI-powered geo-regulation checker. Your task is to analyze 
the provided context to determine if a feature requires geo-specific compliance actions to meet legal requirement. 
If the feature is business driven, select 'No Compliance Logic Needed'. If uncertain, select 'Requires Further Review'. 
Only use the information provided in the context to make your determination. 
Your final answer MUST be in the specified JSON format.
\n\n
---EXAMPLES---
'Feature reads user location to enforce France's copyright rules (download blocking)' - 'Compliance Logic Needed'
'Geofences feature rollout in US for market testing' - 'No Compliance Logic Needed' (Business decision, not regulatory)'
'A video filter feature is available globally except KR' - 'Requires Further Review' (didn't specify the intention, need human evaluation)
---CONTEXT---
\n\n
---USER QUESTION---
Here is the feature and feature description to validate:
Curfew login blocker with ASL and GH for Utah minors, To comply with the Utah Social Media Regulation Act, we are implementing a curfew-based login restriction for users under 18. The system uses ASL to detect minor accounts and routes enforcement through GH to apply only within Utah boundaries. The feature activates during restricted night hours and logs activity using EchoTrace for auditability. This allows parental control to be enacted without user-facing alerts, operating in ShadowMode during initial rollout.
\n\n
"""

from transformers import TextStreamer

# Single-turn chat containing only the user prompt built above.
messages = [{"role" : "user", "content" : content}]

# Render the chat to text, ending with the generation prompt and with thinking disabled.
text = tokenizer.apply_chat_template(
    messages,
    tokenize = False,
    add_generation_prompt = True, # Must add for generation
    enable_thinking = False, # Disable thinking
)

# Tokenize onto the GPU and stream decoded tokens to stdout as they are produced;
# skip_prompt = True keeps the echoed prompt out of the streamed output.
model_inputs = tokenizer(text, return_tensors = "pt").to("cuda")
streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(
    **model_inputs,
    max_new_tokens = 256, # Increase for longer outputs!
    temperature = 0.7, top_p = 0.8, top_k = 20, # For non thinking
    streamer = streamer,
)

{"feature_type": "Legal Requirement", "compliance_status": "Compliance Logic Needed", "reasoning": "This is a direct response to a specific state law, requiring geo-specific enforcement logic and age verification, which are complex to implement."}<|im_end|>


In [26]:
# Local saving: write the model and the tokenizer files side by side in one directory.
lora_dir = "saved_model/lora_model"
model.save_pretrained(lora_dir)
tokenizer.save_pretrained(lora_dir)

('saved_model/lora_model/tokenizer_config.json',
 'saved_model/lora_model/special_tokens_map.json',
 'saved_model/lora_model/chat_template.jinja',
 'saved_model/lora_model/vocab.json',
 'saved_model/lora_model/merges.txt',
 'saved_model/lora_model/added_tokens.json',
 'saved_model/lora_model/tokenizer.json')

In [1]:
from unsloth import FastLanguageModel

# Reload the fine-tuned model in a fresh kernel. The path must be the directory the
# "Local saving" cell wrote to ("saved_model/lora_model"); the previous value
# "lora_model" only resolves if a separate directory of that name happens to exist
# in the working directory.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "saved_model/lora_model", # Directory written by save_pretrained earlier
    max_seq_length = 2048,                 # Same context length as the original load
    load_in_4bit = True,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.8.9: Fast Qwen3 patching. Transformers: 4.55.4.
   \\   /|    NVIDIA GeForce RTX 2080 Ti. Num GPUs = 1. Max memory: 11.0 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:10<00:00,  5.29s/it]
Unsloth 2025.8.9 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.


In [2]:
import shutil
import warnings

# The GGUF export can make Unsloth compile llama.cpp, and llama.cpp's build is
# CMake-based (the old Makefile build was removed). A previous run of this cell
# died part-way through the export with "cmake: not found", so flag a
# missing cmake up front. This is a warning rather than an error because the build
# step may not be needed if llama.cpp is already compiled.
if shutil.which("cmake") is None:
    warnings.warn(
        "cmake was not found on PATH. Unsloth may need it to build llama.cpp for "
        "GGUF export; install it first (e.g. `sudo apt-get install cmake` on "
        "Debian/Ubuntu) to avoid a failure part-way through the export.",
        RuntimeWarning,
    )

# Export to GGUF under "model" with q4_k_m quantization.
model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 7.4 out of 15.57 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


 11%|█████████████▋                                                                                                             | 4/36 [00:00<00:04,  6.92it/s]
We will save to Disk and not RAM now.
 19%|███████████████████████▉                                                                                                   | 7/36 [00:07<00:32,  1.12s/it]
Makefile:6: *** Build system changed:
 The Makefile build has been replaced by CMake.

 For build instructions see:
 https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md

.  Stop.
sh: 1: cmake: not found


make: Entering directory '/mnt/c/techjam2025/finetuning/llama.cpp'
make: Leaving directory '/mnt/c/techjam2025/finetuning/llama.cpp'


RuntimeError: *** Unsloth: Failed compiling llama.cpp using os.system(...) with error 32512. Please report this ASAP!