In [2]:
# Unsloth has to be imported before transformers so that its patches are applied;
# importing it later triggers the "restructure your imports" warning.
from unsloth import FastLanguageModel

import json
import os

import torch
from datasets import load_dataset
from transformers import TextStreamer

from utils import UnslothWrapper

Skipping import of cpp extensions due to incompatible torch version 2.10.0+cu128 for torchao version 0.15.0             Please see https://github.com/pytorch/ao/issues/2919 for more info

Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [4]:
# Path of the JSON chat dataset. Despite the name, this notebook only reads it:
# the training cell passes it to `load_dataset` as `data_files`.
OUTPUT_PATH = "data/prasanna_data.json"

In [5]:
# --- Fine-tuning configuration ------------------------------------------------
MODEL_NAME = "HuggingFaceTB/SmolLM2-360M-Instruct"  # base checkpoint to fine-tune
NEW_MODEL_NAME = "Prasanna-SmolLM-360M"  # name used by the GGUF export cell
MAX_SEQ_LENGTH = 1024
LOAD_IN_4BIT = False

# --- Model --------------------------------------------------------------------
print(f"⏳ Loading {MODEL_NAME}...")
model, tokenizer = UnslothWrapper.load_model_and_tokenizer(
    model_name=MODEL_NAME,
    model_type="language",
    max_seq_length=MAX_SEQ_LENGTH,
    load_in_4bit=LOAD_IN_4BIT,
)

# Wrap the base model with LoRA adapters (rank 32, alpha 64, no dropout, no bias terms).
model = UnslothWrapper.get_peft_model(
    model=model,
    r=32,
    lora_alpha=64,
    lora_dropout=0.0,
    bias="none",
    use_gradient_checkpointing="unsloth",
)

# --- Dataset ------------------------------------------------------------------
# Keep the raw dataset under its own name so re-running the formatting step never
# feeds already-formatted rows back into itself.
raw_dataset = load_dataset("json", data_files=OUTPUT_PATH, split="train")
dataset = UnslothWrapper.format_chat_dataset(
    dataset=raw_dataset,
    tokenizer=tokenizer,
    messages_field="messages",
    output_field="text",
    add_generation_prompt=False,  # training text must not end with an open assistant turn
)

# --- Training arguments -------------------------------------------------------
# Pick the mixed-precision mode from the hardware. fp16 is only enabled when a CUDA
# device is present: the previous `not bf16` also switched it on for CPU-only
# machines, which the Hugging Face training arguments reject.
cuda_available = torch.cuda.is_available()
bf16 = cuda_available and torch.cuda.is_bf16_supported()
fp16 = cuda_available and not bf16

TRAIN_ARGS = {
    "per_device_train_batch_size": 8,
    "gradient_accumulation_steps": 2,  # effective batch size: 8 * 2 = 16 per device
    "warmup_steps": 10,
    "num_train_epochs": 3,
    "learning_rate": 3e-4,
    "weight_decay": 0.01,
    "lr_scheduler_type": "cosine",
    "logging_steps": 5,
    "optim": "adamw_torch",
    "output_dir": "outputs/unsloth-sft",
    "save_strategy": "no",  # no intermediate checkpoints are written
    "bf16": bf16,
    "fp16": fp16,
}

# NOTE(review): the captured log reports tokenization with num_proc=4 although 2 is
# requested here -- confirm whether the wrapper/Unsloth overrides this value.
trainer = UnslothWrapper.create_sft_trainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    args_kwargs=TRAIN_ARGS,
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LENGTH,
    dataset_num_proc=2,
    packing=True,
)

# --- Train --------------------------------------------------------------------
print("🚀 Starting Training...")
# Left as the last expression so the notebook displays the returned training summary.
UnslothWrapper.train(trainer=trainer)

⏳ Loading HuggingFaceTB/SmolLM2-360M-Instruct...
==((====))==  Unsloth 2026.1.4: Fast Llama patching. Transformers: 4.57.6.
   \\   /|    NVIDIA GeForce RTX 4050 Laptop GPU. Num GPUs = 1. Max memory: 5.638 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.10.0+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.6.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.34. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
HuggingFaceTB/SmolLM2-360M-Instruct does not have a padding token! Will use pad_token = <|endoftext|>.


Map: 100%|██████████| 950/950 [00:00<00:00, 20460.23 examples/s]


Unsloth: We found double BOS tokens - we shall remove one automatically.


Unsloth: Tokenizing ["text"] (num_proc=4): 100%|██████████| 950/950 [00:00<00:00, 1895.23 examples/s]
The model is already on multiple devices. Skipping the move to device specified in `args`.


🦥 Unsloth: Padding-free auto-enabled, enabling faster training.
🚀 Starting Training...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 950 | Num Epochs = 3 | Total steps = 180
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 2 x 1) = 16
 "-____-"     Trainable parameters = 17,367,040 of 379,188,160 (4.58% trained)


Step,Training Loss
5,3.029
10,2.6354
15,2.1144
20,1.8645
25,1.5957
30,1.5179
35,1.4253
40,1.3237
45,1.2079
50,1.1221


TrainOutput(global_step=180, training_loss=0.9048647960027059, metrics={'train_runtime': 79.1585, 'train_samples_per_second': 36.004, 'train_steps_per_second': 2.274, 'total_flos': 520889602412160.0, 'train_loss': 0.9048647960027059, 'epoch': 3.0})

In [None]:
# Export the fine-tuned model (and tokenizer) to GGUF.
# Available quantization presets; only the 4-bit preset is used by the call below.
QUANTIZATION_4_METHOD = "q4_k_m"
QUANTIZATION_6_METHOD = "q6_k"
QUANTIZATION_8_METHOD = "q8_0"

model.save_pretrained_gguf(NEW_MODEL_NAME, tokenizer, quantization_method=QUANTIZATION_4_METHOD)

In [7]:
print("\nü§ñ Training Complete! Running Inference Test...")

# Enable native 2x faster inference
FastLanguageModel.for_inference(model) 

messages = [
    {"role": "system", "content": "You are Prasanna's AI Assistant. You answer questions about his professional background, projects, and skills."},
    {"role": "user", "content": "tell me more about prasanna?"}  ,
    {"role": "user", "content": "what does copy cat project do ?"}  
]

# Prepare inputs
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must be True for generation
    return_tensors = "pt",
).to("cuda") # Use "cpu" if you are not on GPU, but "cuda" is recommended for training

# Generate
text_streamer = TextStreamer(tokenizer)
_ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 100, use_cache = True)


🤖 Training Complete! Running Inference Test...
<|im_start|>system
You are Prasanna's AI Assistant. You answer questions about his professional background, projects, and skills.<|im_end|>
<|im_start|>user
tell me more about prasanna?<|im_end|>
<|im_start|>user
what does copy cat project do ?<|im_end|>
<|im_start|>assistant
Copy Cat is a smart clipboard companion built with Python, PyQt6, and OpenAI Whisper via faster-whisper. It integrates with VS Code and keeps track of all open files, projects, and clipboard histories.<|im_end|>


In [None]:
# NOTE: disabled scratch experiment -- every line in this cell is commented out.
# It loaded "HuggingFaceTB/SmolVLM-500M-Instruct" and printed the chat-templated text
# of every dataset row. The output saved under this cell shows empty
# System/User/Assistant turns, i.e. the message contents did not make it into the
# rendered text.
# NOTE(review): presumably this chat template expects a different message format -- confirm.

# print("⏳ Loading Dataset...")
# dataset = load_dataset("json", data_files="data/prasanna_data.json", split="train")

# MODEL_NAME = "HuggingFaceTB/SmolVLM-500M-Instruct"
# MAX_SEQ_LENGTH = 1024 # small models like this one don't need massive context
# DTYPE = None
# LOAD_IN_4BIT = False # the model is small enough that 4bit loading isn't needed

# model, tokenizer = FastLanguageModel.from_pretrained(
#     model_name = MODEL_NAME,
#     max_seq_length = MAX_SEQ_LENGTH,
#     dtype = DTYPE,
#     load_in_4bit = LOAD_IN_4BIT,
# )

# def formatting_prompts_func(examples):
#     convos = examples["messages"]
#     texts = [tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False) for convo in convos]
#     return { "text" : texts, }
# dataset = dataset.map(formatting_prompts_func, batched = True,)

# len(dataset)
# for idx in range(len(dataset)):
#     c = formatting_prompts_func({
#         "messages": [dataset[idx]["messages"]]
#     })
#     print(c)

⏳ Loading Dataset...
==((====))==  Unsloth 2026.1.4: Fast Idefics3 patching. Transformers: 4.57.6.
   \\   /|    NVIDIA GeForce RTX 4050 Laptop GPU. Num GPUs = 1. Max memory: 5.638 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.10.0+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.6.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.34. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: QLoRA and full finetuning all not selected. Switching to 16bit LoRA.
{'text': ['<|im_start|>System: <end_of_utterance>\nUser: <end_of_utterance>\nAssistant: <end_of_utterance>\n']}
{'text': ['<|im_start|>System: <end_of_utterance>\nUser: <end_of_utterance>\nAssistant: <end_of_utterance>\n']}
{'text': ['<|im_start|>System: <end_of_utterance>\nUser: <end_of_utterance>\nAssistant: <end_of_utterance>\n']}
{'text': ['<|im_start|>System: <end_of_utterance>\nUser: <end_of_utterance>\nAssistan