In [1]:
from datasets import load_dataset
import json
import os
import torch
from transformers import TextStreamer
from utils import UnslothWrapper
from unsloth import FastLanguageModel

  from .autonotebook import tqdm as notebook_tqdm
Skipping import of cpp extensions due to incompatible torch version 2.10.0+cu128 for torchao version 0.15.0             Please see https://github.com/pytorch/ao/issues/2919 for more info

Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel


ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.
ü¶• Unsloth Zoo will now patch everything to make training faster!


In [None]:
import torch 

False

In [3]:
OUTPUT_PATH = "data/prasanna_data.json"

In [4]:
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
NEW_MODEL_NAME = "Prasanna-llama-1B"
MAX_SEQ_LENGTH = 1024
LOAD_IN_4BIT = False

print(f"‚è≥ Loading {MODEL_NAME}...")
model, tokenizer = UnslothWrapper.load_model_and_tokenizer(
    model_name=MODEL_NAME,
    model_type="language",
    max_seq_length=MAX_SEQ_LENGTH,
    load_in_4bit=LOAD_IN_4BIT,
)


model = UnslothWrapper.get_peft_model(
    model=model,
    r=32,
    lora_alpha=64,
    lora_dropout=0.0,
    bias="none",
    use_gradient_checkpointing="unsloth",
)

# dataset loading 
dataset = load_dataset("json", data_files=OUTPUT_PATH, split="train")
dataset = UnslothWrapper.format_chat_dataset(
    dataset=dataset,
    tokenizer=tokenizer,
    messages_field="messages",
    output_field="text",
    add_generation_prompt=False,
)

bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
TRAIN_ARGS = {
    "per_device_train_batch_size": 8,
    "gradient_accumulation_steps": 2,
    "warmup_steps": 10,
    "num_train_epochs": 3,
    "learning_rate": 3e-4,
    "weight_decay": 0.01,
    "lr_scheduler_type": "cosine",
    "logging_steps": 5,
    "optim": "adamw_torch",
    "output_dir": "outputs/unsloth-sft",
    "save_strategy": "no",
    "bf16": bf16,
    "fp16": not bf16,
}

trainer = UnslothWrapper.create_sft_trainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    args_kwargs=TRAIN_ARGS,
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LENGTH,
    dataset_num_proc=2,
    packing=True,
)

print("üöÄ Starting Training...")
# UnslothWrapper.train(trainer=trainer) 

‚è≥ Loading meta-llama/Llama-3.2-1B-Instruct...
==((====))==  Unsloth 2026.1.4: Fast Llama patching. Transformers: 4.57.6.
   \\   /|    NVIDIA GeForce RTX 4050 Laptop GPU. Num GPUs = 1. Max memory: 5.638 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.10.0+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.6.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.34. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 950/950 [00:00<00:00, 16627.19 examples/s]
Unsloth: Tokenizing ["text"] (num_proc=4): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 950/950 [00:01<00:00, 747.43 examples/s] 

ü¶• Unsloth: Padding-free auto-enabled, enabling faster training.
üöÄ Starting Training...





In [5]:
# @title Hello  
model.save_pretrained("smollLLM-sft-adapters")
tokenizer.save_pretrained("smollLLM-sft-adapters")

('smollLLM-sft-adapters/tokenizer_config.json',
 'smollLLM-sft-adapters/special_tokens_map.json',
 'smollLLM-sft-adapters/chat_template.jinja',
 'smollLLM-sft-adapters/vocab.json',
 'smollLLM-sft-adapters/merges.txt',
 'smollLLM-sft-adapters/added_tokens.json',
 'smollLLM-sft-adapters/tokenizer.json')

In [5]:
# save model 
QUANTIZATION_8_METHOD = "q8_0"
QUANTIZATION_6_METHOD= "q6_k"
QUANTIZATION_4_METHOD="q4_k_m"
model.save_pretrained_gguf(
    NEW_MODEL_NAME,
    tokenizer,
    quantization_method = QUANTIZATION_4_METHOD
)

Unsloth: Merging model weights to 16-bit format...
Found HuggingFace hub cache directory: /home/prasanna/.cache/huggingface/hub
Checking cache directory for required files...


Unsloth: Copying 1 files from cache to `Prasanna-llama-1B`: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:01<00:00,  1.61s/it]


Successfully copied all 1 files from cache to `Prasanna-llama-1B`
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 10512.04it/s]
Unsloth: Merging weights into 16bit: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:31<00:00, 31.67s/it]


Unsloth: Merge process complete. Saved to `/home/prasanna/coding/transformers-playground/src/models/my-model/Prasanna-llama-1B`
Unsloth: Converting to GGUF format...
==((====))==  Unsloth: Conversion from HF to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF bf16 might take 3 minutes.
\        /    [2] Converting GGUF bf16 to ['q4_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: llama.cpp found in the system. Skipping installation.
Unsloth: Preparing converter script...


[unsloth_zoo.llama_cpp|ERROR]Unsloth: Error during download or introspection of original script: Failed to execute module convert_hf_to_gguf_original_gguf_qhcrht18 from /home/prasanna/coding/transformers-playground/src/models/my-model/llama.cpp/original_gguf_qhcrht18.py
Traceback (most recent call last):
  File "/home/prasanna/coding/transformers-playground/.venv/lib/python3.12/site-packages/unsloth_zoo/llama_cpp.py", line 515, in _load_module_from_path
    spec.loader.exec_module(module)
  File "<frozen importlib._bootstrap_external>", line 995, in exec_module
  File "<frozen importlib._bootstrap>", line 488, in _call_with_frames_removed
  File "/home/prasanna/coding/transformers-playground/src/models/my-model/llama.cpp/original_gguf_qhcrht18.py", line 8813, in <module>
    class GlmMoeDsaModel(DeepseekV2Model):
  File "/home/prasanna/coding/transformers-playground/src/models/my-model/llama.cpp/original_gguf_qhcrht18.py", line 8814, in GlmMoeDsaModel
    model_arch = gguf.MODEL_ARCH.G

RuntimeError: Unsloth: GGUF conversion failed: Failed during download/introspection of original script: Failed to execute module convert_hf_to_gguf_original_gguf_qhcrht18 from /home/prasanna/coding/transformers-playground/src/models/my-model/llama.cpp/original_gguf_qhcrht18.py

In [9]:
print("\nü§ñ Training Complete! Running Inference Test...")

# Enable native 2x faster inference
FastLanguageModel.for_inference(model) 

messages = [
    {"role": "system", "content": "You are Prasanna's AI Assistant. You answer questions about his professional background, projects, and skills."},
    {"role": "user", "content": "tell me more about prasanna?"}  ,
    {"role": "user", "content": "does prasanna is good developer?"}  
]

# Prepare inputs
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must be True for generation
    return_tensors = "pt",
).to("cuda") # Use "cpu" if you are not on GPU, but "cuda" is recommended for training

# Generate
text_streamer = TextStreamer(tokenizer)
_ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 100, use_cache = True)


ü§ñ Training Complete! Running Inference Test...
<|im_start|>system
You are Prasanna's AI Assistant. You answer questions about his professional background, projects, and skills.<|im_end|>
<|im_start|>user
tell me more about prasanna?<|im_end|>
<|im_start|>user
does prasanna is good developer?<|im_end|>
<|im_start|>assistant
Yes he is a self-taught engineer who dropped out of college and learned everything on his own through building projects and real-world experience.<|im_end|>


In [None]:
# print("‚è≥ Loading Dataset...")
# dataset = load_dataset("json", data_files="data/prasanna_data.json", split="train")

# MODEL_NAME = "HuggingFaceTB/SmolVLM-500M-Instruct"
# MAX_SEQ_LENGTH = 1024 # 135M models don't need massive context
# DTYPE = None 
# LOAD_IN_4BIT = False # 135M is so small, we don't even need 4bit loading!

# model, tokenizer = FastLanguageModel.from_pretrained(
#     model_name = MODEL_NAME,
#     max_seq_length = MAX_SEQ_LENGTH,
#     dtype = DTYPE,
#     load_in_4bit = LOAD_IN_4BIT,
# )

# def formatting_prompts_func(examples):
#     convos = examples["messages"] 
#     texts = [tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False) for convo in convos]
#     return { "text" : texts, }
# dataset = dataset.map(formatting_prompts_func, batched = True,)

# len(dataset)
# for idx in range(len(dataset)):
#     c = formatting_prompts_func({
#         "messages": [dataset[idx]["messages"]]
#     })
#     print(c)