In [2]:
from unsloth import FastLanguageModel
from datasets import load_dataset
import torch
import json
import os 
from trl import SFTTrainer
from transformers import TrainingArguments , TextStreamer

ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


ü¶• Unsloth Zoo will now patch everything to make training faster!


In [None]:
# CONFIG
RAW_DATA_PATH = "raw_knowledge.txt"
OUTPUT_PATH = "train.jsonl"
SYSTEM_PROMPT = "You are Prasanna's AI Assistant. You answer questions about his professional background, projects, and skills."

# 1. Create dummy data if file doesn't exist (For this demo)
if not os.path.exists(RAW_DATA_PATH):
    os.makedirs("data", exist_ok=True)
    with open(RAW_DATA_PATH, "w") as f:
        f.write("""
        Prasanna is a software engineer with 3+ years of experience.
        He is building 'Open-Tunnel', a Go-based tool to expose local ports.
        He is proficient in Python, PyTorch, and Linux system administration.
        He is interested in deploying ML models on production environments.
        """)

# 2. Manual Curation (Best for accuracy)
# We manually define Q&A pairs. Automated generation often hallucinates facts.
dataset = [
    {
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": "Who is Prasanna?"},
            {"role": "assistant", "content": "Prasanna is a software engineer with over 3 years of experience specializing in Machine Learning, Python, and Linux systems."}
        ]
    },
    {
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": "Tell me about the Open-Tunnel project."},
            {"role": "assistant", "content": "Open-Tunnel is a project developed by Prasanna using Go (Golang). It is designed to expose local server ports to the internet, similar to ngrok, but as an open-source alternative."}
        ]
    },
    {
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": "What is his tech stack?"},
            {"role": "assistant", "content": "He primarily works with Python and PyTorch for ML tasks, and Go for system tools. He is also an expert in Linux server management and deployment."}
        ]
    }
]



In [3]:
# 3. Save to JSONL
with open(OUTPUT_PATH, "w") as f:
    for entry in dataset:
        json.dump(entry, f)
        f.write("\n")

print(f"‚úÖ Dataset saved to {OUTPUT_PATH} with {len(dataset)} examples.")

‚úÖ Dataset saved to train.jsonl with 3 examples.


In [20]:


# --- CONFIGURATION ---
# We switch to SmolLM2 (135M version)
MODEL_NAME = "HuggingFaceTB/SmolLM2-360M-Instruct"
NEW_MODEL_NAME = "Prasanna-SmolLM-135M"
MAX_SEQ_LENGTH = 1024 # 135M models don't need massive context
DTYPE = None 
LOAD_IN_4BIT = False # 135M is so small, we don't even need 4bit loading!

# 1. Load Model
print(f"‚è≥ Loading {MODEL_NAME}...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME,
    max_seq_length = MAX_SEQ_LENGTH,
    dtype = DTYPE,
    load_in_4bit = LOAD_IN_4BIT,
)

# 2. Add LoRA Adapters
# For such a small model, we target all modules to squeeze out maximum intelligence
model = FastLanguageModel.get_peft_model(
    model,
    r = 32, # Higher rank because the model is small (needs more capacity to learn)
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 32,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth", 
    random_state = 3407,
)

# 3. Load & Format Dataset
dataset = load_dataset("json", data_files="data/train.jsonl", split="train")

def formatting_prompts_func(examples):
    convos = examples["messages"]
    texts = [tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False) for convo in convos]
    return { "text" : texts, }

dataset = dataset.map(formatting_prompts_func, batched = True,)

# 4. Train
print("üöÄ Starting Training...")
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = MAX_SEQ_LENGTH,
    dataset_num_proc = 2,
    packing = True, # Critical for speed
    args = TrainingArguments(
        per_device_train_batch_size = 32, # Huge batch size because model is tiny
        gradient_accumulation_steps = 1,
        warmup_steps = 10,
        max_steps = 50, # Needs more steps to converge
        learning_rate = 1e-3, # Higher LR for smaller models 
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_torch",
        weight_decay = 0.01,
        lr_scheduler_type = "cosine", 
        output_dir = "outputs",
    ),
)

trainer.train()

# 5. Export to GGUF (Q8_0 for Quality)
# Since the model is only 135M, we can use Q8 (High Quality) and it will STILL be small (~150MB)
# print("üì¶ Converting to GGUF (Q8_0)...")
# model.save_pretrained_gguf(NEW_MODEL_NAME, tokenizer, quantization_method = "q8_0")

‚è≥ Loading HuggingFaceTB/SmolLM2-360M-Instruct...
==((====))==  Unsloth 2026.1.4: Fast Llama patching. Transformers: 4.57.6.
   \\   /|    NVIDIA GeForce RTX 4050 Laptop GPU. Num GPUs = 1. Max memory: 5.638 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.10.0+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.6.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.34. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
HuggingFaceTB/SmolLM2-360M-Instruct does not have a padding token! Will use pad_token = <|endoftext|>.


Unsloth 2026.1.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3/3 [00:00<00:00, 946.94 examples/s]


üöÄ Starting Training...
Unsloth: We found double BOS tokens - we shall remove one automatically.


Unsloth: Tokenizing ["text"] (num_proc=2): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3/3 [00:00<00:00,  7.81 examples/s]
The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 3 | Num Epochs = 50 | Total steps = 50
O^O/ \_/ \    Batch size per device = 32 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (32 x 1 x 1) = 32
 "-____-"     Trainable parameters = 17,367,040 of 379,188,160 (4.58% trained)


Step,Training Loss
1,2.5612
2,2.5612
3,2.5482
4,2.4625
5,2.2722
6,2.033
7,1.7882
8,1.5394
9,1.3037
10,1.0649


TrainOutput(global_step=50, training_loss=0.45970685195177796, metrics={'train_runtime': 8.4451, 'train_samples_per_second': 189.458, 'train_steps_per_second': 5.921, 'total_flos': 27788587488000.0, 'train_loss': 0.45970685195177796, 'epoch': 50.0})

In [None]:
# from unsloth import FastLanguageModel
# from datasets import load_dataset
# from trl import SFTTrainer
# from transformers import TrainingArguments
# import torch

# # --- CONFIGURATION ---
# MODEL_NAME = "unsloth/Qwen2.5-0.5B-Instruct"
# NEW_MODEL_NAME = "Prasanna-Qwen-0.5B-v1"
# MAX_SEQ_LENGTH = 2048
# DTYPE = None 
# LOAD_IN_4BIT = False # ‚úÖ Correct. FP16 is faster for 0.5B on T4/A10 GPUs.

# # 1. Load Model
# print("‚è≥ Loading Model...")
# model, tokenizer = FastLanguageModel.from_pretrained(
#     model_name = MODEL_NAME,
#     max_seq_length = MAX_SEQ_LENGTH,
#     dtype = DTYPE,
#     load_in_4bit = LOAD_IN_4BIT,
# )

# # 2. Add LoRA Adapters
# model = FastLanguageModel.get_peft_model(
#     model,
#     r = 16, 
#     target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
#                       "gate_proj", "up_proj", "down_proj"],
#     lora_alpha = 16,
#     lora_dropout = 0,
#     bias = "none",
#     use_gradient_checkpointing = "unsloth",
#     random_state = 3407,
# )

# # 3. Load & Format Dataset
# print("‚è≥ Loading Dataset...")
# dataset = load_dataset("json", data_files="data/train.jsonl", split="train")

# def formatting_prompts_func(examples):
#     convos = examples["messages"]
#     texts = [tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False) for convo in convos]
#     return { "text" : texts, }

# dataset = dataset.map(formatting_prompts_func, batched = True,)

# # 4. Train (OPTIMIZED)
# print("üöÄ Starting Training...")
# trainer = SFTTrainer(
#     model = model,
#     tokenizer = tokenizer,
#     train_dataset = dataset,
#     dataset_text_field = "text",
#     max_seq_length = MAX_SEQ_LENGTH,
#     dataset_num_proc = 2,
    
#     # ‚ö° OPTIMIZATION 1: Packing
#     # Combines short examples into one sequence. 
#     # Since 0.5B is fast, this maximizes GPU compute.
#     packing = True, 

#     args = TrainingArguments(
#         # ‚ö° OPTIMIZATION 2: Batch Size
#         # 0.5B is tiny. On a T4 (16GB), you can easily do batch_size=16 or 32.
#         # This is much faster than batch_size=2.
#         per_device_train_batch_size = 16, 
        
#         # Lower gradient accumulation since batch size is higher
#         gradient_accumulation_steps = 1, 
        
#         # ‚ö° OPTIMIZATION 3: Steps & Scheduler
#         # 60 steps is very short. For small models, 'cosine' scheduler 
#         # usually converges better than 'linear'.
#         warmup_steps = 10,
#         max_steps = 30, # Increased slightly; adjust based on dataset size
        
#         learning_rate = 1e-4,
#         fp16 = not torch.cuda.is_bf16_supported(),
#         bf16 = torch.cuda.is_bf16_supported(),
#         logging_steps = 1,
        
#         # 'adamw_8bit' saves memory, but for 0.5B we have plenty.
#         # Standard 'adamw_torch' might be slightly faster (no de-quant overhead),
#         # but 8bit is fine too. Keeping 8bit for safety.
#         optim = "adamw_8bit",
        
#         weight_decay = 0.01,
#         lr_scheduler_type = "cosine", # Smooth decay for better convergence
#         seed = 3407,
#         output_dir = "outputs",
#     ),
# )

# trainer.train()

# # # 5. Save
# # print(f"üíæ Saving LoRA adapters to {NEW_MODEL_NAME}...")
# # model.save_pretrained(NEW_MODEL_NAME)
# # tokenizer.save_pretrained(NEW_MODEL_NAME)

# # # 6. GGUF Conversion (Since you want this for deployment)
# # print("üì¶ Converting to GGUF (Q4_K_M)...")
# # model.save_pretrained_gguf(NEW_MODEL_NAME, tokenizer, quantization_method = "q4_k_m")

In [27]:
# save model 
QUANTIZATION_8_METHOD = "q8_0"
QUANTIZATION_6_METHOD= "q6_k"
QUANTIZATION_4_METHOD="q4_k_m"
model.save_pretrained_gguf(
    NEW_MODEL_NAME,
    tokenizer,
    quantization_method = QUANTIZATION_4_METHOD
)

Unsloth: Merging model weights to 16-bit format...
Found HuggingFace hub cache directory: /home/prasanna/.cache/huggingface/hub
Checking cache directory for required files...


Unsloth: Copying 1 files from cache to `Prasanna-SmolLM-135M`: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  3.02it/s]


Successfully copied all 1 files from cache to `Prasanna-SmolLM-135M`
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 12157.40it/s]
Unsloth: Merging weights into 16bit: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  1.21it/s]


Unsloth: Merge process complete. Saved to `/home/prasanna/coding/transformers-playground/src/models/my-model/Prasanna-SmolLM-135M`
Unsloth: Converting to GGUF format...
==((====))==  Unsloth: Conversion from HF to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF bf16 might take 3 minutes.
\        /    [2] Converting GGUF bf16 to ['q4_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: llama.cpp found in the system. Skipping installation.
Unsloth: Preparing converter script...
Unsloth: [1] Converting model into bf16 GGUF format.
This might take 3 minutes...
Unsloth: Initial conversion completed! Files: ['SmolLM2-360M-Instruct.BF16.gguf']
Unsloth: [2] Converting GGUF bf16 into q4_k_m. This might take 10 minutes...
Unsloth: Model files cleanup...
Unsloth: All GGUF conversions completed successfully!
Generated files: ['SmolLM2-360M-Instruct.Q4_K_M.gguf']
Unsloth:

{'save_directory': 'Prasanna-SmolLM-135M',
 'gguf_files': ['SmolLM2-360M-Instruct.Q4_K_M.gguf'],
 'modelfile_location': None,
 'want_full_precision': False,
 'is_vlm': False,
 'fix_bos_token': False}

In [23]:
print("\nü§ñ Training Complete! Running Inference Test...")

# Enable native 2x faster inference
FastLanguageModel.for_inference(model) 

messages = [
    {"role": "system", "content": "You are Prasanna's AI Assistant. prasanna was born in 2001, prasanna brother name is jagadesh"},
    {"role": "user", "content": "prasanna brother name?"}  
]

# Prepare inputs
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must be True for generation
    return_tensors = "pt",
).to("cuda") # Use "cpu" if you are not on GPU, but "cuda" is recommended for training

# Generate
text_streamer = TextStreamer(tokenizer)
_ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 64, use_cache = True)


ü§ñ Training Complete! Running Inference Test...
<|im_start|>system
You are Prasanna's AI Assistant. prasanna was born in 2001, prasanna brother name is jagadesh<|im_end|>
<|im_start|>user
prasanna brother name?<|im_end|>
<|im_start|>assistant
Jagadesh is his brother. He is 30 years old and works as a software engineer.<|im_end|>


In [4]:
print("‚è≥ Loading Dataset...")
dataset = load_dataset("json", data_files="data/train.jsonl", split="train")

MODEL_NAME = "unsloth/Llama-3.2-1B-Instruct"
MAX_SEQ_LENGTH = 1024 # 135M models don't need massive context
DTYPE = None 
LOAD_IN_4BIT = False # 135M is so small, we don't even need 4bit loading!

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME,
    max_seq_length = MAX_SEQ_LENGTH,
    dtype = DTYPE,
    load_in_4bit = LOAD_IN_4BIT,
)

def formatting_prompts_func(examples):
    convos = examples["messages"] 
    texts = [tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False) for convo in convos]
    return { "text" : texts, }
dataset = dataset.map(formatting_prompts_func, batched = True,)


for idx in range(len(dataset)):
    c = formatting_prompts_func({
        "messages": [dataset[idx]["messages"]]
    })
    print(c)

‚è≥ Loading Dataset...
==((====))==  Unsloth 2026.1.4: Fast Llama patching. Transformers: 4.57.6.
   \\   /|    NVIDIA GeForce RTX 4050 Laptop GPU. Num GPUs = 1. Max memory: 5.638 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.10.0+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.6.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.34. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3/3 [00:00<00:00, 448.62 examples/s]

{'text': ["<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 09 Feb 2026\n\nYou are Prasanna's AI Assistant. You answer questions about his professional background, projects, and skills.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho is Prasanna?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nPrasanna is a software engineer with over 3 years of experience specializing in Machine Learning, Python, and Linux systems.<|eot_id|>"]}
{'text': ["<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 09 Feb 2026\n\nYou are Prasanna's AI Assistant. You answer questions about his professional background, projects, and skills.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nTell me about the Open-Tunnel project.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nOpen-Tunnel is a project developed by Prasanna using Go (Golang). It is designed t


