In [None]:
# STEP 1: Install Unsloth + Dependencies
!pip install -qqq "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
from torch import __version__ as torch_version
from packaging.version import Version as V
xformers = "xformers==0.0.27" if V(torch_version) < V("2.4.0") else "xformers"
!pip install -qqq --no-deps {xformers} trl peft accelerate bitsandbytes triton

# STEP 2: Imports
import torch
from trl import SFTTrainer
from datasets import load_dataset
from transformers import TrainingArguments, TextStreamer
from unsloth.chat_templates import get_chat_template
from unsloth import FastLanguageModel, is_bfloat16_supported

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone



Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth.chat_templates import get_chat_template


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


    PyTorch 2.7.0+cu126 with CUDA 1206 (you have 2.6.0+cu124)
    Python  3.11.12 (you have 3.11.12)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details


🦥 Unsloth Zoo will now patch everything to make training faster!


In [None]:
#!pip install trl==0.15.2 --upgrade --quiet



In [None]:
# STEP 3: Load Instruct Model (chat-aligned)
max_seq_length = 1024
model, tokenizer = FastLanguageModel.from_pretrained(
  model_name = "unsloth/llama-3-8b-Instruct-bnb-4bit",
  max_seq_length = max_seq_length,
  load_in_4bit = True,
  dtype = None,
)

==((====))==  Unsloth 2025.5.2: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [None]:
# STEP 4: Attach LoRA adapters
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    use_rslora=True,
    use_gradient_checkpointing="unsloth",
)

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.5.2 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


In [None]:
# STEP 5: Prepare Tokenizer Template (chatml)
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "chatml",
    mapping = {"role": "from", "content": "value", "user": "human", "assistant": "gpt"},
)

In [None]:
# STEP 6: Load Clean Dataset (Guanaco, 500 examples)
from datasets import load_dataset

# 👇 Force re-download without cache
dataset = load_dataset("mlabonne/guanaco-llama2-1k", split="train[:500]", download_mode="force_redownload")

print(dataset.column_names)  # ✅ should show ['conversations']


data/train-00000-of-00001-9ad84bb9cf65a4(…):   0%|          | 0.00/967k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

['text']


In [None]:
# STEP 7: Set Training Arguments for Free Colab (T4-safe)

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="output",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    num_train_epochs=2,
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    warmup_steps=5,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    report_to="none",
    save_strategy="no",
    seed=42,
)


In [None]:
# STEP 8: Train Model (PEFT + QLoRA)

from trl import SFTTrainer
from transformers import TrainingArguments, AutoTokenizer

# If not already done:
tokenizer.pad_token = tokenizer.eos_token  # Ensure pad token is set

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    args=training_args,
    max_seq_length=max_seq_length,
    formatting_func=lambda example: example["text"],
    tokenizer=tokenizer,  # Still needed for packing
    packing=False,
)


Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/500 [00:00<?, ? examples/s]

In [None]:
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 500 | Num Epochs = 2 | Total steps = 500
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 2 x 1) = 2
 "-____-"     Trainable parameters = 41,943,040/8,000,000,000 (0.52% trained)


Step,Training Loss
1,1.3952
2,0.8766
3,1.9102
4,0.8533
5,2.4817
6,1.5945
7,1.2661
8,1.9738
9,0.1273
10,0.2199


Unsloth: Will smartly offload gradients to save VRAM!


TrainOutput(global_step=500, training_loss=0.036711849099996925, metrics={'train_runtime': 1840.6177, 'train_samples_per_second': 0.543, 'train_steps_per_second': 0.272, 'total_flos': 1.6410543089860608e+16, 'train_loss': 0.036711849099996925})

In [None]:
#STEP 9 : Inference After Training

model = FastLanguageModel.for_inference(model)

# Inference code
messages = [{"from": "human", "value": "What is the difference between permutation and combination?"}]
inputs = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
attention_mask = (inputs != tokenizer.pad_token_id).long()


In [None]:
from transformers import TextStreamer
streamer = TextStreamer(tokenizer)

_ = model.generate(
    input_ids=inputs,
    attention_mask=attention_mask,
    streamer=streamer,
    max_new_tokens=128,
    use_cache=True,
)

<|im_start|>user
What is the difference between permutation and combination?<|im_end|>
<|im_start|>assistant
What is the difference between permutation and combination?

Permutation and combination are two fundamental concepts in mathematics that are used to describe the arrangement of objects in a set.

Permutation:

A permutation is an arrangement of objects in a set where the order of the objects matters. In other words, a permutation is a rearrangement of the objects in a set where the order of the objects is important.

Example:

Suppose we have a set of 3 objects: {a, b, c}. We can permute these objects in the following ways:

* {a, b, c}
* {a, c, b}
* {b


In [None]:
#STEP 10 : Save Locally

trainer.save_model("llama3-finetuned")  # saves model files to a folder


In [1]:
from huggingface_hub import login

# This will prompt you to paste your Hugging Face access token
login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
#STEP 11 : Push to Hugging Face Hub
model.push_to_hub_merged("Sirisha4/llama3-finetuned",
    tokenizer=tokenizer,
    save_method="merged_16bit",  # important: FP16 output
)


Unsloth: You are pushing to hub, but you passed your HF username = Sirisha4.
We shall truncate Sirisha4/llama3-finetuned to llama3-finetuned
Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 5.7G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 4.9 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


 41%|████      | 13/32 [00:01<00:01, 13.80it/s]
We will save to Disk and not RAM now.
100%|██████████| 32/32 [00:58<00:00,  1.83s/it]


Unsloth: Saving tokenizer...

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

 Done.
Unsloth: Saving llama3-finetuned/pytorch_model-00001-of-00004.bin...
Unsloth: Saving llama3-finetuned/pytorch_model-00002-of-00004.bin...
Unsloth: Saving llama3-finetuned/pytorch_model-00003-of-00004.bin...
Unsloth: Saving llama3-finetuned/pytorch_model-00004-of-00004.bin...


README.md:   0%|          | 0.00/593 [00:00<?, ?B/s]

pytorch_model-00003-of-00004.bin:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

pytorch_model-00004-of-00004.bin:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

pytorch_model-00001-of-00004.bin:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00004.bin:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Done.
Saved merged model to https://huggingface.co/Sirisha4/llama3-finetuned
