In [17]:
# %%capture
# !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
# !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
# !pip install --no-deps unsloth

In [2]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-1B-Instruct", # or choose "unsloth/Llama-3.2-1B-Instruct"
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.50.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.10G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.7k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

###  Apply PEFT (LoRA tuning)

This code applies Low-Rank Adaptation (LoRA) to the pre-trained language model using Unsloth’s optimized PEFT (Parameter-Efficient Fine-Tuning) implementation.


In [3]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 32, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.3.19 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


### Dataset - **tatsu-lab/alpaca**
link: https://huggingface.co/datasets/tatsu-lab/alpaca

The Alpaca dataset has the following columns:

"instruction" → The task or query.

"input" → Additional context (sometimes empty).

"output" → The model's response.

In [4]:
from unsloth.chat_templates import get_chat_template
from datasets import load_dataset
from unsloth.chat_templates import standardize_sharegpt

# set tokenizer
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)

# `formatting_prompts_func` to use Alpaca's columns
def formatting_prompts_func(example):
    return {
        "text": f"<|start_header_id|>user<|end_header_id|>\n\n{example['instruction']}\n\n"
                f"<|start_header_id|>assistant<|end_header_id|>\n\n{example['output']}"
    }

#load dataset
dataset = load_dataset("tatsu-lab/alpaca", split = "train")

# standardize
dataset = standardize_sharegpt(dataset)

# Apply formatting
dataset = dataset.map(formatting_prompts_func)

README.md:   0%|          | 0.00/7.47k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


(…)-00000-of-00001-a09b74b3ef9c3b56.parquet:   0%|          | 0.00/24.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/52002 [00:00<?, ? examples/s]

Map:   0%|          | 0/52002 [00:00<?, ? examples/s]

The Alpaca dataset has the following columns:

"instruction" → The task or query.

"input" → Additional context (sometimes empty).

"output" → The model's response.

In [5]:
print(dataset[5])

{'instruction': 'Identify the odd one out.', 'input': 'Twitter, Instagram, Telegram', 'output': 'Telegram', 'text': '<|start_header_id|>user<|end_header_id|>\n\nIdentify the odd one out.\n\n<|start_header_id|>assistant<|end_header_id|>\n\nTelegram'}


### Trainer

In [6]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

# Initialize Trainer
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=60,
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to="none",
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/52002 [00:00<?, ? examples/s]

In [7]:
tokenizer.decode(trainer.train_dataset[5]["input_ids"])

'<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nIdentify the odd one out.\n\n<|start_header_id|>assistant<|end_header_id|>\n\nTelegram'

In [8]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.741 GB.
1.131 GB of memory reserved.


### Train model
trainer_stats = trainer.train()

In [9]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 52,002 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 22,544,384/1,000,000,000 (2.25% trained)


Step,Training Loss
1,4.1003
2,3.1638
3,3.2277
4,3.922
5,3.1845
6,2.2991
7,2.3329
8,2.0742
9,2.1178
10,2.1152


Unsloth: Will smartly offload gradients to save VRAM!


In [10]:
# @title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

61.9739 seconds used for training.
1.03 minutes used for training.
Peak reserved memory = 2.121 GB.
Peak reserved memory for training = 0.99 GB.
Peak reserved memory % of max memory = 14.388 %.
Peak reserved memory for training % of max memory = 6.716 %.


### Test

In [16]:
from unsloth.chat_templates import get_chat_template
FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

messages = [
    {"role": "user", "content": "Continue the Fibonacci sequence: 1, 1, 2, 3, 5, 8,"},
    {"role": "user", "content": "Explain the difference between supervised and unsupervised learning in simple terms."}
]

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt=True)

for message in messages:
    # Convert single message into formatted input
    inputs = tokenizer.apply_chat_template(
        [message],  # Process one message at a time
        tokenize=True,
        add_generation_prompt=True,  # Must add for generation
        return_tensors="pt",
    ).to("cuda")

    print(f"\nUser: {message['content']}")

    # Generate response
    _ = model.generate(
        input_ids=inputs,
        streamer=text_streamer,  # Stream output
        max_new_tokens=128,
        use_cache=True,
        temperature=1.5,
        top_p=0.9  # Corrected from min_p to top_p
    )



User: Continue the Fibonacci sequence: 1, 1, 2, 3, 5, 8,
4, 8, 13, 21, 34, 55, 89, 144.  

These are the first 7 terms. The rest could also be continued using similar formulas.
7, 144, 243, 377, 610, 987, 1597.  

These are the second sequence. 

These are numbers in the golden ratio of Fibonacci. These numbers describe their relationships. Some people believe they describe patterns of the natural world, for they occur frequently in the universe's behavior, from seed shapes to animal shapes, from flowers to trees. Some also find that natural disasters have a

User: Explain the difference between supervised and unsupervised learning in simple terms.
Supervised and unsupervised learning is another way to classify data into two categories - classification and clustering.

Superervised learning involves learning and fitting the rules about the classification for example in images and audio to predict classes such as cat or elephant etc. This way we can predict things.

Unsupervised learni

### Inference

The fine-tuned LLaMA model generates instruction-following responses with improved coherence and alignment, mimicking human-like task completion based on the Alpaca dataset.