In [None]:
# Installs Unsloth, Xformers (Flash Attention) and other dependencies
# %%capture
!pip install unsloth
# Also get the latest nightly Unsloth for Llama 3.2 support
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

# Verify installation
!unsloth version

Collecting xformers<0.0.27
  Using cached xformers-0.0.26.post1.tar.gz (4.1 MB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting trl<0.9.0
  Using cached trl-0.8.6-py3-none-any.whl.metadata (11 kB)
Using cached trl-0.8.6-py3-none-any.whl (245 kB)
Building wheels for collected packages: xformers


  Building wheel for xformers (setup.py) ... [?25l[?25hdone
  Created wheel for xformers: filename=xformers-0.0.26.post1-cp312-cp312-linux_x86_64.whl size=13331208 sha256=2233d695c363e712e518aad88350dbf310b4b82e3a6b95c96deeb1532530ae1b
  Stored in directory: /root/.cache/pip/wheels/c4/a9/4f/803f19d92c4fc04c2bf65b0ea573a6672ccec24add7c905a60
Successfully built xformers
Installing collected packages: xformers, trl
  Attempting uninstall: xformers
    Found existing installation: xformers 0.0.33.post1
    Uninstalling xformers-0.0.33.post1:
      Successfully uninstalled xformers-0.0.33.post1
  Attempting uninstall: trl
    Found existing installation: trl 0.23.0
    Uninstalling

In [None]:
%%capture
# We install unsloth directly from github to get the latest compatibility
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# We install the other libraries without forcing old versions that break on Python 3.12
!pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes

In [None]:
# Sanity check: confirm the kernel can import Unsloth and report which release is active.
import unsloth

installed_version = unsloth.__version__
print(installed_version)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
2025.11.3


In [None]:
from unsloth import FastLanguageModel
import torch

# Loading options. These stay module-level because later cells reuse them
# (the trainer reads max_seq_length).
max_seq_length = 2048  # Llama 3.2 supports up to 128k tokens, but 2048 suits home GPUs
dtype = None           # None = let Unsloth auto-detect based on the GPU
load_in_4bit = True    # 4-bit quantised weights, for the 4x memory saving

# Collect the keyword arguments first so the load call itself stays a one-liner.
load_config = dict(
    model_name = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

model, tokenizer = FastLanguageModel.from_pretrained(**load_config)

==((====))==  Unsloth 2025.11.3: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

In [None]:
# Attention (q/k/v/o) and MLP (gate/up/down) projection layers that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the base model with LoRA adapters.
# NOTE: this rebinds `model`, so the cell is meant to run once per freshly loaded model.
model = FastLanguageModel.get_peft_model(
    model,
    target_modules = lora_target_modules,
    r = 16,  # LoRA rank: a higher rank might be smarter but takes more VRAM
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_rslora = False,
    loftq_config = None,
    use_gradient_checkpointing = "unsloth",  # important for saving memory
    random_state = 3407,
)

Unsloth 2025.11.3 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


## Data Preparation

In [None]:
from datasets import load_dataset
from unsloth.chat_templates import get_chat_template

# 1. Attach the Llama 3 chat template to the tokenizer.
#    The "llama-3.1" template works for Llama 3.2 as well.
tokenizer = get_chat_template(tokenizer, chat_template = "llama-3.1")

def formatting_prompts_func(examples):
    """Render every conversation of a batch into one chat-formatted string.

    Reads the ``"conversations"`` column of ``examples`` and runs each entry
    through the module-level ``tokenizer``'s ``apply_chat_template`` with
    ``tokenize = False`` (text is returned, not token ids) and
    ``add_generation_prompt = False`` (no trailing assistant header, since the
    text is used as training data).

    Returns:
        A dict ``{"text": [...]}`` holding one rendered entry per conversation.
    """
    rendered = []
    for conversation in examples["conversations"]:
        rendered.append(
            tokenizer.apply_chat_template(
                conversation,
                tokenize = False,
                add_generation_prompt = False,
            )
        )
    return { "text" : rendered }

# 2. Load the local JSONL file
dataset = load_dataset("json", data_files="sample_data/concatenated_datasets.jsonl", split="train")

# 3. Normalise the column name: formatting_prompts_func reads "conversations",
#    so rename "messages" to it unless a "conversations" column already exists.
if "messages" in dataset.column_names and "conversations" not in dataset.column_names:
    dataset = dataset.rename_column("messages", "conversations")

# Fail fast with an actionable message instead of a KeyError raised from inside map().
if "conversations" not in dataset.column_names:
    raise ValueError(
        'Expected a "conversations" or "messages" column in the dataset, '
        f"found: {dataset.column_names}"
    )

# 4. Apply formatting (adds a "text" column with the chat-templated conversation)
dataset = dataset.map(formatting_prompts_func, batched = True)

print("Success! First example formatted:")
print(dataset[0]["text"])

Map:   0%|          | 0/13872 [00:00<?, ? examples/s]

Success! First example formatted:
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

Say, Jim, how about going for a few beers after dinner?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

You know that is tempting but is really not good for our fitness.<|eot_id|><|start_header_id|>user<|end_header_id|>

What do you mean? It will help us to relax.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Do you really think so? I don't. It will just make us fat and act silly. Remember last time?<|eot_id|><|start_header_id|>user<|end_header_id|>

I guess you are right. But what shall we do? I don't feel like sitting at home.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

I suggest a walk over to the gym where we can play singsong and meet some of our friends.<|eot_id|><|start_header_id|>user<|end_header_id|>

That's a good idea. I hear Mary and Sal

## Fine Tuning

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Choose the mixed-precision mode once: bf16 where the GPU supports it, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

training_args = TrainingArguments(
    output_dir = "outputs",
    report_to = "none",
    seed = 3407,
    # Batch of 2 per device, accumulated over 4 steps.
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    # Training length: one full pass over the data.
    # max_steps = 60, # <--- alternative: cap the run at a fixed number of steps
    num_train_epochs = 1,
    # Optimiser and schedule.
    optim = "adamw_8bit",
    learning_rate = 2e-4,
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    warmup_steps = 5,
    # Precision.
    bf16 = use_bf16,
    fp16 = not use_bf16,
    logging_steps = 1,
)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    args = training_args,
    train_dataset = dataset,
    dataset_text_field = "text",  # column produced by formatting_prompts_func
    dataset_num_proc = 2,
    max_seq_length = max_seq_length,
    packing = False,  # Can speed up training for short sequences
)

trainer.train()

In [None]:
# Switch the model to Unsloth's inference mode (native 2x faster inference).
FastLanguageModel.for_inference(model)

# A single-turn conversation used as a smoke test for the fine-tuned model.
messages = [
    {
        "role": "user",
        "content": "hey, i have tomorrow an exam that is so hard and includes a large amount of content, and i am so nervous and afraid from this exam, what should i do?",
    },
]

# Render and tokenize the conversation. add_generation_prompt must be True for
# generation so the prompt ends with the assistant header.
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True,
    tokenize = True,
    return_tensors = "pt",
)
inputs = inputs.to("cuda")

outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True)

# The decoded text contains the prompt followed by the generated reply.
decoded = tokenizer.batch_decode(outputs)
print(decoded)

['<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nhey, i have tomorrow an exam that is so hard and includes a large amount of content, and i am so nervous and afraid from this exam, what should i do?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nwell, first of all, you need to make a plan, and then you can start to study. I suggest you to make a study schedule, and then you can make a plan to finish it.<|eot_id|>']


In [None]:
# Save LoRA adapters only, together with the tokenizer files, in one local folder.
adapter_dir = "my_personal_chatbot_lora_v2"
model.save_pretrained(adapter_dir)

# save full merged model
# model.save_pretrained_merged("my_personal_chatbot_merged", tokenizer, save_method = "merged_16bit")

# Kept as the last expression so the notebook displays the written tokenizer files.
tokenizer.save_pretrained(adapter_dir)

('my_personal_chatbot_lora_v2/tokenizer_config.json',
 'my_personal_chatbot_lora_v2/special_tokens_map.json',
 'my_personal_chatbot_lora_v2/chat_template.jinja',
 'my_personal_chatbot_lora_v2/tokenizer.json')

In [None]:
from huggingface_hub import login

# Authenticate with the Hugging Face Hub; in a notebook this renders an interactive
# login widget. The upload cells further down pass token = True.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Base model used for this fine-tune (kept as a comment: a bare URL in a code cell
# is a SyntaxError and stops "Run All"):
# https://huggingface.co/unsloth/Llama-3.2-3B-Instruct-bnb-4bit

## Saving the Whole Model

In [None]:
# Destination repository on the Hugging Face Hub for the merged model.
repo_id = "michaelHenry1/Llama-3.2-3B-Instruct-bnb-4bit_finetuned"

# Merge the weights into 16-bit and upload them
# (saving to 16-bit means higher quality but a larger file).
# NOTE(review): token = True presumably reuses the credential stored by login() — confirm.
model.push_to_hub_merged(repo_id, tokenizer, save_method = "merged_16bit", token = True)

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ..._finetuned/tokenizer.json: 100%|##########| 17.2MB / 17.2MB            

Found HuggingFace hub cache directory: /root/.cache/huggingface/hub


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Checking cache directory for required files...
Cache check failed: model-00001-of-00002.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files:  50%|█████     | 1/2 [01:38<01:38, 98.78s/it]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files: 100%|██████████| 2/2 [02:40<00:00, 80.05s/it]


Note: tokenizer.model not found (this is OK for non-SentencePiece models)


Unsloth: Merging weights into 16bit:   0%|          | 0/2 [00:00<?, ?it/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...0001-of-00002.safetensors:   1%|          | 33.5MB / 4.97GB            

Unsloth: Merging weights into 16bit:  50%|█████     | 1/2 [02:50<02:50, 170.25s/it]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...0002-of-00002.safetensors:   0%|          | 4.26MB / 1.46GB            

Unsloth: Merging weights into 16bit: 100%|██████████| 2/2 [03:45<00:00, 112.99s/it]


Unsloth: Merge process complete. Saved to `/content/michaelHenry1/Llama-3.2-3B-Instruct-bnb-4bit_finetuned`


## Saving the Adapters Only

In [None]:
# Destination repository on the Hugging Face Hub for the adapters.
# NOTE: this rebinds `repo_id`, which the merged-model cell also assigns.
repo_id = "michaelHenry1/Llama-3.2-3B-Instruct-bnb-4bit_ADAPTERS"

# Upload the model first, then the tokenizer, to the same repository.
model.push_to_hub(
    repo_id,
    token = True,
)
tokenizer.push_to_hub(
    repo_id,
    token = True,
)

README.md:   0%|          | 0.00/602 [00:00<?, ?B/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...adapter_model.safetensors:   0%|          | 45.8kB / 97.3MB            

Saved model to https://huggingface.co/michaelHenry1/Llama-3.2-3B-Instruct-bnb-4bit_ADAPTERS


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...mpdrtyea55/tokenizer.json: 100%|##########| 17.2MB / 17.2MB            