# Unsloth Gemma3 Finetuning LORA


### Installation

In [None]:
%%capture
# Install Unsloth and its dependencies. `%%capture` (which must stay on the first
# line of the cell) hides the pip output.
import os
# Colab is detected by looking for any environment variable whose name contains "COLAB_".
if "COLAB_" not in "".join(os.environ.keys()):
    # Outside Colab: a plain install lets pip resolve all dependencies itself.
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # The pinned packages are installed with --no-deps so pip does not pull in
    # (and possibly replace) other versions of their dependencies.
    # NOTE(review): the order of these commands looks deliberate - keep it as is.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    !pip install transformers==4.51.3
    !pip install --no-deps unsloth

### Unsloth

`FastModel` supports loading nearly any model now! This includes Vision and Text models!

In [None]:
from unsloth import FastModel
import torch

# Reference list of pre-quantized checkpoints that can be passed as `model_name`.
# It is informational only: nothing in the visible notebook reads `fourbit_models`.
fourbit_models = [
    # 4bit dynamic quants for superior accuracy and low memory use
    "unsloth/gemma-3-1b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-4b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-12b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-27b-it-unsloth-bnb-4bit",

    # Other popular models!
    "unsloth/Llama-3.1-8B",
    "unsloth/Llama-3.2-3B",
    "unsloth/Llama-3.3-70B",
    "unsloth/mistral-7b-instruct-v0.3",
    "unsloth/Phi-4",
] # More models at https://huggingface.co/unsloth

# Load the Gemma 3 4B instruction-tuned checkpoint together with its tokenizer.
# `from_pretrained` returns a (model, tokenizer) pair.
model, tokenizer = FastModel.from_pretrained(
    model_name = "unsloth/gemma-3-4b-it",
    max_seq_length = 2048, # Choose any for long context!
    load_in_4bit = True,  # 4 bit quantization to reduce memory
    load_in_8bit = False, # [NEW!] A bit more accurate, uses 2x memory
    full_finetuning = False, # [NEW!] We have full finetuning now!
    # token = "hf_...", # use one if using gated models
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.6.1: Fast Gemma3 patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Using float16 precision for gemma3 won't work! Using float32.


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


We now add LoRA adapters so that we only need to update a small number of parameters!

In [None]:
# Wrap the loaded model with LoRA adapters; the result replaces `model`.
model = FastModel.get_peft_model(
    model,
    finetune_vision_layers     = False, # Turn off for just text!
    finetune_language_layers   = True,  # Should leave on!
    finetune_attention_modules = True,  # Attention good for GRPO
    finetune_mlp_modules       = True,  # Should leave on always!

    r = 8,           # Larger = higher accuracy, but might overfit
    lora_alpha = 8,  # Recommended alpha == r at least
    lora_dropout = 0,
    bias = "none",
    random_state = 3407, # Fixed seed so adapter initialisation is repeatable
)

Unsloth: Making `model.base_model.model.language_model.model` require gradients


<a name="Data"></a>
### Data Prep


In [None]:
import pandas as pd
from datasets import Dataset
from unsloth import FastModel
from trl import SFTTrainer, SFTConfig

# Read the training CSV into a DataFrame.
# The Python parser engine is used, and rows it cannot parse are dropped
# silently rather than raising an error.
df = pd.read_csv(
    "/content/train.csv",
    quoting=0,            # 0 is csv.QUOTE_MINIMAL, the standard quoting mode
    on_bad_lines="skip",  # malformed rows are skipped without a warning
    engine="python",
)

# Names of the binary label columns, in the order they appear in the target string.
label_cols = ["toxic","severe_toxic","obscene","threat","insult","identity_hate"]


def labels_to_str(row):
    """Turn one row's binary label columns into a single target string.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexable by every
            name in ``label_cols``.

    Returns:
        The names of the columns whose value equals 1, joined with commas in
        ``label_cols`` order, or the string "none" when no column is set.
    """
    active = []
    for column in label_cols:
        if row[column] == 1:
            active.append(column)
    if not active:
        return "none"
    return ",".join(active)
# Collapse the six binary label columns into one comma-separated target string
# per row ("none" when no label is set).
df["label_str"] = df.apply(labels_to_str, axis=1)

# The Hugging Face `Dataset` is deliberately not built in this cell: the "text"
# column that the trainer reads does not exist yet, and the notebook builds
# `dataset` from the finished DataFrame in a later cell. A Dataset created here
# would lack that column and would be overwritten before it is ever used.

In [None]:
# Sanity-check the label columns: their dtypes, the distinct values each one
# takes, and how many positive examples each label has.
label_frame = df[label_cols]
print(label_frame.dtypes)
print(label_frame.apply(pd.Series.unique))
print(label_frame.sum())


toxic            int64
severe_toxic     int64
obscene          int64
threat           int64
insult           int64
identity_hate    int64
dtype: object
   toxic  severe_toxic  obscene  threat  insult  identity_hate
0      0             0        0       0       0              0
1      1             1        1       1       1              1
toxic            15294
severe_toxic      1595
obscene           8449
threat             478
insult            7877
identity_hate     1405
dtype: int64


In [None]:
def build_training_text(comment_text, label_str):
    """Render one training example in Gemma's chat turn format.

    Each turn is closed with ``<end_of_turn>``. Without that marker the
    training text does not follow Gemma's turn structure (the user turn is
    never terminated) and the model is never shown where its answer stops.

    Args:
        comment_text: The raw comment to classify.
        label_str: The target string for that comment, e.g. "toxic,insult"
            or "none".

    Returns:
        The full prompt-plus-answer string for supervised fine-tuning. No
        ``<bos>`` is included here; the decoded training example in this
        notebook shows it being added at tokenization time.
    """
    return (
        "<start_of_turn>user\n"
        "Classify the following comment:\n"
        f"{comment_text}<end_of_turn>\n"
        "<start_of_turn>model\n"
        f"Answer: {label_str}<end_of_turn>\n"
    )


# One formatted training string per row, stored in the column the trainer reads.
df["text"] = df.apply(
    lambda r: build_training_text(r["comment_text"], r["label_str"]),
    axis=1,
)

In [None]:
# Preview the first few comments that carry at least one positive label,
# next to the target string generated for them.
has_any_label = df[label_cols].sum(axis=1) > 0
print(df[has_any_label][["comment_text", "label_str"]].head())


                                         comment_text  \
6        COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK   
12  Hey... what is it..\n@ | talk .\nWhat is it......   
16  Bye! \n\nDon't look, come or think of comming ...   
42  You are gay or antisemmitian? \n\nArchangel WH...   
43           FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!   

                             label_str  
6    toxic,severe_toxic,obscene,insult  
12                               toxic  
16                               toxic  
42  toxic,obscene,insult,identity_hate  
43                toxic,obscene,insult  


In [None]:
from datasets import Dataset
# Convert the fully prepared DataFrame (including the "text" column) into a
# Hugging Face Dataset. The index is reset first and not stored as a column.
dataset = Dataset.from_pandas(
    df.reset_index(drop=True),
    preserve_index=False,
)

In [None]:
# Inspect one complete record. Row 6 is the first row with a positive label in
# the preview printed earlier, so it shows a non-"none" target.
dataset[6]

{'id': '0002bcb3da6cb337',
 'comment_text': 'COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK',
 'toxic': 1,
 'severe_toxic': 1,
 'obscene': 1,
 'threat': 0,
 'insult': 1,
 'identity_hate': 0,
 'label_str': 'toxic,severe_toxic,obscene,insult',
 'text': '<start_of_turn>user\nClassify the following comment:\nCOCKSUCKER BEFORE YOU PISS AROUND ON MY WORK\n<start_of_turn>model\nAnswer: toxic,severe_toxic,obscene,insult'}

In [None]:
# Show only the rendered training string of row 12 (labelled "toxic" in the
# preview printed earlier) to check how a multi-line comment is formatted.
dataset[12]["text"]



<a name="Train"></a>
### Train the model
Now let's use Hugging Face TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). We run only 20 steps (`max_steps=20`) to speed things up; for a full run, set `num_train_epochs=1` and remove the `max_steps` limit.

In [None]:
# NOTE(review): this repeats the `FastModel.from_pretrained` call made earlier in
# the notebook with the same checkpoint, so the base model is loaded a second
# time and the `model` / `tokenizer` created earlier are replaced. The adapters
# are re-applied in the next cell. Consider keeping only one load + adapter pair.
model, tokenizer = FastModel.from_pretrained(
    model_name="unsloth/gemma-3-4b-it",
    max_seq_length=2048,
    load_in_4bit=True,  # 4-bit quantized weights to fit in limited GPU memory
)

==((====))==  Unsloth 2025.6.1: Fast Gemma3 patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Using float16 precision for gemma3 won't work! Using float32.


In [None]:
# Attach LoRA adapters to the freshly loaded model, using the same settings as
# the first adapter cell in this notebook.
model = FastModel.get_peft_model(
    model,
    # Which parts of the network are fine-tuned.
    finetune_vision_layers=False,
    finetune_language_layers=True,
    finetune_attention_modules=True,
    finetune_mlp_modules=True,
    # LoRA hyper-parameters.
    r=8,
    lora_alpha=8,
    lora_dropout=0.0,
    bias="none",
    random_state=3407,
)

Unsloth: Making `model.base_model.model.language_model.model` require gradients


In [None]:
from trl import SFTTrainer, SFTConfig
# Training hyper-parameters. With a per-device batch of 8 and 4 accumulation
# steps, each optimizer step covers 32 examples on one GPU, so 20 steps see
# only 640 of the training examples - this is a quick smoke run.
training_args = SFTConfig(
    dataset_text_field="text",      # column holding the formatted prompt + answer
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    max_steps=20,
    learning_rate=2e-4,
    optim="adamw_8bit",
    weight_decay=0.01,
    logging_steps=5,                # log the training loss every 5 steps
    report_to="none",               # no external experiment tracker
)

# Supervised fine-tuning trainer over the prepared dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    args=training_args,
)

Unsloth: Switching to float32 training since model cannot work with float16


Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/159571 [00:00<?, ? examples/s]

In [None]:
# Decode the token ids of training example 100 back into text, to verify what
# the trainer actually tokenized (including any special tokens that were added).
tokenizer.decode(trainer.train_dataset[100]["input_ids"])

'<bos><start_of_turn>user\nClassify the following comment:\nHowever, the Moonlite edit noted by golden daph was me (on optus ...)  Wake up wikkis.  So funny\n<start_of_turn>model\nAnswer: none'

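The decoded example above is the full training sequence, prompt and answer together. Note that no response-only masking is applied in the cells shown here, so the example is not masked down to just the answer. The next cell prints the current GPU memory stats before training starts: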

In [None]:
# @title Show current memory stats
# Report the GPU model, its total memory, and the peak memory reserved so far,
# as a baseline to compare against after training.
BYTES_PER_GIB = 1024 ** 3

gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / BYTES_PER_GIB, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GIB, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.741 GB.
4.283 GB of memory reserved.


In [None]:
# Run the fine-tuning loop configured above. The value returned by `train()` is
# kept in `trainer_stats` so it can be inspected afterwards.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 159,571 | Num Epochs = 1 | Total steps = 20
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 4 x 1) = 32
 "-____-"     Trainable parameters = 14,901,248/4,000,000,000 (0.37% trained)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
5,6.078
10,4.0278
15,3.6299
20,3.1121


<a name="Inference"></a>
### Inference


In [None]:
from unsloth.chat_templates import get_chat_template

# Attach the Gemma 3 chat template so `apply_chat_template` renders the
# <start_of_turn>/<end_of_turn> markers.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "gemma-3",
)

# Prompt the model the way it was fine-tuned: a single user turn that starts
# with "Classify the following comment:". The previous version used a separate
# system instruction that never appeared in the training text, so the
# fine-tuned behaviour was being queried with a prompt it had not been trained on.
comment = ":: Wallamoose was changing the cited material to say things the original source did not say. In response to his objections, I modified the article as we went along. I was not just reverting him. I repeatedly asked him to use the talk page. I've been trying to add to the article for a long time.  It's so thin on content. This is wrong."
messages = [
    {
        "role": "user",
        "content": [{
            "type" : "text",
            "text" : "Classify the following comment:\n" + comment,
        }],
    },
]

# Render the conversation to a string; `tokenize = False` makes it explicit that
# tokenization happens in the next step.
text = tokenizer.apply_chat_template(
    messages,
    tokenize = False,
    add_generation_prompt = True, # Must add for generation
)
outputs = model.generate(
    **tokenizer([text], return_tensors = "pt").to("cuda"),
    max_new_tokens = 64,
    temperature = 1.0, top_p = 0.95, top_k = 64,
)
# Decode prompt + completion; the model's answer follows the final
# "<start_of_turn>model" marker.
tokenizer.batch_decode(outputs)

['<bos><start_of_turn>user\nYou are a toxicity detector, identify the types of toxicity out of this ["toxic","severe_toxic","obscene","threat","insult","identity_hate"] and just return one option out of the list\n\n:: Wallamoose was changing the cited material to say things the original source did not say. In response to his objections, I modified the article as we went along. I was not just reverting him. I repeatedly asked him to use the talk page. I\'ve been trying to add to the article for a long time.  It\'s so thin on content. This is wrong.<end_of_turn>\n<start_of_turn>model\nsevere_toxic\n<end_of_turn>']

In [None]:
import os

# Optional: save the adapters and tokenizer locally instead of uploading them.
# model.save_pretrained("gemma-3-toxic")
# tokenizer.save_pretrained("gemma-3-toxic")

# Never paste an access token into the notebook. Read it from the HF_TOKEN
# environment variable; when that is unset or empty, `None` is passed so the
# Hugging Face client can fall back to credentials stored by a previous login.
hf_token = os.environ.get("HF_TOKEN") or None
model.push_to_hub("Subhrato20/gemma-3-toxic-v2", token = hf_token)
tokenizer.push_to_hub("Subhrato20/gemma-3-toxic-v2", token = hf_token)

README.md:   0%|          | 0.00/601 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/59.7M [00:00<?, ?B/s]

Saved model to https://huggingface.co/Subhrato20/gemma-3-toxic-v2


  0%|          | 0/2 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]