# Gemma Fine-tuning with Unsloth

Fine-tune the Gemma 2 2B base model (`unsloth/gemma-2-2b`) with LoRA to create GemmaCare0625-LoRA.

In [None]:
import os

# Set HF token if needed
# os.environ["HF_TOKEN"] = "your_token_here"

## Install Dependencies

In [None]:
%pip install unsloth --upgrade --no-cache-dir
import torch
if torch.cuda.get_device_capability()[0] >= 8:
    %pip install --no-deps packaging ninja einops "flash-attn>=2.6.3"

## Load Model

In [None]:
from unsloth import FastLanguageModel
import torch

# Maximum sequence length, reused later when the trainer is built.
max_seq_length = 2048

# Fetch the Gemma 2 2B base checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/gemma-2-2b",
    max_seq_length=max_seq_length,
    dtype=None,  # None lets the loader auto-detect the dtype.
    load_in_4bit=True,  # 4-bit quantization keeps memory usage down.
)

## Load Local Dataset

In [None]:
from datasets import load_dataset
import os

# The training set is a JSON Lines file located at ../data/train.jsonl,
# resolved relative to the current working directory.
dataset_path = os.path.join("..", "data", "train.jsonl")
dataset = load_dataset(
    "json",
    data_files=dataset_path,
    split="train",
)

In [None]:
# Peek at the first record to confirm the data loaded as expected.
first_record = dataset[0]
print(first_record)

## Prompt Template

In [None]:
# Alpaca-style prompt with three positional "{}" slots that str.format()
# fills in order: the instruction, the input (extra context), and the
# response. The text is part of the training data, so keep it unchanged
# between training and inference.
alpaca_prompt_template = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence marker taken from the tokenizer; the formatting function
# appends it to every rendered training example.
EOS_TOKEN = tokenizer.eos_token # EOS_TOKEN is necessary.

## Format Dataset

In [None]:
def formatting_prompts_func(examples):
    """Render a batch of records into Alpaca-formatted training strings.

    Intended for ``dataset.map(..., batched=True)``, where each column
    arrives as a sequence of values.

    Args:
        examples: A batch in column form: a mapping whose "instruction",
            "input" and "output" entries are parallel sequences of strings.

    Returns:
        A dict with a single "text" column holding one formatted prompt per
        record, each terminated by ``EOS_TOKEN``.

    Raises:
        KeyError: If any of the three expected columns is missing.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    # EOS_TOKEN must close every example; otherwise generation from the
    # fine-tuned model will go on forever.
    # The loop names deliberately avoid shadowing the builtin `input`.
    texts = [
        alpaca_prompt_template.format(instruction, context, response) + EOS_TOKEN
        for instruction, context, response in rows
    ]
    return {"text": texts}

# Run the formatter over the dataset in batches; it returns a "text" column
# containing the rendered prompt for each record.
dataset = dataset.map(formatting_prompts_func, batched=True)

## Initial Inference (Before Training)

Let's test the model before training to see baseline performance.

In [None]:
# Baseline: generate from the untouched base model so the result can be
# compared with the fine-tuned model later on.
from transformers import TextStreamer

# Switch Unsloth into its inference mode for the baseline generation.
FastLanguageModel.for_inference(model)

# Same Alpaca prompt layout as training, with the response slot left empty
# so the model has to produce it.
test_prompt = alpaca_prompt_template.format(
    "What is artificial intelligence?",  # instruction
    "",  # input (no extra context)
    "",  # response (left for the model to generate)
)

inputs = tokenizer([test_prompt], return_tensors="pt").to("cuda")
text_streamer = TextStreamer(tokenizer)

print("=== BEFORE TRAINING ===")
with torch.no_grad():
    _ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=100)

print("\n" + "="*50 + "\n")

# for_inference() leaves the model configured for generation. Restore
# training mode now, because the following cells attach LoRA adapters to
# this same object and train it.
FastLanguageModel.for_training(model)

## LoRA Configuration

In [None]:
# Wrap the loaded model with LoRA adapters on the listed projection layers.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank (attention dimension).
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=16,  # Scaling parameter for the LoRA update.
    lora_dropout=0,  # Any value is supported, but 0 is the optimized setting.
    bias="none",  # Any value is supported, but "none" is the optimized setting.
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context.
    random_state=3407,
    use_rslora=False,  # Rank-stabilized LoRA is turned off.
    loftq_config=None,  # LoftQ (LoRA-fine-tuning-aware quantization) is not used.
)

## Training Configuration

In [None]:
from transformers import TrainingArguments

# Query bf16 support once; exactly one of bf16 / fp16 ends up enabled.
use_bf16 = torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # 2 x 4 = 8 samples per optimizer step per device.
    warmup_steps=5,
    # num_train_epochs=1,  # Set this for 1 full training run.
    max_steps=60,
    learning_rate=2e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=42,
    output_dir="outputs",
    report_to="none",
)

## Train Model

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Supervised fine-tuning on the pre-rendered "text" column of the dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    # Packing is left off here. Turning it on (packing=True) packs several
    # short examples into one sequence, which can make training up to 5x
    # faster for datasets of short sequences.
    packing=False,
    args=training_args,
)

Start training:

In [None]:
# Run the fine-tuning loop; the value returned by train() is stored in
# trainer_stats.
trainer_stats = trainer.train()

## Save Model

In [None]:
# Directory that receives the fine-tuned model and tokenizer files.
new_model = "gemma_ft_unsloth"

# Write the model first, then the tokenizer, into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained(new_model)

In [None]:
# Push the trained model to Hub (optional)
# model.push_to_hub("your_hf_username/gemma_ft_unsloth")

## Post-Training Inference

In [None]:
# Reload the fine-tuned model from the directory it was saved to, then
# switch it to inference mode.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=new_model,  # Name/path of the fine-tuned model.
    max_seq_length=max_seq_length,
    dtype=None,
    load_in_4bit=True,
)
FastLanguageModel.for_inference(model)

Test the fine-tuned model:

In [None]:
from transformers import TextStreamer

# Reuse the exact baseline prompt so the before/after outputs are comparable.
test_prompt = alpaca_prompt_template.format(
    "What is artificial intelligence?",
    "",
    "",
)

# Stream tokens to stdout as they are generated.
text_streamer = TextStreamer(tokenizer)
inputs = tokenizer([test_prompt], return_tensors="pt").to("cuda")

print("=== AFTER TRAINING ===")
_ = model.generate(
    **inputs,
    streamer=text_streamer,
    max_new_tokens=100,
)