# Imports

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:
from datasets import load_dataset, concatenate_datasets
from unsloth.chat_templates import get_chat_template
from unsloth import FastModel

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [3]:
model, tokenizer = FastModel.from_pretrained(
    model_name = "unsloth/gemma-3n-E4B-it", # Or "unsloth/gemma-3n-E2B-it"
    dtype = None, # None for auto detection
    max_seq_length = 2048, # Choose any for long context!
    load_in_4bit = True,  # 4 bit quantization to reduce memory
    full_finetuning = False, # [NEW!] We have full finetuning now!
    # token = "hf_...", # use one if using gated models
)

==((====))==  Unsloth 2025.7.8: Fast Gemma3N patching. Transformers: 4.54.0.dev0.
   \\   /|    NVIDIA H100 80GB HBM3. Num GPUs = 1. Max memory: 79.179 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 9.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Gemma3N does not support SDPA - switching to eager!


Fetching 3 files: 100%|██████████| 3/3 [00:03<00:00,  1.09s/it]
Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.16it/s]


# Function process

In [4]:
def enem_chat_template(x, source="enem", metadata_col=None):
	label_to_id = {
		"A": 0,
		"B": 1,
		"C": 2,
		"D": 3,
  		"E": 4
	}
	id_to_label = {v: k for k, v in label_to_id.items()}
	alternatives = [f"\n{id_to_label[i]} - {a}" for i,a in enumerate(x["alternatives"])]
 	
	human = x["question"] + "\n" + "\n".join(x["description"]) + "\n\n"  + "alternatives:" + "".join(alternatives) 
	assistant = f"{x["label"]} - {x["alternatives"][label_to_id[x["label"]]]}" 
	
	return {"conversations": [{"role":"human", "content": human},
							  {"role":"assistant", "content": assistant}
							],
			"source": source,
			"metadata": x[metadata_col] if metadata_col else source
	}
	
def qa_chat_template(x, question_col="question", answer_col="answer", source="piaui_qa", metadata_col=None):
	human = x[question_col]
	assistant = x[answer_col]
	
	return {"conversations": [{"role":"human", "content": human},
							{"role":"assistant", "content": assistant}
							],
			"source": source,
			"metadata": x[metadata_col] if metadata_col else source
	}
  
def portugues_mcq_chat_template(x, source="portugues_mcq", metadata_col=None):
	# Ensure the order of alternatives is A-E
	label_to_id = {"A": 0, "B": 1, "C": 2, "D": 3, "E": 4}
	id_to_label = {v: k for k, v in label_to_id.items()}

	# Extract alternatives in the right order
	alternatives_list = [x["options"][id_to_label[i]] for i in range(5)]
	alternatives = [f"\n{id_to_label[i]} - {alt}" for i, alt in enumerate(alternatives_list)]

	# Build prompt (no description in this dataset)
	human = x["question"] + "\n\n" + "alternatives:" + "".join(alternatives)
	assistant = f"{x['answer']} - {x['options'][x['answer']]}"

	return {
		"conversations": [
			{"role": "human", "content": human},
			{"role": "assistant", "content": assistant}
		],
		"source": source,
		"metadata": x[metadata_col] if metadata_col else source 
	}
  
def qa_chat_template2(x, source="qa_pairs", metadata_col=None):
	pairs = x['qa_pairs']
	conversations = []
	for pair in pairs:
		human = pair['question']
		assistant = pair['answer']
		conversations.append({"role": "human", "content": human})
		conversations.append({"role": "assistant", "content": assistant})
	return {"conversations": conversations,
			"source": source,
			"metadata": x[metadata_col] if metadata_col else source 
	}

<a name="Data"></a>
### Data Prep

In [5]:
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "gemma-3",
)

In [6]:

enem2022 = load_dataset("maritaca-ai/enem", "2022", split = "train")
enem2023 = load_dataset("maritaca-ai/enem", "2023", split = "train")
enem2024 = load_dataset("maritaca-ai/enem", "2024", split = "train")
enem = concatenate_datasets([enem2022, enem2023, enem2024])
enem = enem.filter(lambda x: x["label"].lower() != "anulado")
enem = enem.map(lambda x: enem_chat_template(x, source="enem", metadata_col="exam"), remove_columns=enem.column_names)


Map: 100%|██████████| 537/537 [00:00<00:00, 7180.08 examples/s]


In [7]:
datasets_l = []
datasets_l.append(enem)

In [8]:

piaui_qa = load_dataset("institutogaia/piaui_qa", split="train")
piaui_qa = piaui_qa.map(lambda x: qa_chat_template(x, source="piaui_qa"), remove_columns=piaui_qa.column_names)

Generating train split: 100%|██████████| 997/997 [00:00<00:00, 36926.65 examples/s]
Generating validation split: 100%|██████████| 11/11 [00:00<00:00, 3887.21 examples/s]
Map: 100%|██████████| 997/997 [00:00<00:00, 23354.26 examples/s]


In [9]:
datasets_l.append(piaui_qa)

We get the first 3000 rows of the dataset

In [10]:
portugues_mcq = load_dataset("institutogaia/portugues_mcq", split="train")
portugues_mcq = portugues_mcq.map(
    lambda x: portugues_mcq_chat_template(x, source="portugues_mcq", metadata_col="book"),
    remove_columns=portugues_mcq.column_names
)


Generating train split: 100%|██████████| 671/671 [00:00<00:00, 38992.73 examples/s]
Map: 100%|██████████| 671/671 [00:00<00:00, 11897.96 examples/s]


In [11]:
datasets_l.append(portugues_mcq)

In [12]:
textbooks_qa  = load_dataset("institutogaia/textbooks_qa", split="train")
textbooks_qa = textbooks_qa.map(
	lambda x: qa_chat_template(x, question_col="question", answer_col="answer", source="textbooks_qa", metadata_col="book"),
		remove_columns=textbooks_qa.column_names
)

Downloading data: 100%|██████████| 49/49 [00:01<00:00, 44.75files/s]
Generating train split: 124811 examples [00:00, 189080.16 examples/s]
Map: 100%|██████████| 124811/124811 [00:05<00:00, 23386.13 examples/s]


In [13]:
datasets_l.append(textbooks_qa)

In [14]:
wikipedia_dumps_pt_qa = load_dataset("institutogaia/wikipedia_dumps_pt_qa", split="train")
wikipedia_dumps_pt_qa = wikipedia_dumps_pt_qa.map(
	lambda x: qa_chat_template2(x,
                             	source="wikipedia_dumps_pt_qa",
								metadata_col="article_title"),
	remove_columns=wikipedia_dumps_pt_qa.column_names,
)

Generating train split: 100%|██████████| 9890/9890 [00:00<00:00, 68034.97 examples/s]
Map: 100%|██████████| 9890/9890 [00:00<00:00, 14060.77 examples/s]


In [15]:
datasets_l.append(wikipedia_dumps_pt_qa)

In [16]:
concatenated_dataset = concatenate_datasets(datasets_l)
concatenated_dataset 

Dataset({
    features: ['conversations', 'source', 'metadata'],
    num_rows: 136906
})

In [17]:
concatenated_dataset

Dataset({
    features: ['conversations', 'source', 'metadata'],
    num_rows: 136906
})

In [32]:
enem = concatenated_dataset.filter(lambda x: x["source"] == "enem")
filtered_dt = concatenated_dataset.filter(lambda x: not (x["source"] == "enem" ))

Filter: 100%|██████████| 136906/136906 [00:01<00:00, 70698.05 examples/s]
Filter: 100%|██████████| 136906/136906 [00:01<00:00, 69280.15 examples/s]


In [33]:
concatenated_dataset

Dataset({
    features: ['conversations', 'source', 'metadata'],
    num_rows: 136906
})

In [35]:
filtered_dt

Dataset({
    features: ['conversations', 'source', 'metadata'],
    num_rows: 136369
})

In [36]:
dt = filtered_dt.train_test_split(test_size=0.2, seed=42)

In [41]:
dt

DatasetDict({
    train: Dataset({
        features: ['conversations', 'source', 'metadata'],
        num_rows: 109095
    })
    test: Dataset({
        features: ['conversations', 'source', 'metadata'],
        num_rows: 27274
    })
})

In [40]:
enem_train = enem.filter(lambda x: x["metadata"] != "2024")
enem2024 = enem.filter(lambda x: x["metadata"] == "2024")

In [42]:
dt["train"] = concatenate_datasets([dt["train"], enem_train])
dt["test"] = concatenate_datasets([dt["test"], enem2024])

In [43]:
dt

DatasetDict({
    train: Dataset({
        features: ['conversations', 'source', 'metadata'],
        num_rows: 109453
    })
    test: Dataset({
        features: ['conversations', 'source', 'metadata'],
        num_rows: 27453
    })
})

In [44]:
dt.save_to_disk("gemma_train_test_split")

Saving the dataset (1/1 shards): 100%|██████████| 109453/109453 [00:00<00:00, 147961.42 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 27453/27453 [00:00<00:00, 143039.68 examples/s]


We now use `standardize_data_formats` to try converting datasets to the correct format for finetuning purposes!

In [28]:
from unsloth.chat_templates import standardize_data_formats
dataset = standardize_data_formats(concatenated_dataset)

Let's see how row 100 looks like!

In [29]:
dataset[0]

{'conversations': [{'content': '[[placeholder]]\nNessa tirinha, o comportamento da mulher expressa\nDescrição da imagem: Tirinha apresentada em quatro quadrinhos. O primeiro quadrinho apresenta um castelo localizado no alto de uma colina, cercado por floresta e montanhas. O sol está baixo. Duas pessoas conversam dentro dele. Uma delas diz: “Now that you are my bride, you will never leave this castle!”. A outra exclama: “Wow! Your library is amazing!”. No segundo quadrinho, apresentados de forma estilizada, estão um homem grandalhão usando chapéu e segurando um bastão, e uma mulher com os cabelos presos em rabo de cavalo e usando um vestido. Ele fala: “Beyond the castle is a high wall with no gate, and beyond that is a deep, dark forest with no path”. A mulher, que está retirando um livro de uma estante alta e repleta de livros, comenta: “I suppose it’s my library too, now we’re married”. No terceiro quadrinho, o homem ergue o dedo indicador e diz: “The forest is crawling with ravenous 

We now have to apply the chat template for `Gemma-3` onto the conversations, and save it to `text`. We remove the `<bos>` token using removeprefix(`'<bos>'`) since we're finetuning. The Processor will add this token before training and the model expects only one.

In [30]:
def formatting_prompts_func(examples):
   convos = examples["conversations"]
   texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False).removeprefix('<bos>') for convo in convos]
   return { "text" : texts, }

dataset = dataset.map(formatting_prompts_func, batched = True)

In [31]:
dataset[0]['text']

'<start_of_turn>user\n[[placeholder]]\nNessa tirinha, o comportamento da mulher expressa\nDescrição da imagem: Tirinha apresentada em quatro quadrinhos. O primeiro quadrinho apresenta um castelo localizado no alto de uma colina, cercado por floresta e montanhas. O sol está baixo. Duas pessoas conversam dentro dele. Uma delas diz: “Now that you are my bride, you will never leave this castle!”. A outra exclama: “Wow! Your library is amazing!”. No segundo quadrinho, apresentados de forma estilizada, estão um homem grandalhão usando chapéu e segurando um bastão, e uma mulher com os cabelos presos em rabo de cavalo e usando um vestido. Ele fala: “Beyond the castle is a high wall with no gate, and beyond that is a deep, dark forest with no path”. A mulher, que está retirando um livro de uma estante alta e repleta de livros, comenta: “I suppose it’s my library too, now we’re married”. No terceiro quadrinho, o homem ergue o dedo indicador e diz: “The forest is crawling with ravenous wolves, ma

Let's see how the chat template did! Notice there is no `<bos>` token as the processor tokenizer will be adding one.

<a name="Train"></a>
### Train the model
Now let's use Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). We do 60 steps to speed things up, but you can set `num_train_epochs=1` for a full run, and turn off `max_steps=None`.

In [32]:
model = FastModel.get_peft_model(
    model,
    finetune_vision_layers     = False, # Turn off for just text!
    finetune_language_layers   = True,  # Should leave on!
    finetune_attention_modules = True,  # Attention good for GRPO
    finetune_mlp_modules       = True,  # Should leave on always!

    r = 8,           # Larger = higher accuracy, but might overfit
    lora_alpha = 8,  # Recommended alpha == r at least
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
)

Unsloth: Making `model.base_model.model.model.language_model` require gradients


In [33]:
from trl import SFTTrainer, SFTConfig
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    eval_dataset = None, # Can set up evaluation!
    args = SFTConfig(
        dataset_text_field = "text",
        per_device_train_batch_size = 8,
        gradient_accumulation_steps = 1, # Use GA to mimic batch size!
        warmup_steps = 10000,
         num_train_epochs = 1, # Set this for 1 full training run.
        #max_steps = 60,
        learning_rate = 2e-5, # Reduce to 2e-5 for long training runs
        logging_steps = 1,
        optim = "paged_adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        report_to = "none", # Use this for WandB etc
    ),
)

[2025-07-24 16:31:42,982] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status


[2025-07-24 16:31:43,436] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False


We also use Unsloth's `train_on_completions` method to only train on the assistant outputs and ignore the loss on the user's inputs. This helps increase accuracy of finetunes!

In [34]:
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<start_of_turn>user\n",
    response_part = "<start_of_turn>model\n",
)

Let's verify masking the instruction part is done! Let's print the 100th row again.  Notice how the sample only has a single `<bos>` as expected!

In [35]:
tokenizer.decode(trainer.train_dataset[100]["input_ids"])

'<bos><start_of_turn>user\n## Técnica modifica proteína do veneno de cascavel e permite criar fármaco que modula a coagulação sanguínea\nO veneno da cascavel pode causar hemorragia com risco de morte a quem é picado pela serpente. No entanto, pesquisadores do Brasil e da Bélgica desenvolveram uma molécula de interesse farmacêutico, a PEG-collineína-1, a partir de uma proteína encontrada no veneno dessa cobra, capaz de modular a coagulação sanguínea. Embora a técnica não seja nova, foi a primeira vez que o método foi usado a partir de uma toxina animal na sua forma recombinante, ou seja, produzida em laboratório por um fungo geneticamente modificado.\nEsse novo medicamento apresenta potencial aplicação para\n\n\nalternatives:\nA - impedir a formação de trombos, típicos em alguns casos de acidente vascular cerebral.\nB - tratar consequências da anemia profunda, em razão da perda de grande volume de sangue.\nC - evitar a manifestação de urticárias, comumente relacionadas a processos alérg

Now let's print the masked out example - you should see only the answer is present:

In [36]:
tokenizer.decode([tokenizer.pad_token_id if x == -100 else x for x in trainer.train_dataset[100]["labels"]]).replace(tokenizer.pad_token, " ")

'                                                                                                                                                                                                                                                                                 A - impedir a formação de trombos, típicos em alguns casos de acidente vascular cerebral.<end_of_turn>\n'

In [38]:
# @title Show current memory stats
import gc
import torch
gc.collect()
torch.cuda.empty_cache()
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA H100 80GB HBM3. Max memory = 79.179 GB.
9.305 GB of memory reserved.


# Let's train the model!

To resume a training run, set `trainer.train(resume_from_checkpoint = True)`

In [39]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 136,727 | Num Epochs = 1 | Total steps = 17,091
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 1 x 1) = 8
 "-____-"     Trainable parameters = 19,210,240 of 7,869,188,432 (0.24% trained)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
1,9.0699
2,10.0203
3,8.7424
4,8.1151
5,7.0164
6,7.0596
7,6.1895
8,5.4524
9,5.6373
10,4.8405


Unsloth: Will smartly offload gradients to save VRAM!


KeyboardInterrupt: 

In [16]:
# @title Show final memory and time stats
gc.collect()
torch.cuda.empty_cache()
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

107.6149 seconds used for training.
1.79 minutes used for training.
Peak reserved memory = 12.355 GB.
Peak reserved memory for training = 3.05 GB.
Peak reserved memory % of max memory = 15.604 %.
Peak reserved memory for training % of max memory = 3.852 %.


<a name="Inference"></a>
### Inference
Let's run the model via Unsloth native inference! According to the `Gemma-3` team, the recommended settings for inference are `temperature = 1.0, top_p = 0.95, top_k = 64`

In [None]:
from unsloth.chat_templates import get_chat_template
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "gemma-3",
)
messages = [{
    "role": "user",
    "content": [{
        "type" : "text",
        "text" : "Continue the sequence: 1, 1, 2, 3, 5, 8,",
    }]
}]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
    tokenize = True,
    return_dict = True,
).to("cuda")
outputs = model.generate(
    **inputs,
    max_new_tokens = 64, # Increase for longer outputs!
    # Recommended Gemma-3 settings!
    temperature = 1.0, top_p = 0.95, top_k = 64,
)


In [36]:
print(tokenizer.batch_decode(outputs)[0])

<bos><start_of_turn>user
Continue the sequence: 1, 1, 2, 3, 5, 8,<end_of_turn>
<start_of_turn>model
1715648717178717546171711758771774717175671717117171997171799000


In [28]:
tokenizer.batch_decode(outputs)

['<bos><start_of_turn>user\nContinue the sequence: 1, 1, 2, 3, 5, 8,<end_of_turn>\n<start_of_turn>model\nTheR<end_of_turn>']

 You can also use a `TextStreamer` for continuous inference - so you can see the generation token by token, instead of waiting the whole time!

In [None]:
messages = [{
    "role": "user",
    "content": [{"type" : "text", "text" : "Why is the sky blue?",}]
}]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
    tokenize = True,
    return_dict = True,
).to("cuda")

from transformers import TextStreamer
_ = model.generate(
    **inputs,
    max_new_tokens = 64, # Increase for longer outputs!
    # Recommended Gemma-3 settings!
    temperature = 1.0, top_p = 0.95, top_k = 64,
    streamer = TextStreamer(tokenizer, skip_prompt = True),
    
)

<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [24]:
model.save_pretrained("gemma-3n")  # Local saving
tokenizer.save_pretrained("gemma-3n")
# model.push_to_hub("HF_ACCOUNT/gemma-3", token = "...") # Online saving
# tokenizer.push_to_hub("HF_ACCOUNT/gemma-3", token = "...") # Online saving

['gemma-3n/processor_config.json']

Now if you want to load the LoRA adapters we just saved for inference, set `False` to `True`:

In [None]:
if False:
    from unsloth import FastModel
    model, tokenizer = FastModel.from_pretrained(
        model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = 2048,
        load_in_4bit = True,
    )

messages = [{
    "role": "user",
    "content": [{"type" : "text", "text" : "What is Gemma-3N?",}]
}]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
    tokenize = True,
    return_dict = True,
).to("cuda")

from transformers import TextStreamer
_ = model.generate(
    **inputs,
    max_new_tokens = 256, # Increase for longer outputs!
    # Recommended Gemma-3 settings!
    temperature = 1.0, top_p = 0.95, top_k = 64,
    streamer = TextStreamer(tokenizer, skip_prompt = True),
)

### Saving to float16 for VLLM

We also support saving to `float16` directly for deployment! We save it in the folder `gemma-3N-finetune`. Set `if False` to `if True` to let it run!

In [26]:
if False: # Change to True to save finetune!
    model.save_pretrained_merged("gemma-3N-finetune", tokenizer)

If you want to upload / push to your Hugging Face account, set `if False` to `if True` and add your Hugging Face token and upload location!

In [27]:
if False: # Change to True to upload finetune
    model.push_to_hub_merged(
        "HF_ACCOUNT/gemma-3N-finetune", tokenizer,
        token = "hf_..."
    )

### GGUF / llama.cpp Conversion
To save to `GGUF` / `llama.cpp`, we support it natively now for all models! For now, you can convert easily to `Q8_0, F16 or BF16` precision. `Q4_K_M` for 4bit will come later!

In [31]:
if False: # Change to True to save to GGUF
    model.save_pretrained_gguf(
        "gemma-3N-finetune",
        quantization_type = "Q8_0", # For now only Q8_0, BF16, F16 supported
    )

Likewise, if you want to instead push to GGUF to your Hugging Face account, set `if False` to `if True` and add your Hugging Face token and upload location!

In [32]:
if False: # Change to True to upload GGUF
    model.push_to_hub_gguf(
        "gemma-3N-finetune",
        quantization_type = "Q8_0", # Only Q8_0, BF16, F16 supported
        repo_id = "HF_ACCOUNT/gemma-3N-finetune-gguf",
        token = "hf_...",
    )

Now, use the `gemma-3N-finetune.gguf` file or `gemma-3N-finetune-Q4_K_M.gguf` file in llama.cpp or a UI based system like Jan or Open WebUI. You can install Jan [here](https://github.com/janhq/jan) and Open WebUI [here](https://github.com/open-webui/open-webui)

And we're done! If you have any questions on Unsloth, we have a [Discord](https://discord.gg/unsloth) channel! If you find any bugs or want to keep updated with the latest LLM stuff, or need help, join projects etc, feel free to join our Discord!

Some other links:
1. Train your own reasoning model - Llama GRPO notebook [Free Colab](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.1_(8B)-GRPO.ipynb)
2. Saving finetunes to Ollama. [Free notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_(8B)-Ollama.ipynb)
3. Llama 3.2 Vision finetuning - Radiography use case. [Free Colab](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_(11B)-Vision.ipynb)
6. See notebooks for DPO, ORPO, Continued pretraining, conversational finetuning and more on our [documentation](https://docs.unsloth.ai/get-started/unsloth-notebooks)!

<div class="align-center">
  <a href="https://unsloth.ai"><img src="https://github.com/unslothai/unsloth/raw/main/images/unsloth%20new%20logo.png" width="115"></a>
  <a href="https://discord.gg/unsloth"><img src="https://github.com/unslothai/unsloth/raw/main/images/Discord.png" width="145"></a>
  <a href="https://docs.unsloth.ai/"><img src="https://github.com/unslothai/unsloth/blob/main/images/documentation%20green%20button.png?raw=true" width="125"></a>

  Join Discord if you need help + ⭐️ <i>Star us on <a href="https://github.com/unslothai/unsloth">Github</a> </i> ⭐️
</div>
