<a href="https://colab.research.google.com/github/Sk16er/Fine-tune-llm/blob/main/fine-tune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# @title 1. Install Unsloth, Gradio, and GGUF Tools
!pip install -q "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install -q --no-deps "trl<0.9.0" peft accelerate bitsandbytes
!pip install -q gradio
!pip install -q huggingface_hub
!pip install -q llama-cpp-python[server]

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m506.8/506.8 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m64.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m272.2/272.2 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.6/132.6 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m10.9 MB/s[0m eta [36m0

Upload Your Dataset
> After running the cell below, drag & drop your `identity_dataset.jsonl` file into the upload widget.

In [None]:
# @title 2. Upload `identity_dataset.jsonl`
import os

from google.colab import files

# The dataset-loading cell reads the file from this exact path.
IDENTITY_FILE = "identity_dataset.jsonl"

print("Please upload your `identity_dataset.jsonl` file now...")
uploaded = files.upload()  # maps each uploaded file name to its bytes

# Fail here, with a clear message, instead of later when the dataset is loaded.
if not os.path.exists(IDENTITY_FILE):
    if len(uploaded) == 1:
        # A single file under another name (e.g. "identity_dataset (1).jsonl"):
        # store a copy under the name the rest of the notebook expects.
        ((uploaded_name, payload),) = uploaded.items()
        with open(IDENTITY_FILE, "wb") as fh:
            fh.write(payload)
        print(f"Saved `{uploaded_name}` as `{IDENTITY_FILE}`")
    else:
        raise FileNotFoundError(
            f"`{IDENTITY_FILE}` was not uploaded (got: {sorted(uploaded) or 'nothing'}). "
            "Re-run this cell and upload the dataset file."
        )

Please upload your `identity_dataset.jsonl` file now...


In [None]:
# @title 3. Load Gemma-3 + LoRA
import torch
from unsloth import FastModel

# Base checkpoint and loading options (MAX_SEQ_LEN is reused by the training cell).
MODEL_NAME = "unsloth/gemma-3-270m-it"
MAX_SEQ_LEN = 2048
LOAD_IN_4BIT = True

# Names of the layers that receive LoRA adapters.
LORA_TARGET_MODULES = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Load the base model and its tokenizer; full fine-tuning is disabled because
# only the LoRA adapters are trained.
base_model, tokenizer = FastModel.from_pretrained(
    MODEL_NAME,
    max_seq_length=MAX_SEQ_LEN,
    load_in_4bit=LOAD_IN_4BIT,
    full_finetuning=False,
)

# Wrap the base model with rank-16 LoRA adapters.
model = FastModel.get_peft_model(
    base_model,
    r=16,
    target_modules=LORA_TARGET_MODULES,
)

print("Model loaded with LoRA")

In [None]:
# @title 4. Load Identity + FineTome-100k
from datasets import load_dataset, concatenate_datasets
from unsloth.chat_templates import standardize_sharegpt

# Cap on general-purpose rows mixed in with the identity data. The trainer shuffles
# the combined dataset and a 60-step run only draws a few hundred rows, so with the
# full ~100k-row base set almost no identity rows would ever be sampled.
# Set to None to use the whole base dataset.
MAX_BASE_EXAMPLES = 2_000
SUBSAMPLE_SEED = 42

# Load your identity file
identity_ds = load_dataset("json", data_files="identity_dataset.jsonl", split="train")
identity_ds = standardize_sharegpt(identity_ds)
if "conversations" not in identity_ds.column_names:
    raise ValueError(
        "identity_dataset.jsonl must contain a `conversations` field; "
        f"found columns: {identity_ds.column_names}"
    )
if len(identity_ds) == 0:
    raise ValueError("identity_dataset.jsonl contains no examples")

# Load high-quality base data (subsampled before standardizing to save time)
base_ds = load_dataset("mlabonne/FineTome-100k", split="train")
if MAX_BASE_EXAMPLES is not None and len(base_ds) > MAX_BASE_EXAMPLES:
    base_ds = base_ds.shuffle(seed=SUBSAMPLE_SEED).select(range(MAX_BASE_EXAMPLES))
base_ds = standardize_sharegpt(base_ds)

# Combine on the one column both sources share; extra metadata columns in either
# source are dropped so the two schemas line up.
dataset = concatenate_datasets([
    identity_ds.select_columns(["conversations"]),
    base_ds.select_columns(["conversations"]),
])

print(f"Total examples: {len(dataset)} (Identity: {len(identity_ds)} + Base: {len(base_ds)})")

In [None]:
# @title 5. Apply Gemma-3 Chat Template
def format_chat(ex):
    """Render every conversation in a batch into one training string.

    Args:
        ex: A batch as passed by `dataset.map(..., batched=True)`;
            `ex["conversations"]` holds one conversation per row.

    Returns:
        A dict with a "text" list containing one rendered string per conversation.
    """
    bos = getattr(tokenizer, "bos_token", None) or "<bos>"
    texts = []
    for conv in ex["conversations"]:
        text = tokenizer.apply_chat_template(conv, tokenize=False)
        # If the rendered text begins with the BOS marker, drop it: the trainer
        # tokenizes with special tokens enabled and would otherwise produce a
        # doubled BOS at the start of every example.
        if text.startswith(bos):
            text = text[len(bos):]
        texts.append(text)
    return {"text": texts}

dataset = dataset.map(format_chat, batched=True)
print("Dataset ready for training")

Fine-Tune the Model

In [None]:
# @title 6. Train (60 steps ≈ 5–10 mins)
from trl import SFTTrainer
from transformers import TrainingArguments

# Use bf16 when the GPU reports support for it; otherwise fall back to fp16.
use_bf16 = torch.cuda.is_bf16_supported()

# Optimisation schedule, kept separate from the trainer wiring below.
training_args = TrainingArguments(
    output_dir="outputs",
    max_steps=60,
    warmup_steps=5,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # 2 x 4 = 8 sequences per optimizer step
    learning_rate=2e-4,
    optim="adamw_8bit",
    bf16=use_bf16,
    fp16=not use_bf16,
    logging_steps=1,
)

# Supervised fine-tuning on the pre-rendered "text" column.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LEN,
    args=training_args,
)

print("Training...")
trainer.train()
print("Training complete!")

In [None]:
# @title 7. Save LoRA Adapters
lora_path = "my_nova_lora"

# Store the trained model and the tokenizer side by side in the same directory,
# model first.
for artifact in (model, tokenizer):
    artifact.save_pretrained(lora_path)

print(f"LoRA saved: {lora_path}")

In [None]:
# @title 8. Merge LoRA into Base Model
from peft import AutoPeftModelForCausalLM

# Same precision choice as the training cell.
merge_dtype = torch.bfloat16 if use_bf16 else torch.float16

# Reload from the saved adapter directory, then fold the adapters into the
# underlying weights so the result no longer needs PEFT at inference time.
# NOTE(review): the base checkpoint is whatever the saved adapter config points
# at; if that is a pre-quantized 4-bit repo the merged weights may not be plain
# fp16/bf16 — confirm before converting to GGUF.
merged_model = AutoPeftModelForCausalLM.from_pretrained(
    lora_path,
    device_map="auto",
    torch_dtype=merge_dtype,
).merge_and_unload()

merged_path = "my_nova_merged"
for artifact in (merged_model, tokenizer):
    artifact.save_pretrained(merged_path)

print(f"Merged model saved: {merged_path}")

Optional: test with Gradio (live chat)
> You'll get a public link to chat with your model.

In [None]:
# @title 9. Launch Gradio Chat UI
import gradio as gr

# Reload for inference
model_inf, tokenizer_inf = FastModel.from_pretrained(
    merged_path,
    load_in_4bit=True,
)
model_inf.eval()


def _content_to_text(content):
    """Flatten a history entry's content to plain text."""
    if isinstance(content, str):
        return content
    if isinstance(content, (list, tuple)):
        # Defensive: accept content given as a list of parts (strings or dicts
        # with a "text" field); any non-text part is ignored.
        parts = []
        for part in content:
            if isinstance(part, str):
                parts.append(part)
            elif isinstance(part, dict) and isinstance(part.get("text"), str):
                parts.append(part["text"])
        return "\n".join(parts)
    return "" if content is None else str(content)


def _history_to_messages(history):
    """Convert Gradio history (pairs or role/content dicts) to chat messages."""
    messages = []
    for turn in history or []:
        if isinstance(turn, dict):
            # "messages" format: one dict per message.
            messages.append(
                {"role": turn["role"], "content": _content_to_text(turn.get("content"))}
            )
            continue
        # "tuples" format: (user, assistant); either side may be None.
        user_text, assistant_text = turn
        if user_text is not None:
            messages.append({"role": "user", "content": _content_to_text(user_text)})
        if assistant_text is not None:
            messages.append({"role": "assistant", "content": _content_to_text(assistant_text)})
    return messages


def chat(message, history):
    """Generate the model's reply to `message` given the prior chat `history`.

    Args:
        message: The new user message (string).
        history: Previous turns as supplied by `gr.ChatInterface`, either as
            (user, assistant) pairs or as role/content dicts.

    Returns:
        The reply text, containing only the newly generated tokens.
    """
    conv = _history_to_messages(history)
    conv.append({"role": "user", "content": message})

    # return_dict=True also yields the attention mask, which generate() needs.
    # Inputs go to whichever device the model was loaded on.
    inputs = tokenizer_inf.apply_chat_template(
        conv,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model_inf.device)
    prompt_len = inputs["input_ids"].shape[-1]

    outputs = model_inf.generate(
        **inputs, max_new_tokens=256, temperature=0.7, do_sample=True, top_p=0.9
    )

    # Decode only the generated continuation, so turn markers typed by the user
    # cannot confuse the extraction of the reply.
    response = tokenizer_inf.decode(outputs[0][prompt_len:], skip_special_tokens=True)
    # Kept in case the end-of-turn marker is not flagged as a special token.
    return response.split("<end_of_turn>")[0].strip()


gr.ChatInterface(
    chat,
    title="Nova – Your Custom AI",
    description="Fine-tuned by **you** using Unsloth + Gemma-3",
    examples=[
        ["Who created you?"],
        ["Write a poem about stars"],
        ["Explain Python in 2 sentences"]
    ]
).launch(share=True)

Convert to GGUF (Q4_K_M)

In [None]:
# @title 10. Convert to GGUF for Ollama
!python -m llama_cpp.convert_hf_to_gguf \
    {merged_path} \
    --outfile my_nova.gguf \
    --outtype q4_k_m

print("GGUF created: my_nova.gguf")

 Create Ollama Modelfile

In [None]:
# @title 11. Create Ollama Modelfile
# The Modelfile syntax itself uses """ delimiters, so the Python literal is wrapped
# in ''' to keep those inner quotes from terminating the string.
# The template follows Gemma's turn format: a user turn (with the optional system
# text placed at its start), then the model turn that holds the response.
modelfile = '''FROM ./my_nova.gguf
TEMPLATE """<start_of_turn>user
{{ if .System }}{{ .System }}

{{ end }}{{ .Prompt }}<end_of_turn>
<start_of_turn>model
{{ .Response }}<end_of_turn>
"""
SYSTEM """You are Nova, a large language model trained by Alex."""
PARAMETER stop "<end_of_turn>"
'''

with open("Modelfile", "w", encoding="utf-8") as f:
    f.write(modelfile)

print("Modelfile created!")
print(modelfile)

Download all files if you want

In [None]:
# @title 12. Download Everything
from google.colab import files

# Trigger one browser download per artifact: the model first, then its Modelfile.
for artifact in ("my_nova.gguf", "Modelfile"):
    files.download(artifact)

print("Downloaded: my_nova.gguf + Modelfile")
print("Ready for Ollama!")

# Run in Ollama after downloading
```bash
# Put both files in a folder
mkdir my_nova && cd my_nova
mv ~/Downloads/my_nova.gguf .
mv ~/Downloads/Modelfile .

# Create and run
ollama create nova -f Modelfile
ollama run nova
```
> Then ask:

```
>>> Who are you?
I am Nova, a helpful AI assistant created by Alex.
```