In [None]:
import os
# Show the working directory and its contents, one per line, to confirm the
# dataset file is where the later cells expect it.
working_dir = os.getcwd()
print(working_dir, os.listdir(), sep="\n")

/content
['.config', 'huggingface_tokenizers_cache', 'unsloth_compiled_cache', 'yoda_chat_02.jsonl', 'sample_data']


In [None]:
import json

# Load the chat dataset (one JSON object per line) and report honestly on how
# much of it parsed.
# NOTE: the name `file` is kept because a later cell reads the examples from it.
file = []
bad_lines = []  # (line number, error message) for every line that failed to parse

# Explicit UTF-8: the platform default encoding is not guaranteed to be UTF-8.
with open("yoda_chat_02.jsonl", "r", encoding="utf-8") as f:
    for line_num, line in enumerate(f, 1):
        line = line.strip()
        if not line:
            continue  # tolerate blank lines between records
        try:
            file.append(json.loads(line))
        except json.JSONDecodeError as e:
            # Record the failure and keep scanning: stopping at the first bad
            # line would silently drop every valid entry that follows it.
            print(f"Error on line {line_num}: {e}")
            bad_lines.append((line_num, str(e)))

# Only claim success when every non-blank line parsed.
if bad_lines:
    print(f"WARNING: loaded {len(file)} entries, skipped {len(bad_lines)} malformed line(s)")
else:
    print(f"Successfully loaded {len(file)} entries")
print(f"File size check: {len(file)} entries loaded")

# Show a sample entry to verify content looks good
if file:
    print("\nSample entry:")
    print(json.dumps(file[0], indent=2))

Successfully loaded 428 entries
File size check: 428 entries loaded

Sample entry:
{
  "messages": [
    {
      "role": "user",
      "content": "Impossible! The Sith have been extinct for a millennium."
    },
    {
      "role": "assistant",
      "content": "The very Republic is threatened, if involved the Sith are."
    }
  ]
}


Run the below block first.

In [None]:
# Install the fine-tuning stack. No versions are pinned on this line.
!pip install unsloth trl peft accelerate bitsandbytes



We expect an **error message** about a package conflict between `sentencepiece` and `google.protobuf`.

Run the block below next, then click **RESTART SESSION** so that version 3.20.3 of `protobuf` and version 0.1.99 of `sentencepiece` are loaded. Afterwards, rerun all the cells **EXCEPT FOR THE PRIOR CELL**, which is the `!pip install unsloth trl peft accelerate bitsandbytes` cell.

Finally, check the versions with the cell that comes after the one below, i.e. this cell specifically:

```python
import sentencepiece, google.protobuf
print("sentencepiece:", sentencepiece.__version__)
print("protobuf:", google.protobuf.__version__)
```

In [None]:
# Force-reinstall pinned protobuf and sentencepiece builds. pip reports a
# dependency conflict afterwards because unsloth / unsloth-zoo declare
# sentencepiece>=0.2.0 (see the output below).
!pip install --force-reinstall protobuf==3.20.3 sentencepiece==0.1.99

Collecting protobuf==3.20.3
  Using cached protobuf-3.20.3-py2.py3-none-any.whl.metadata (720 bytes)
Collecting sentencepiece==0.1.99
  Using cached sentencepiece-0.1.99-cp312-cp312-linux_x86_64.whl
Using cached protobuf-3.20.3-py2.py3-none-any.whl (162 kB)
Installing collected packages: sentencepiece, protobuf
  Attempting uninstall: sentencepiece
    Found existing installation: sentencepiece 0.2.1
    Uninstalling sentencepiece-0.2.1:
      Successfully uninstalled sentencepiece-0.2.1
  Attempting uninstall: protobuf
    Found existing installation: protobuf 3.20.3
    Uninstalling protobuf-3.20.3:
      Successfully uninstalled protobuf-3.20.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
unsloth-zoo 2025.8.8 requires sentencepiece>=0.2.0, but you have sentencepiece 0.1.99 which is incompatible.
unsloth 2025.8.9 requires sentencepiece>=0.2.0, but you

Check the versions printed by the cell below; it should show the following:

- sentencepiece: 0.1.99
- protobuf: 3.20.3

In [None]:
# Print the versions of sentencepiece and protobuf that this session imports,
# to compare against the pinned versions.
import sentencepiece, google.protobuf
print("sentencepiece:", sentencepiece.__version__)
print("protobuf:", google.protobuf.__version__)

sentencepiece: 0.1.99
protobuf: 3.20.3


In [None]:
# Report whether a CUDA device is visible and, if so, the name of device 0.
import torch

cuda_ok = torch.cuda.is_available()
gpu_label = torch.cuda.get_device_name(0) if cuda_ok else 'None'
print(f"CUDA available: {cuda_ok}")
print(f"GPU: {gpu_label}")

CUDA available: True
GPU: Tesla T4


In [None]:
from unsloth import FastLanguageModel
import torch

# Checkpoint to load and its loading options.
model_name = "unsloth/mistral-7b-instruct-v0.3"
max_seq_length = 2048  # sequence length; also passed to the trainer later
dtype = None  # auto detection

# Gather the loader options in one place, then load model and tokenizer.
load_options = {
    "model_name": model_name,
    "max_seq_length": max_seq_length,
    "dtype": dtype,
    "load_in_4bit": True,
}
model, tokenizer = FastLanguageModel.from_pretrained(**load_options)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.8.9: Fast Mistral patching. Transformers: 4.55.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [None]:
from datasets import Dataset

def format_prompt(example):
    """Render one chat example as ``[INST] ... [/INST]`` training text.

    Messages are walked in order so that every turn of a multi-turn
    conversation is kept. (Collapsing the messages into a role -> content
    mapping would keep only the last user and last assistant message.)

    Args:
        example: Mapping with a "messages" list; each message is a dict with
            a "role" and a "content" entry. Roles other than "user" and
            "assistant" are ignored. Missing or None content is treated as
            an empty string.

    Returns:
        A string with one "[INST] <user> [/INST]" line followed by the
        assistant reply on the next line for each turn, turns joined by a
        newline. A single-turn example yields exactly one such segment; an
        example without messages yields a segment with empty user and reply.
    """
    turns = []  # each item is [user_text, assistant_text or None if not yet seen]
    for message in example["messages"]:
        role = message["role"]
        text = (message.get("content") or "").strip()
        if role == "user":
            turns.append([text, None])
        elif role == "assistant":
            if turns and turns[-1][1] is None:
                # Reply to the most recent, still unanswered user message.
                turns[-1][1] = text
            else:
                # Reply with no pending user message: pair it with an empty prompt.
                turns.append(["", text])

    if not turns:
        # Keep the shape of the output stable even for an empty conversation.
        turns.append(["", None])

    return "\n".join(
        f"[INST] {user} [/INST]\n{reply or ''}" for user, reply in turns
    )

# Apply to all examples and wrap each formatted string in a {"text": ...} record.
# The tokenizer's EOS token is appended so every training sample ends with an
# explicit end-of-sequence marker; without it the model is never shown where a
# reply should stop.
eos_token = tokenizer.eos_token
formatted_examples = [{"text": format_prompt(ex) + eos_token} for ex in file]

# Separate name for the list so `dataset` only ever refers to the Dataset object.
dataset = Dataset.from_list(formatted_examples)

print(dataset[0])  # see the first formatted sample

{'text': '[INST] Impossible! The Sith have been extinct for a millennium. [/INST]\nThe very Republic is threatened, if involved the Sith are.'}


In [None]:
# Attach LoRA adapters to the model.
# Projection layers that receive adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

lora_options = dict(
    r=64,                # LoRA rank - higher = more capacity, more memory
    target_modules=lora_target_modules,
    lora_alpha=128,      # LoRA scaling factor (usually 2x rank)
    lora_dropout=0,      # Supports any, but = 0 is optimized
    bias="none",         # Supports any, but = "none" is optimized
    use_gradient_checkpointing="unsloth",  # Unsloth's optimized version
    random_state=3407,
    use_rslora=False,    # Rank stabilized LoRA
    loftq_config=None,   # LoftQ
)

# NOTE: `model` is rebound to the adapter-wrapped model, so this cell should be
# run once per freshly loaded model.
model = FastLanguageModel.get_peft_model(model, **lora_options)

Unsloth 2025.8.9 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Half-precision mode: bf16 when the GPU supports it, fp16 otherwise.
use_bf16 = torch.cuda.is_bf16_supported()

# Optimisation and bookkeeping settings for the run.
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # Effective batch size = 8
    warmup_steps=10,
    num_train_epochs=3,
    learning_rate=2e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=25,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
    save_strategy="epoch",
    save_total_limit=2,
    dataloader_pin_memory=False,
    report_to="none",  # Disable Weights & Biases logging
)

# Supervised fine-tuning trainer over the "text" column of the dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    args=training_args,
)

Unsloth: Tokenizing ["text"]:   0%|          | 0/428 [00:00<?, ? examples/s]

In [None]:
# Train the model
# Runs the fine-tuning loop configured on `trainer`; the value returned by
# train() is kept in `trainer_stats`.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 428 | Num Epochs = 3 | Total steps = 162
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 167,772,160 of 7,415,795,712 (2.26% trained)


Step,Training Loss
25,3.0241
50,2.2521
75,1.3875
100,1.2189
125,0.815
150,0.6447


Unsloth: Will smartly offload gradients to save VRAM!


In [None]:
from transformers import StoppingCriteria, StoppingCriteriaList

# Test the fine-tuned model
FastLanguageModel.for_inference(model)

# Test prompt
messages = [
    {"role": "user", "content": "Tell me about the force"},
]

# return_dict=True makes the tokenizer return the attention mask together with
# the input ids. Passing both to generate() avoids the "attention mask is not
# set" warning, which appears because the pad token is set to the EOS token.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
).to("cuda")

# First, try WITHOUT stopping criteria to see if basic generation works
print("Testing basic generation...")
outputs = model.generate(
    **inputs,  # input_ids and attention_mask
    max_new_tokens=50,
    temperature=0.9,
    do_sample=True,
    top_p=0.85,
    repetition_penalty=1.3,
    no_repeat_ngram_size=4,
    pad_token_id=tokenizer.eos_token_id,
)

# The decoded text contains the prompt followed by the generated continuation.
response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
print("Basic response:")
print(response)
print("\n" + "="*50 + "\n")

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Testing basic generation...
Basic response:
Tell me about the force
The Force, an energy binds us all. Surrounds, penetrates, and connects every living thing it does. Feel its presence you can - in trees, rocks, even other people! Luminous beings are we not;




In [None]:
# Export the fine-tuned model to GGUF under "gguf_model" with q4_k_m quantization.
# Per the log below, this merges the 4-bit and LoRA weights to 16-bit, converts
# to an F16 GGUF, and then quantizes that file.
model.save_pretrained_gguf("gguf_model", tokenizer, quantization_method="q4_k_m")

Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 4.1G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 3.95 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


 53%|█████▎    | 17/32 [00:01<00:01, 13.10it/s]
We will save to Disk and not RAM now.
100%|██████████| 32/32 [02:54<00:00,  5.45s/it]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving gguf_model/pytorch_model-00001-of-00003.bin...
Unsloth: Saving gguf_model/pytorch_model-00002-of-00003.bin...
Unsloth: Saving gguf_model/pytorch_model-00003-of-00003.bin...
Done.


Unsloth: Converting mistral model. Can use fast conversion = True.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: CMAKE detected. Finalizing some steps for installation.
Unsloth: [1] Converting model at gguf_model into f16 GGUF format.
The output location will be /content/gguf_model/unsloth.F16.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: gguf_model
INFO:hf-to-gguf:Model architecture: MistralForCausalLM
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model weight map from 'pytorch_model.bin.index.json'
INFO:hf-to-gguf:gguf: loading model part 'pytorch_model-00001-of-00003.bin'
INFO:hf-t

The file was 4 GB, which is too large for this code to work; a file must be 1–2 GB for the download to succeed. (There are two GGUF files: `unsloth.Q4_K_M.gguf`, which is the actual model, and `unsloth.F16.gguf`, which is 16 GB and much larger. The latter file is used for retraining the model.)

In [None]:
# Disabled: browser download of a GGUF file through google.colab.
# NOTE(review): `gguf_files[0]` takes whichever ".gguf" name os.listdir returns
# first; the recorded output below shows that was unsloth.F16.gguf rather than
# the quantized unsloth.Q4_K_M.gguf.
# from google.colab import files
# import os

# gguf_files = [f for f in os.listdir("gguf_model") if f.endswith(".gguf")]
# if gguf_files:
#     gguf_file = os.path.join("gguf_model", gguf_files[0])
#     print(f"Downloading: {gguf_file}")
#     files.download(gguf_file)

Downloading: gguf_model/unsloth.F16.gguf


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Check to see the ".gguf" files we have.

In [None]:
import os

# List the export directory with human-readable file sizes.
!ls -lh gguf_model

total 32G
-rw-r--r-- 1 root root 3.9K Aug 24 06:23 chat_template.jinja
-rw-r--r-- 1 root root  694 Aug 24 06:23 config.json
-rw-r--r-- 1 root root  157 Aug 24 06:23 generation_config.json
-rw-r--r-- 1 root root 4.7G Aug 24 06:25 pytorch_model-00001-of-00003.bin
-rw-r--r-- 1 root root 4.7G Aug 24 06:29 pytorch_model-00002-of-00003.bin
-rw-r--r-- 1 root root 4.3G Aug 24 06:31 pytorch_model-00003-of-00003.bin
-rw-r--r-- 1 root root  24K Aug 24 06:31 pytorch_model.bin.index.json
-rw-r--r-- 1 root root  560 Aug 24 06:23 special_tokens_map.json
-rw-r--r-- 1 root root 134K Aug 24 06:23 tokenizer_config.json
-rw-r--r-- 1 root root 3.6M Aug 24 06:23 tokenizer.json
-rw-r--r-- 1 root root 574K Aug 24 06:23 tokenizer.model
-rw-r--r-- 1 root root  14G Aug 24 06:37 unsloth.F16.gguf
-rw-r--r-- 1 root root 4.1G Aug 24 06:51 unsloth.Q4_K_M.gguf


In [None]:
# List the working directory again; the output below now includes the
# directories created since the first listing (e.g. "gguf_model", "outputs").
print(os.listdir())

['.config', 'gguf_model', 'huggingface_tokenizers_cache', 'unsloth_compiled_cache', '_unsloth_sentencepiece_temp', 'llama.cpp', 'outputs', 'yoda_chat_02.jsonl', 'sample_data']


The code below uploads the actual model, `unsloth.Q4_K_M.gguf`, to a private Hugging Face repo.

In [None]:
from huggingface_hub import login
# Authenticate with the Hugging Face Hub. Called without arguments, so no token
# is hard-coded in the notebook; the recorded output below is a login widget.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from huggingface_hub import HfApi

api = HfApi()

# Create a new private model repo under your account.
# exist_ok=True keeps this cell re-runnable: without it, create_repo raises an
# error when the repo already exists (e.g. on a second run of the notebook).
api.create_repo(
    "mistral-gguf",   # repo_id (your_username/repo_name if you want explicit)
    repo_type="model",
    private=True,
    exist_ok=True,
)

RepoUrl('https://huggingface.co/vselvam05/mistral-gguf', endpoint='https://huggingface.co', repo_type='model', repo_id='vselvam05/mistral-gguf')

In [None]:
from huggingface_hub import upload_file

# Push the quantized GGUF to the model repo, keeping the same file name.
upload_options = {
    "path_or_fileobj": "gguf_model/unsloth.Q4_K_M.gguf",
    "path_in_repo": "unsloth.Q4_K_M.gguf",
    "repo_id": "vselvam05/mistral-gguf",
    "repo_type": "model",
}
# Left as the cell's last expression so the returned commit info is displayed.
upload_file(**upload_options)

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  gguf_model/unsloth.Q4_K_M.gguf        :   0%|          |  551kB / 4.37GB            

CommitInfo(commit_url='https://huggingface.co/vselvam05/mistral-gguf/commit/469046320a36ed9d2b6eb8f08c06ff9582f01140', commit_message='Upload unsloth.Q4_K_M.gguf with huggingface_hub', commit_description='', oid='469046320a36ed9d2b6eb8f08c06ff9582f01140', pr_url=None, repo_url=RepoUrl('https://huggingface.co/vselvam05/mistral-gguf', endpoint='https://huggingface.co', repo_type='model', repo_id='vselvam05/mistral-gguf'), pr_revision=None, pr_num=None)

I would have uploaded the other file, `unsloth.F16.gguf` (16 GB), but Colab disconnected from the T4 GPU, which effectively got rid of all the files. Thankfully, however, I had saved `unsloth.Q4_K_M.gguf` (4 GB), which is the actual model.

The `unsloth.F16.gguf` model is only needed if we want to retrain the model.