In [28]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29 peft trl triton
    !pip install --no-deps cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth
    # Install huggingface_hub[cli] for easy model uploading
    !pip install -U "huggingface_hub[cli]"

In [None]:
# Destination repository on the Hugging Face Hub, expressed as "<user>/<model>".
# Both placeholders below must be edited before the upload cells are run.
HF_USERNAME = "Your Username"  # Replace with your HuggingFace username
MODEL_NAME = ""  # Choose your model name

# Full repo id used by every push_to_hub call later in the notebook.
HF_MODEL_ID = "/".join((HF_USERNAME, MODEL_NAME))

In [None]:
import os
from getpass import getpass

# Never paste the access token into the notebook source: a saved or shared
# notebook would leak it. Prefer the HF_TOKEN environment variable and fall
# back to a hidden interactive prompt when it is unset or empty.
HF_TOKEN = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")

In [31]:
from huggingface_hub import login
# Log this session in to the Hugging Face Hub with the token defined above.
login(token=HF_TOKEN)

In [32]:
from unsloth import FastLanguageModel
import torch
# Settings shared by the model-loading cell and the trainer below.
max_seq_length = 2048  # Choose any! We auto support RoPE Scaling internally!
dtype = None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True  # Use 4bit quantization to reduce memory usage

In [33]:
# Everything needed to load the base checkpoint, gathered in one place so the
# model can be swapped by editing a single entry.
base_model_kwargs = dict(
    model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0",  # Change just this line
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# Returns the model together with its matching tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_kwargs)

==((====))==  Unsloth 2025.3.9: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [34]:
# Prepare the model for fine-tuning by attaching LoRA adapters.
# Projection layers that receive an adapter.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Note: this rebinds `model` to the adapter-wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

In [35]:
from datasets import load_dataset
from unsloth import to_sharegpt, standardize_sharegpt, apply_chat_template

# Read the CSV and take its "train" split directly, so `dataset` is the
# split itself rather than a dictionary of splits.
dataset = load_dataset("csv", data_files="others_dataset.csv", split="train")

Generating train split: 0 examples [00:00, ? examples/s]

In [36]:
# Convert the instruction/input/output columns into ShareGPT-style
# conversations. The merged prompt combines the `instruction` and `input`
# columns; the `output` column supplies the reply.
# NOTE(review): the [[...]] part looks like an optional section that is used
# only when `input` is present - confirm against the Unsloth docs.
sharegpt_dataset = to_sharegpt(
    dataset,
    merged_prompt="{instruction}[[\nYour input is:\n{input}]]",
    output_column_name="output",
    conversation_extension=3,
)

# Normalise the conversation format before the chat template is applied.
dataset = standardize_sharegpt(sharegpt_dataset)

Merging columns:   0%|          | 0/1000 [00:00<?, ? examples/s]

Converting to ShareGPT:   0%|          | 0/1000 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/1000 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/1000 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/1000 [00:00<?, ? examples/s]

Extending conversations:   0%|          | 0/1000 [00:00<?, ? examples/s]

Standardizing format:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [37]:
# Prompt layout used to render each conversation turn for training.
# {INPUT} and {OUTPUT} are the placeholders for the user turn and the reply.
chat_template = """Below are some instructions that describe some tasks. Write responses that appropriately complete each request.

### Instruction:
{INPUT}

### Response:
{OUTPUT}"""

# Render every conversation in the dataset with the template above.
dataset = apply_chat_template(dataset, tokenizer=tokenizer, chat_template=chat_template)

Unsloth: We automatically added an EOS token to stop endless generations.


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [38]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Probe bf16 support once; exactly one of fp16 / bf16 is enabled below.
use_bf16 = is_bfloat16_supported()

# Optimisation hyper-parameters for the run.
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    max_steps=60,  # Increase for better results
    learning_rate=2e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
    report_to="none",
)

# Supervised fine-tuning on the "text" column of the prepared dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=training_args,
)

Unsloth: We found double BOS tokens - we shall remove one automatically.


Tokenizing to ["text"] (num_proc=2):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [39]:
# Run the fine-tuning job configured on `trainer`; the value returned by
# train() is kept in `trainer_stats` for later inspection.
print("Starting training...")
trainer_stats = trainer.train()
print("Training complete!")

Starting training...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,000 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 12,615,680/628,221,952 (2.01% trained)


Step,Training Loss
1,2.0152
2,1.9797
3,2.084
4,1.9321
5,1.7344
6,1.7027
7,1.571
8,1.3368
9,1.2624
10,1.1727


Training complete!


In [44]:
from unsloth import FastLanguageModel

# Enable native 2x faster inference
FastLanguageModel.for_inference(model)

print("Testing the model with a sample query...")

# A single-turn conversation containing the sample question.
test_question = "How does listening to music contribute in health?"
messages = [dict(role="user", content=test_question)]

# Render the conversation through the tokenizer's chat template, ending with
# the generation prompt, then move the resulting tensor to the GPU.
encoded_prompt = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
)
input_ids = encoded_prompt.to("cuda")

Testing the model with a sample query...


In [45]:
from transformers import TextStreamer

# Stream decoded tokens to the cell output as they are generated, without
# echoing the prompt.
text_streamer = TextStreamer(tokenizer, skip_prompt=True)

generation_kwargs = {
    "streamer": text_streamer,
    "max_new_tokens": 128,
    "pad_token_id": tokenizer.eos_token_id,
}

# The return value is discarded; the streamer already prints the reply.
_ = model.generate(input_ids, **generation_kwargs)

Improves memory, boosts problem-solving skills, and enhances cultural awareness.</s>


In [42]:
# Save the model and tokenizer locally, then upload both to HuggingFace.
print(f"Saving model locally and to HuggingFace as {HF_MODEL_ID}...")

# Model first, tokenizer second, for both the local save and the upload.
artifacts = (model, tokenizer)

# Save locally into the "lora_model" directory
for artifact in artifacts:
    artifact.save_pretrained("lora_model")

# Push to HuggingFace Hub
for artifact in artifacts:
    artifact.push_to_hub(HF_MODEL_ID, token=HF_TOKEN)

Saving model locally and to HuggingFace as XoXoHarsh/others-assistant-tinyllama...


README.md:   0%|          | 0.00/584 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/50.5M [00:00<?, ?B/s]

Saved model to https://huggingface.co/XoXoHarsh/others-assistant-tinyllama


tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

In [43]:
print("Creating GGUF versions...")

# Two quantized GGUF files are requested: an 8-bit (q8_0) and a 4-bit
# (q4_k_m) version. Both are uploaded to the same Hub repository.
gguf_quantization_methods = ["q8_0", "q4_k_m"]

model.push_to_hub_gguf(
    HF_MODEL_ID,
    tokenizer,
    quantization_method=gguf_quantization_methods,
    token=HF_TOKEN,
)

# Summary shown once the upload call has returned.
print(f"\nModel successfully uploaded to HuggingFace at {HF_MODEL_ID}")
print(f"You can view it at: https://huggingface.co/{HF_MODEL_ID}")
print("GGUF versions are available for download for local Ollama use")

Unsloth: ##### The current model auto adds a BOS token.
Unsloth: ##### Your chat template has a BOS token. We shall remove it temporarily.


Creating GGUF versions...


Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 762.5M


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 6.69 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 22/22 [00:00<00:00, 32.04it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving XoXoHarsh/others-assistant-tinyllama/pytorch_model.bin...
Done.


Unsloth: Converting llama model. Can use fast conversion = True.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q8_0', 'q4_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: CMAKE detected. Finalizing some steps for installation.
Unsloth: [1] Converting model at XoXoHarsh/others-assistant-tinyllama into f16 GGUF format.
The output location will be /content/XoXoHarsh/others-assistant-tinyllama/unsloth.F16.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: others-assistant-tinyllama
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model part 'pytorch_model.bin'
INFO:hf-to-gguf:token_embd.weight,           torch.float16 --> F16, shape = {2048, 320

unsloth.Q8_0.gguf:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Saved GGUF to https://huggingface.co/XoXoHarsh/others-assistant-tinyllama
Unsloth: Uploading GGUF to Huggingface Hub...


unsloth.Q4_K_M.gguf:   0%|          | 0.00/668M [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Saved GGUF to https://huggingface.co/XoXoHarsh/others-assistant-tinyllama


No files have been modified since last commit. Skipping to prevent empty commit.
Unsloth: ##### The current model auto adds a BOS token.
Unsloth: ##### We removed it in GGUF's chat template for you.


Saved Ollama Modelfile to https://huggingface.co/XoXoHarsh/others-assistant-tinyllama

Model successfully uploaded to HuggingFace at XoXoHarsh/others-assistant-tinyllama
You can view it at: https://huggingface.co/XoXoHarsh/others-assistant-tinyllama
GGUF versions are available for download for local Ollama use
