In [1]:
%%capture
!pip install unsloth
# Also get the latest nightly Unsloth!
!pip install --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [2]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048
dtype = None
load_in_4bit = True

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B-Instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.1.6: Fast Llama patching. Transformers: 4.47.1.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

In [3]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

Unsloth 2025.1.6 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [4]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)

def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
    return { "text" : texts, }
pass

from datasets import load_dataset
dataset = load_dataset("Nitral-AI/Creative_Writing-ShareGPT", split = "train")

README.md:   0%|          | 0.00/534 [00:00<?, ?B/s]

Creative_Writing-ShareGPT.jsonl:   0%|          | 0.00/27.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6653 [00:00<?, ? examples/s]

In [5]:
dataset

Dataset({
    features: ['conversations'],
    num_rows: 6653
})

In [6]:
from unsloth.chat_templates import standardize_sharegpt
dataset = standardize_sharegpt(dataset)
dataset = dataset.map(formatting_prompts_func, batched = True,)

Standardizing format:   0%|          | 0/6653 [00:00<?, ? examples/s]

Map:   0%|          | 0/6653 [00:00<?, ? examples/s]

In [8]:
dataset[0]['text']

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nI\'m looking for story writing help. For a horror story.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI can assist with horror story writing. What specific aspects would you like help with? Developing plot points, creating unsettling atmosphere, building suspense, crafting scary scenes? Let me know the details of what you need and I\'ll provide guidance to the best of my abilities, while keeping the content within appropriate boundaries.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nI would like help with atmosphere and descriptions of scary scenes.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nTo create an unsettling atmosphere and describe scary scenes effectively:\n\n- Use sensory details to immerse the reader. Describe eerie sounds like creaking floorboards, howling wi

### Model Training

In [9]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none",
    ),
)

Map (num_proc=2):   0%|          | 0/6653 [00:00<?, ? examples/s]

In [10]:
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)

Map:   0%|          | 0/6653 [00:00<?, ? examples/s]

In [11]:
tokenizer.decode(trainer.train_dataset[5]["input_ids"])

"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHey Claude, please write a very short horror story in 2-3 sentences about a man who is afraid of his own reflection.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nJohn never liked mirrors, but lately his reflection had begun to terrify him. He swore that sometimes, out of the corner of his eye, he would catch his reflection moving on its own, its face twisting into a sinister sneer. One night, as he brushed his teeth, he glanced up and saw his reflection grinning at him with sharp, jagged teeth - and in that moment, he knew it was no longer his own face staring back.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nPlease create a 3-sentence story about a teenage girl who lives in a dark, creepy mansion all alone after her father mysteriously died, but she plays with possessed dolls in th

In [12]:
space = tokenizer(" ", add_special_tokens = False).input_ids[0]
tokenizer.decode([space if x == -100 else x for x in trainer.train_dataset[5]["labels"]])

"                                                             \n\nJohn never liked mirrors, but lately his reflection had begun to terrify him. He swore that sometimes, out of the corner of his eye, he would catch his reflection moving on its own, its face twisting into a sinister sneer. One night, as he brushed his teeth, he glanced up and saw his reflection grinning at him with sharp, jagged teeth - and in that moment, he knew it was no longer his own face staring back.<|eot_id|>                                               \n\nLily's footsteps echoed through the empty halls of the mansion, the only sound in the deafening silence that had overtaken her life since her father's unexplained death. She climbed the creaky stairs up to the attic, the one place that still brought her some solace, even if her only companions were the eerie, porcelain dolls that lined the shelves. As she reached for her favorite doll, its eyes suddenly snapped open, glassy and soulless, and Lily knew in that

In [13]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.748 GB.
2.543 GB of memory reserved.


In [14]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 6,653 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
1,2.4496
2,2.3937
3,2.7741
4,2.4827
5,1.5859
6,2.2887
7,2.4999
8,2.3572
9,1.9501
10,2.7009


In [15]:
# @title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

755.2928 seconds used for training.
12.59 minutes used for training.
Peak reserved memory = 3.895 GB.
Peak reserved memory for training = 1.352 GB.
Peak reserved memory % of max memory = 26.41 %.
Peak reserved memory for training % of max memory = 9.167 %.


### Inference

In [16]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"role": "user", "content": "generate a poem on the theme of universe and life"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True,
                         temperature = 1.5, min_p = 0.1)
tokenizer.batch_decode(outputs)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


["<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\ngenerate a poem on the theme of universe and life<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nIn the cosmic dance, a song unfolds \nA rhythm echoing, as the universe unfolds\nStars, galaxies spinning, cosmic energy abounds \nA cosmic ballet, in an ever-revolved sound \n\nPlanets orbit, a symphony to tell \nOf life, love and creation's intricate spell\nIn every atom,"]

In [17]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"role": "user", "content": "Continue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 128,
                   use_cache = True, temperature = 1.5, min_p = 0.1)

1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584, 4181.<|eot_id|>


### Saving and loading the fine-tune models

In [18]:
model.save_pretrained("lora_model")  # Local saving
tokenizer.save_pretrained("lora_model")
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving

('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/tokenizer.json')

In [19]:
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"role": "user", "content": "Write a beautiful poem about India"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 128,
                   use_cache = True, temperature = 1.5, min_p = 0.1)

Amidst the whispers of ancient days
Golden hues of sunsets fade
Rays on temples, mystic tales
Echoes in the heart that never fails

In spice-scented cities alive
Flavors dance upon eager lips
Coffee brews in steamy lanes
Sounds of markets in each turn

From Ganges' sacred flowing flow
To Taj's grandeur in morning's glow
Hindu steps along spiritual paths
Intricate patterns weave vibrant aftermath

Rajput warriors, chivalrous and bold
Elegantly regale stories told
Maharajas rule with majesty
Vines ent


In [None]:
from google.colab import userdata

model.push_to_hub_gguf(
        "sai-santhosh/creative-writing-model", # Change hf to your username!
        tokenizer,
        quantization_method = ["q4_k_m", "q8_0", "q5_k_m",],
        token = userdata.get("HF_TOKEN") , # Get a token at https://huggingface.co/settings/tokens
    )

Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 2.2G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 3.64 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 28/28 [00:01<00:00, 17.48it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving sai-santhosh/creative-writing-model/pytorch_model-00001-of-00002.bin...
Unsloth: Saving sai-santhosh/creative-writing-model/pytorch_model-00002-of-00002.bin...
Done.


Unsloth: Converting llama model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m', 'q8_0', 'q5_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: CMAKE detected. Finalizing some steps for installation.
Unsloth: [1] Converting model at sai-santhosh/creative-writing-model into f16 GGUF format.
The output location will be /content/sai-santhosh/creative-writing-model/unsloth.F16.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: creative-writing-model
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {64}
INFO:hf-to-gguf:gguf: loading model weight map from 'pytorch_m

  0%|          | 0/1 [00:00<?, ?it/s]

unsloth.Q4_K_M.gguf:   0%|          | 0.00/2.02G [00:00<?, ?B/s]

Saved GGUF to https://huggingface.co/sai-santhosh/creative-writing-model
Unsloth: Uploading GGUF to Huggingface Hub...


  0%|          | 0/1 [00:00<?, ?it/s]

unsloth.Q8_0.gguf:   0%|          | 0.00/3.42G [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Saved GGUF to https://huggingface.co/sai-santhosh/creative-writing-model
Unsloth: Uploading GGUF to Huggingface Hub...


  0%|          | 0/1 [00:00<?, ?it/s]

unsloth.Q5_K_M.gguf:   0%|          | 0.00/2.32G [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Saved GGUF to https://huggingface.co/sai-santhosh/creative-writing-model


No files have been modified since last commit. Skipping to prevent empty commit.


Saved Ollama Modelfile to https://huggingface.co/sai-santhosh/creative-writing-model


In [9]:

from unsloth import FastLanguageModel
model_x, tokenizer_x = FastLanguageModel.from_pretrained(
        model_name = "sai-santhosh/creative-writing-model", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = 2048,
        dtype = None,
        load_in_4bit = True,
    )
FastLanguageModel.for_inference(model_x) # Enable native 2x faster inference

messages = [
    {"role": "user", "content": "Write a beautiful poem about India"},
]
inputs = tokenizer_x.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer_x, skip_prompt = True)
_ = model_x.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 256,
                   use_cache = True, temperature = 1.5, min_p = 0.1)

==((====))==  Unsloth 2025.1.6: Fast Llama patching. Transformers: 4.47.1.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/97.3M [00:00<?, ?B/s]

Unsloth 2025.1.6 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Land of a million hues divine,
Where spices dance and gods entwine,
From the Ganges' sacred streams to the Indus' mighty flow,
India weaves its tale, a beauty that only love can know.

Her streets, a tapestry of colors bright and bold,
With vibrant saris, patterns bold,
The scent of spices fills the air,
A heady mix of cardamom and rose beyond compare.

From the majestic Taj to the Red Forts ancient stand,
Mumbai's bustling streets with energy of a vibrant land,
In Kerala's backwaters, gentle lapping of the tide,
In the mountains of the Himalayas, snow-capped peaks abide.

Her music whispers ancient secrets sweet,
The Raga's ancient melodies that the gods can't be beat,
The scent of street vendors, selling fruits and flowers bright,
India's alchemical essence, weaving the senses through the night.

In her temples, stories of gods and demons unfold,
Of Lord Ganesha's wisdom, the Durga's strength so bold,
Of Buddha's teachings, of Ramayana's eternal quest,
A land of tales, of legends and

In [21]:
%%writefile creative_writing_generator_1.py
import logging
from unsloth import FastLanguageModel
import torch
from transformers import TextStreamer

class CreativeWritingGenerator:
    def __init__(self):
        self.model, self.tokenizer_x = FastLanguageModel.from_pretrained(
            model_name="sai-santhosh/creative-writing-model",
            max_seq_length=2048,
            dtype=None,
            load_in_4bit=True,
        )
        FastLanguageModel.for_inference(self.model)  # Enable native 2x faster inference
        self.logger = logging.getLogger()

        if not torch.cuda.is_available():
            self.logger.warning("CUDA is not available. Please connect to a GPU to utilize the model")

    def generate(self, text, new_tokens=256):
        messages = [
            {"role": "user", "content": f"{text}"}
        ]

        inputs = self.tokenizer_x.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,  # Must add for generation
            return_tensors="pt",
        ).to("cuda")

        text_streamer = TextStreamer(self.tokenizer_x, skip_prompt=True)
        _ = self.model.generate(
            input_ids=inputs,
            streamer=text_streamer,
            max_new_tokens=new_tokens,
            use_cache=True,
            temperature=1.5,
            min_p=0.1,
        )


Writing creative_writing_generator_1.py


In [22]:
from creative_writing_generator_1 import CreativeWritingGenerator

generator = CreativeWritingGenerator()
generator.generate("Write a fictional story about space explorations",new_tokens = 1024)

==((====))==  Unsloth 2025.1.6: Fast Llama patching. Transformers: 4.47.1.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Captain Orion gazed out at the endless void of the cosmos from the cockpit of the Stardust Explorer, feeling the weight of millennia past bear down on him. The mission had started 500 lightyears from Earth, when his generation of space travelers had decided the universe was ripe for exploitation, but Orion's father had warned, the stars would never be the same.

The stars werent the same. They had grown dim and faint, as if they too could hear the cry of a world long lost, and as the Explorer cut deeper into the void, it wasnt the glow that