In [1]:
%%capture
# The %%capture magic above captures this cell's output, keeping pip's logs out of the notebook.
# Step 1: install the released Unsloth package together with xformers pinned to 0.0.28.post2.
!pip install unsloth "xformers==0.0.28.post2"
# Also get the latest nightly Unsloth!
# Step 2: remove the release installed in step 1 and reinstall Unsloth (with its
# "colab-new" extra) straight from the GitHub repository, bypassing pip's cache.
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [2]:
from unsloth import FastLanguageModel
import torch
# --- Loading configuration ---------------------------------------------------
# Sequence length used when loading the model (and reused by the trainer later).
# Any value works; RoPE scaling is handled internally by Unsloth.
max_seq_length = 2048
# None lets Unsloth auto-detect: Float16 for Tesla T4 / V100, Bfloat16 for Ampere+.
dtype = None
# 4-bit quantization reduces memory usage; can be set to False.
load_in_4bit = True

# Reference list only (not consumed below): pre-quantized 4-bit checkpoints that
# download about 4x faster and avoid OOMs. More at https://huggingface.co/unsloth
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",       # Llama-3.1 2x faster
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Mistral-Small-Instruct-2409",      # Mistral 22b 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",            # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-27b-bnb-4bit",             # Gemma 2x faster!
    "unsloth/Llama-3.2-1B-bnb-4bit",            # NEW! Llama 3.2 models
    "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
]

# Reference list only (not consumed below): Qwen 2.5 checkpoints.
# More at https://huggingface.co/unsloth
qwen_models = [
    "unsloth/Qwen2.5-Coder-32B-Instruct",       # Qwen 2.5 Coder 2x faster
    "unsloth/Qwen2.5-Coder-7B",
    "unsloth/Qwen2.5-14B-Instruct",             # 14B fits in a 16GB card
    "unsloth/Qwen2.5-7B",
    "unsloth/Qwen2.5-72B-Instruct",             # 72B fits in a 48GB card
]

# --- Load the base model and its tokenizer -----------------------------------
# Pass token="hf_..." as well when using gated models such as meta-llama/Llama-2-7b-hf.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Qwen2.5-Coder-1.5B-Instruct",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

Unsloth: Patching Xformers to fix some performance issues.
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: Failed to patch Gemma3ForConditionalGeneration.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.19: Fast Qwen2 patching. Transformers: 4.51.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.14G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/265 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.51k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [3]:
# Projection layers that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the base model with LoRA adapters. Note this rebinds `model`, so
# re-running the cell applies get_peft_model to the already-wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r=32,                                  # Any number > 0; suggested 8, 16, 32, 64, 128
    target_modules=lora_target_modules,
    lora_alpha=64,
    lora_dropout=0,                        # Any value is supported, but 0 is optimized
    bias="none",                           # Any value is supported, but "none" is optimized
    # "unsloth" uses 30% less VRAM and fits 2x larger batch sizes;
    # True or "unsloth" are the options for very long context.
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,                      # Rank-stabilized LoRA is supported
    loftq_config=None,                     # So is LoftQ
)

Unsloth 2025.3.19 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


<a name="Data"></a>
### Data Prep
We now use the `Qwen-2.5` format for conversation style finetunes. But we convert it to HuggingFace's normal multiturn format `("role", "content")` instead of `("from", "value")`. Qwen renders multi-turn conversations like below:

```
<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
What is 2+2?<|im_end|>
<|im_start|>assistant
It's 4.<|im_end|>

```

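Unsloth's `get_chat_template` function can attach the correct chat template to a tokenizer. We support `zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, phi3, llama3` and more. In this notebook the tokenizer loaded above already renders the Qwen-2.5 format, so the next cell calls `tokenizer.apply_chat_template` directly.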

In [None]:
def create_conversation_format(examples):
    """Turn a batch of prompt/response pairs into two-turn chat conversations.

    Args:
        examples: A batch mapping with parallel sequences under the keys
            "user_prompt" and "expected_response".

    Returns:
        A dict with a single key, "conversations", holding one list per row.
        Each list contains a user message followed by an assistant message,
        both in the ``{"role": ..., "content": ...}`` format.

    No system message is added; only the user/assistant turns present in the
    data are emitted.
    """
    prompts = examples["user_prompt"]
    responses = examples["expected_response"]
    conversations = [
        [
            {"role": "user", "content": prompt},
            {"role": "assistant", "content": responses[row]},
        ]
        for row, prompt in enumerate(prompts)
    ]
    return {"conversations": conversations}

def formatting_prompts_func(examples):
    """Render every conversation in a batch into one chat-template string.

    Args:
        examples: A batch mapping whose "conversations" entry is a sequence of
            message lists (``{"role": ..., "content": ...}`` dicts).

    Returns:
        A dict with a single key, "text", holding one rendered string per
        conversation.

    Relies on the notebook-level ``tokenizer``. The template is applied with
    ``tokenize=False`` (strings, not token ids) and
    ``add_generation_prompt=False`` (no trailing assistant header is appended).
    """
    rendered = []
    for conversation in examples["conversations"]:
        rendered.append(
            tokenizer.apply_chat_template(
                conversation,
                tokenize = False,
                add_generation_prompt = False,
            )
        )
    return {"text": rendered}


import json
from datasets import load_dataset

# Load the custom dataset from the local JSON file via the "json" loader.
# Only the load itself is guarded, so a failure in the follow-up prints can
# never be misreported as a loading error.
try:
    dataset = load_dataset("json", data_files="data.json", split="train")
except Exception as e:
    # Print a readable hint, then re-raise the same exception; a bare `raise`
    # keeps the original traceback intact.
    print(f"Error loading dataset: {e}")
    raise

print("Dataset loaded successfully:")
print(dataset)

# Build the "conversations" column. The original columns are dropped so that
# only the chat-formatted data remains.
dataset = dataset.map(create_conversation_format, batched=True, remove_columns=list(dataset.features))

# Render each conversation with the tokenizer's chat template into a "text" column.
dataset = dataset.map(formatting_prompts_func, batched=True,)

Generating train split: 0 examples [00:00, ? examples/s]

Dataset loaded successfully:
Dataset({
    features: ['user_prompt', 'expected_response'],
    num_rows: 114
})


Map:   0%|          | 0/114 [00:00<?, ? examples/s]

Map:   0%|          | 0/114 [00:00<?, ? examples/s]

We now use `standardize_sharegpt` to convert ShareGPT style datasets into HuggingFace's generic format. This changes the dataset from looking like:
```
{"from": "system", "value": "You are an assistant"}
{"from": "human", "value": "What is 2+2?"}
{"from": "gpt", "value": "It's 4."}
```
to
```
{"role": "system", "content": "You are an assistant"}
{"role": "user", "content": "What is 2+2?"}
{"role": "assistant", "content": "It's 4."}
```

In [5]:
from unsloth.chat_templates import standardize_sharegpt
# Normalise the conversations with standardize_sharegpt, then regenerate the
# "text" column from the result in a single chained expression.
# NOTE(review): the conversations were already built with "role"/"content"
# keys in the data-prep cell, so this pass may be redundant — confirm.
dataset = standardize_sharegpt(dataset).map(formatting_prompts_func, batched = True,)

Unsloth: Standardizing formats (num_proc=2):   0%|          | 0/114 [00:00<?, ? examples/s]

Map:   0%|          | 0/114 [00:00<?, ? examples/s]

We look at how the conversations are structured for item 0:

In [6]:
# Show the structured (role/content) form of the first record, handling the
# empty-dataset case up front.
if len(dataset) == 0:
    print("Dataset is empty after processing.")
else:
    print("Original conversation structure (item 0):")
    print(dataset[0]["conversations"])

Original conversation structure (item 0):
[{'content': 'Write the standard header for a simple 6502 program, including basic IO macros, constants, and the program start structure.', 'role': 'user'}, {'content': '; Standard 6502 Program Header and Structure\n\n; --- Constant Definitions (using =) ---\nNEWLINE = $0A      ; ASCII code for Line Feed\nNULL_TERM = $00    ; Null terminator for strings\nMAX_LOOPS = 5      ; Example constant for loop count\nTRUE = $FF         ; Define true value\nFALSE = $00        ; Define false value\n\n; --- Opcode Definitions (Simulated I/O) ---\n; These represent system calls or hardware interaction\nprintnum_opcode   = $72 ; Prints number in Accumulator (Decimal)\nprintchar_opcode  = $B2 ; Prints character in Accumulator (ASCII)\nendprog_opcode    = $D2 ; Halts program execution\n\n; --- Standard Macros ---\n.macro printnum\n    .byte printnum_opcode\n.endmacro\n.macro printchar\n    .byte printchar_opcode\n.endmacro\n.macro endprog\n    .byte endprog_opc

And we see how the chat template transformed these conversations.

In [7]:
# Show the chat-template rendering of the first record, handling the
# empty-dataset case up front.
if len(dataset) == 0:
    print("Dataset is empty, cannot show formatted text.")
else:
    print("\nFormatted text with chat template (item 0):")
    print(dataset[0]["text"])


Formatted text with chat template (item 0):
<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
Write the standard header for a simple 6502 program, including basic IO macros, constants, and the program start structure.<|im_end|>
<|im_start|>assistant
; Standard 6502 Program Header and Structure

; --- Constant Definitions (using =) ---
NEWLINE = $0A      ; ASCII code for Line Feed
NULL_TERM = $00    ; Null terminator for strings
MAX_LOOPS = 5      ; Example constant for loop count
TRUE = $FF         ; Define true value
FALSE = $00        ; Define false value

; --- Opcode Definitions (Simulated I/O) ---
; These represent system calls or hardware interaction
printnum_opcode   = $72 ; Prints number in Accumulator (Decimal)
printchar_opcode  = $B2 ; Prints character in Accumulator (ASCII)
endprog_opcode    = $D2 ; Halts program execution

; --- Standard Macros ---
.macro printnum
    .byte printnum_opcode
.endmacro
.macro pr

<a name="Train"></a>
### Train the model
Now let's use Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). This notebook trains for one full epoch (`num_train_epochs = 1`); to speed things up, you can instead uncomment `max_steps` in the cell below to cap the number of training steps. We also support TRL's `DPOTrainer`!

The trainer includes our **gradient accumulation bug fix**. Read more about it here: [Blog post](https://unsloth.ai/blog/gradient)

In [8]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

# Mixed precision: bf16 when the GPU supports it, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

# Collator used to batch the tokenized examples.
seq2seq_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer)

# Optimisation and logging settings for the run.
training_args = TrainingArguments(
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 4,   # Fixed major bug in latest Unsloth
    warmup_steps = 5,
    num_train_epochs = 1,              # One full pass; uncomment max_steps below to cap the run
    #max_steps = 30,
    learning_rate = 1e-4,
    fp16 = not use_bf16,
    bf16 = use_bf16,
    logging_steps = 1,
    optim = "paged_adamw_8bit",        # Save more memory
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 3407,
    output_dir = "outputs",
    report_to = "none",                # Use this for WandB etc
)

# Supervised fine-tuning over the pre-rendered "text" column.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = seq2seq_collator,
    dataset_num_proc = 4,
    packing = False,                   # Can make training 5x faster for short sequences.
    args = training_args,
)

Unsloth: Tokenizing ["text"] (num_proc=4):   0%|          | 0/114 [00:00<?, ? examples/s]

We also use Unsloth's `train_on_responses_only` method to only train on the assistant outputs and ignore the loss on the user's inputs.

In [9]:
from unsloth.chat_templates import train_on_responses_only
# Chat-template markers that open a user turn and an assistant turn; they tell
# train_on_responses_only where instructions end and responses begin.
user_turn_marker = "<|im_start|>user\n"
assistant_turn_marker = "<|im_start|>assistant\n"

trainer = train_on_responses_only(
    trainer,
    instruction_part = user_turn_marker,
    response_part = assistant_turn_marker,
)

Map (num_proc=2):   0%|          | 0/114 [00:00<?, ? examples/s]

We verify masking is actually done:

In [10]:
# Sanity-check the response-only masking on the first training example.
# GPU memory statistics are intentionally not reported here: the dedicated
# "Show current memory stats" cell that follows computes and prints them.
if len(trainer.train_dataset) > 0:
    # Fetch the example once rather than re-indexing the dataset per field.
    first_example = trainer.train_dataset[0]

    print("Input IDs (decoded, item 0):")
    print(tokenizer.decode(first_example["input_ids"]))

    # Label positions equal to -100 are swapped for a space token so the
    # remaining labels can be decoded and read as text.
    print("\nLabels (decoded, -100 replaced with space, item 0):")
    space = tokenizer(" ", add_special_tokens = False).input_ids[0]
    print(tokenizer.decode([space if x == -100 else x for x in first_example["labels"]]))
else:
    print("Train dataset is empty, cannot verify masking.")

Input IDs (decoded, item 0):
<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
Write the standard header for a simple 6502 program, including basic IO macros, constants, and the program start structure.<|im_end|>
<|im_start|>assistant
; Standard 6502 Program Header and Structure

; --- Constant Definitions (using =) ---
NEWLINE = $0A      ; ASCII code for Line Feed
NULL_TERM = $00    ; Null terminator for strings
MAX_LOOPS = 5      ; Example constant for loop count
TRUE = $FF         ; Define true value
FALSE = $00        ; Define false value

; --- Opcode Definitions (Simulated I/O) ---
; These represent system calls or hardware interaction
printnum_opcode   = $72 ; Prints number in Accumulator (Decimal)
printchar_opcode  = $B2 ; Prints character in Accumulator (ASCII)
endprog_opcode    = $D2 ; Halts program execution

; --- Standard Macros ---
.macro printnum
    .byte printnum_opcode
.endmacro
.macro printchar
    .byt

We can see the System and Instruction prompts are successfully masked!

In [11]:
#@title Show current memory stats
# Record the pre-training baseline: device properties, peak reserved memory so
# far, and total device memory, both expressed in GiB rounded to 3 decimals.
BYTES_PER_GIB = 1024 ** 3

gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GIB, 3)
max_memory = round(gpu_stats.total_memory / BYTES_PER_GIB, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.741 GB.
1.711 GB of memory reserved.


We fixed a major gradient accumulation bug in all trainers. See [blog](https://unsloth.ai/blog/gradient) for more details.

In [12]:
# Run the fine-tuning loop and keep the returned stats object; the memory/time
# report cell reads trainer_stats.metrics["train_runtime"] from it.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 114 | Num Epochs = 1 | Total steps = 28
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 4 x 1) = 4
 "-____-"     Trainable parameters = 36,929,536/5,000,000,000 (0.74% trained)


Step,Training Loss
1,1.8435
2,1.6514
3,1.7958
4,1.7497
5,1.4157
6,1.426
7,1.2237
8,1.2437
9,0.9533
10,1.0287


Unsloth: Will smartly offload gradients to save VRAM!


In [13]:
# Summarise training cost: wall-clock time plus peak reserved GPU memory, both
# in absolute terms and relative to the baseline recorded before training.
train_runtime = trainer_stats.metrics["train_runtime"]

used_memory = round(torch.cuda.max_memory_reserved() / (1024 ** 3), 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

print(f"{train_runtime} seconds used for training.")
print(f"{round(train_runtime/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

59.2991 seconds used for training.
0.99 minutes used for training.
Peak reserved memory = 2.926 GB.
Peak reserved memory for training = 1.215 GB.
Peak reserved memory % of max memory = 19.849 %.
Peak reserved memory for training % of max memory = 8.242 %.


<a name="Inference"></a>
### Inference
Let's run the model! You can change the instruction and input - leave the output blank!

 We use a `TextStreamer` for continuous inference - so you can see the generation token by token, instead of waiting the whole time!

In [18]:
from transformers import TextStreamer

# Switch the model to Unsloth's native (2x faster) inference mode.
FastLanguageModel.for_inference(model)

# Single-turn chat asking for a small 6502 program.
messages = [
  {"role": "user", "content": "Write me a simple 6502 assembly program that computes 21 + 9"}
]

# Tokenize with the chat template. add_generation_prompt=True is required so
# the prompt ends where the assistant's reply should begin.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True,
    return_tensors = "pt",
).to("cuda")

# Stream tokens to the output as they are generated, without echoing the prompt.
text_streamer = TextStreamer(tokenizer, skip_prompt = True)

generation_options = dict(
    input_ids = inputs,
    streamer = text_streamer,
    max_new_tokens = 1280,
    use_cache = True,
    temperature = 0.3,
    min_p = 0.9,
)
_ = model.generate(**generation_options)

; Add Two Numbers (6502 Assembly)

; --- Opcode Definitions ---
printnum_opcode   = $72
printchar_opcode  = $B2
endprog_opcode    = $D2

; --- Macros ---
.macro printnum
    .byte printnum_opcode
.endmacro
.macro printchar
    .byte printchar_opcode
.endmacro
.macro endprog
    .byte endprog_opcode
.endmacro

        .org $0801
start:
    LDX #$FF           ; Initialize stack pointer
    TXS
    LDA #21            ; Load A with 21
    ADC #9             ; Add 9 to A (21 + 9 = 30)
    printnum           ; Print 30
    endprog

; No data storage needed in program area
        .end<|im_end|>


In [21]:
from transformers import TextStreamer

# Make sure the model is in Unsloth's native (2x faster) inference mode.
FastLanguageModel.for_inference(model)

# Single-turn chat asking a factual question about the 6502 instruction set.
messages = [
    {"role": "user", "content": "What 6502 instructions can be used for addition?"}
]

# Render and tokenize the chat. The generation prompt must be appended so the
# model continues as the assistant.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True,
    return_tensors = "pt",
)
inputs = inputs.to("cuda")

# Print the reply token by token, skipping the prompt itself.
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(
    input_ids = inputs,
    streamer = text_streamer,
    max_new_tokens = 640,
    use_cache = True,
    temperature = 0.2,
    min_p = 0.9,
)

The 6502 instruction set includes several instructions designed for performing arithmetic operations, including addition. These include:

- `ADC (Add with Carry)`: Adds the value of memory at the specified address to the accumulator and sets the carry flag if the sum exceeds 255.
- `SBC (Subtract with Carry)`: Subtracts the value of memory at the specified address from the accumulator and sets the carry flag if the result is negative or zero.

These instructions are essential for implementing basic arithmetic operations in computer programs that use the 6502 architecture.<|im_end|>


### GGUF / llama.cpp Conversion
To save to `GGUF` / `llama.cpp`, we support it natively now! We clone `llama.cpp` and we default save it to `q8_0`. We allow all methods like `q4_k_m`. Use `save_pretrained_gguf` for local saving and `push_to_hub_gguf` for uploading to HF.

Some supported quant methods (full list on our [Wiki page](https://github.com/unslothai/unsloth/wiki#gguf-quantization-options)):
* `q8_0` - Fast conversion. High resource use, but generally acceptable.
* `q4_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K.
* `q5_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K.

[**NEW**] To finetune and auto export to Ollama, try our [Ollama notebook](https://colab.research.google.com/drive/1WZDi7APtQ9VsvOrQSSC5DDtxq159j8iZ?usp=sharing)

In [15]:
# Each export below is toggled by flipping its `if` between True and False.
# For the Hub uploads, create a token at https://huggingface.co/settings/tokens
# and change "hf" in the repo name to your username.

# --- 8-bit Q8_0 (the default quantization) -----------------------------------
if True:
    model.save_pretrained_gguf("model", tokenizer,)
if False:
    model.push_to_hub_gguf("hf/model", tokenizer, token = "")

# --- 16-bit GGUF -------------------------------------------------------------
if False:
    model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
if False:
    model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = "")

# --- q4_k_m GGUF -------------------------------------------------------------
if False:
    model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
if False:
    model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q4_k_m", token = "")

# --- Several quantizations in one upload (much faster than one call each) ----
if False:
    model.push_to_hub_gguf(
        "hf/model", # Change hf to your username!
        tokenizer,
        quantization_method = ["q4_k_m", "q8_0", "q5_k_m",],
        token = "", # Get a token at https://huggingface.co/settings/tokens
    )

Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 1.1G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 6.47 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 28/28 [00:00<00:00, 31.17it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model/pytorch_model.bin...
Done.


Unsloth: Converting qwen2 model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q8_0'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: CMAKE detected. Finalizing some steps for installation.
Unsloth: [1] Converting model at model into q8_0 GGUF format.
The output location will be /content/model/unsloth.Q8_0.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: model
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model part 'pytorch_model.bin'
INFO:hf-to-gguf:token_embd.weight,         torch.float16 --> Q8_0, shape = {1536, 151936}
INFO:hf-to-gguf:blk.0.attn_q.bias,         torch.float16 --> F32, shape = {1536}
INFO:h

100%|██████████| 28/28 [00:00<00:00, 57.10it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model/pytorch_model.bin...
Done.
==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: [1] Converting model at model into f16 GGUF format.
The output location will be /content/model/unsloth.F16.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: model
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model part 'pytorch_model.bin'
INFO:hf-to-gguf:token_embd.weight,         torch.float16 --> F16, shape = {1536, 151936}
INFO:hf-to-gguf:blk.0.attn_q.bias,         torch.float16 --> F32, sha