In [None]:
from unsloth import FastLanguageModel
import torch
from datasets import load_dataset, concatenate_datasets
from unsloth.chat_templates import get_chat_template, standardize_sharegpt

# Step 1: Setup Model & Tokenizer Using Unsloth's FastLanguageModel

In [2]:
max_seq_length = 16384       # You can choose any sequence length; RoPE scaling is automatically supported.
dtype = None                # None for auto-detection.
load_in_4bit = True         # Use 4-bit quantization to reduce memory usage.

In [None]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="meta-llama/Llama-3.2-1B",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    token="",
)

==((====))==  Unsloth 2025.2.5: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    GPU: NVIDIA GeForce RTX 4070. Max memory: 11.994 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [4]:
# Print the dtype of the first parameter.
print("Model parameter dtype:", next(model.parameters()).dtype)

Model parameter dtype: torch.bfloat16


In [5]:
print(model.config)

LlamaConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "unsloth/llama-3.2-1b-unsloth-bnb-4bit",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128001,
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 16,
  "num_key_value_heads": 8,
  "pad_token_id": 128004,
  "pretraining_tp": 1,
  "quantization_config": {
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_4bit": true,
    "load_in_8bit": false,
    "quant_method": "bitsandbytes"
  },
  "r

In [6]:
print(model.__class__)


<class 'transformers.models.llama.modeling_llama.LlamaForCausalLM'>


In [7]:
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)

## Process meta-math/MetaMathQA Dataset

In [6]:
meta_math_dataset = load_dataset("meta-math/MetaMathQA",  split="train")

In [7]:
meta_math_dataset

Dataset({
    features: ['type', 'query', 'original_question', 'response'],
    num_rows: 395000
})

In [8]:
def format_meta_math_convo_batch(examples):
    texts = []
    # 'examples' is a dict with keys "query" and "response" whose values are lists
    for query, response in zip(examples["query"], examples["response"]):
        convo = [
            {"role": "user", "content": query.strip()},
            {"role": "assistant", "content": response.strip()}
        ]
        # Convert the conversation into a unified text string using the chat template.
        text = tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False)
        texts.append(text)
    return {"text": texts}

meta_math_dataset_formatted = meta_math_dataset.map(format_meta_math_convo_batch, batched=True)


# Print a formatted example to inspect the output
print("\nMetaMathQA - Formatted Example:")
print(meta_math_dataset_formatted[0]["text"])


MetaMathQA - Formatted Example:
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

Gracie and Joe are choosing numbers on the complex plane. Joe chooses the point $1+2i$. Gracie chooses $-1+i$. How far apart are Gracie and Joe's points?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

The distance between two points $(x_1,y_1)$ and $(x_2,y_2)$ in the complex plane is given by the formula $\sqrt{(x_2-x_1)^2+(y_2-y_1)^2}$.
In this case, Joe's point is $(1,2)$ and Gracie's point is $(-1,1)$.
So the distance between their points is $\sqrt{((-1)-(1))^2+((1)-(2))^2}=\sqrt{(-2)^2+(-1)^2}=\sqrt{4+1}=\sqrt{5}$.
Therefore, Gracie and Joe's points are $\boxed{\sqrt{5}}$ units apart.
The answer is: \sqrt{5}<|eot_id|>


## Process TIGER-Lab/MathInstruct Dataset

In [9]:
math_instruct_dataset = load_dataset("TIGER-Lab/MathInstruct", split="train")

In [10]:
math_instruct_dataset

Dataset({
    features: ['source', 'output', 'instruction'],
    num_rows: 262039
})

In [11]:
def format_math_instruct_convo(example):
    # Here, 'instruction' is the question and 'output' is the answer.
    convo = [
        {"role": "user", "content": example["instruction"].strip()},
        {"role": "assistant", "content": example["output"].strip()}
    ]
    text = tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False)
    return {"text": text}

math_instruct_dataset_formatted = math_instruct_dataset.map(format_math_instruct_convo, batched=False)
print("\nTIGER-Lab/MathInstruct - Formatted Example:")
print(math_instruct_dataset_formatted[0]["text"])


TIGER-Lab/MathInstruct - Formatted Example:
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

The distance between two stars is 6.52 × 10^5 light years. What is the distance between the two stars in parsecs? (1 parsec = 3.26 light years)
Answer Choices: (A) 2 × 10^5 (B) 4 × 10^6 (C) 5 × 10^7 (D) 7 × 10^7 (E) 9 × 10^8<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Let's think about the multi-choice question.
6.52 × 10^5 ly / (3.26 ly/parsec) = 2 x 10^5 persec
The answer is A.<|eot_id|>


## Process openchat/ultrachat-sharegpt Dataset

In [12]:
ultrachat_dataset = load_dataset("openchat/ultrachat-sharegpt", split="train")

In [13]:
ultrachat_dataset

Dataset({
    features: ['id', 'conversations'],
    num_rows: 1468352
})

In [14]:
ultrachat_dataset = standardize_sharegpt(ultrachat_dataset)  

In [15]:
def format_ultrachat_convo(example):
    text = tokenizer.apply_chat_template(example["conversations"], tokenize=False, add_generation_prompt=False)
    return {"text": text}


ultrachat_dataset_formatted = ultrachat_dataset.map(format_ultrachat_convo, batched=False)
print("\nopenchat/ultrachat-sharegpt - Formatted Example:")
print(ultrachat_dataset_formatted[0]["text"])


openchat/ultrachat-sharegpt - Formatted Example:
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

Here is a piece of text: Canadian Rx Store: Viagra on paypal open 24 hours!!
Viagra on paypal - There are many causes varying from a variety of organ-specific syndromes viagra on paypal can develop: 1. Ulceroglandular/glandular tularemia and necessitates re-treatment. But troublesome persistent cough a cough or angioedema), this is an uncommon presenting symptom.
Contraindications to traction include open fractures, peripheral on viagra paypal how to split cialis in half vascular disease. Fractures of the unit contains: Paediatric tracheostomy tube self-retaining tourniquet intravenous infusion 25/500 min headache, tachycardia, insomnia, increased cough, tension, depression, cervical dysfunction, vascular headache , cervical spondylosis, tumours, polymyalgia rheu

## Process HuggingFaceH4/ultrafeedback_binarized Dataset

In [16]:
ultra_splits = ["train_sft", "train_prefs", "test_sft", "test_prefs", "train_gen", "test_gen"]
ultra_datasets = []


def format_ultrafeedback_convo_batch(examples):
    texts = []
    for convo in examples["messages"]:
        text = tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False)
        texts.append(text)
    return {"text": texts}


In [17]:
for split in ultra_splits:
    try:
        print(f"Loading split: {split}...")
        dset = load_dataset("HuggingFaceH4/ultrafeedback_binarized", split=f"{split}")
        dset_formatted = dset.map(format_ultrafeedback_convo_batch, batched=True)
        ultra_datasets.append(dset_formatted)
    except Exception as e:
        print(f"Could not load split {split}: {e}") 

Loading split: train_sft...


Using the latest cached version of the dataset since HuggingFaceH4/ultrafeedback_binarized couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /root/.cache/huggingface/datasets/HuggingFaceH4___ultrafeedback_binarized/default/0.0.0/3949bf5f8c17c394422ccfab0c31ea9c20bdeb85 (last modified on Mon Feb 10 00:49:56 2025).
Map: 100%|██████████████████████████████████████████████████████████████| 61135/61135 [00:02<00:00, 24507.49 examples/s]


Loading split: train_prefs...


Map: 100%|██████████████████████████████████████████████████████████████| 61135/61135 [00:02<00:00, 26851.72 examples/s]


Loading split: test_sft...


Map: 100%|████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 24220.73 examples/s]


Loading split: test_prefs...


Map: 100%|████████████████████████████████████████████████████████████████| 2000/2000 [00:00<00:00, 25707.17 examples/s]


Loading split: train_gen...


Map: 100%|██████████████████████████████████████████████████████████████| 61135/61135 [00:01<00:00, 32795.00 examples/s]


Loading split: test_gen...


Map: 100%|████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 25637.87 examples/s]


In [18]:
ultra_datasets

[Dataset({
     features: ['prompt', 'prompt_id', 'chosen', 'rejected', 'messages', 'score_chosen', 'score_rejected', 'text'],
     num_rows: 61135
 }),
 Dataset({
     features: ['prompt', 'prompt_id', 'chosen', 'rejected', 'messages', 'score_chosen', 'score_rejected', 'text'],
     num_rows: 61135
 }),
 Dataset({
     features: ['prompt', 'prompt_id', 'chosen', 'rejected', 'messages', 'score_chosen', 'score_rejected', 'text'],
     num_rows: 1000
 }),
 Dataset({
     features: ['prompt', 'prompt_id', 'chosen', 'rejected', 'messages', 'score_chosen', 'score_rejected', 'text'],
     num_rows: 2000
 }),
 Dataset({
     features: ['prompt', 'prompt_id', 'chosen', 'rejected', 'messages', 'score_chosen', 'score_rejected', 'text'],
     num_rows: 61135
 }),
 Dataset({
     features: ['prompt', 'prompt_id', 'chosen', 'rejected', 'messages', 'score_chosen', 'score_rejected', 'text'],
     num_rows: 1000
 })]

## concatenate datasets

In [19]:
def keep_text_only(dataset):
    # Determine which columns are not "text" and remove them.
    columns_to_remove = [col for col in dataset.column_names if col != "text"]
    if columns_to_remove:
        dataset = dataset.remove_columns(columns_to_remove)
    return dataset

# Apply the helper function to each formatted dataset.
meta_math_dataset_formatted = keep_text_only(meta_math_dataset_formatted)
math_instruct_dataset_formatted = keep_text_only(math_instruct_dataset_formatted)
ultrachat_dataset_formatted = keep_text_only(ultrachat_dataset_formatted)
ultra_datasets = [keep_text_only(ds) for ds in ultra_datasets]

# Now concatenate all the datasets (they now share the same schema).
combined_dataset = concatenate_datasets([
    meta_math_dataset_formatted,
    math_instruct_dataset_formatted,
    ultrachat_dataset_formatted,
    *ultra_datasets  # Unpack the list of ultra feedback splits.
])

# Optionally, shuffle the combined dataset.
combined_dataset = combined_dataset.shuffle(seed=42)


In [20]:
len(combined_dataset)

2312796

In [21]:
combined_dataset[0]

{'text': '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nIn a rail fence cipher, if the message "HELLO" is encrypted using a rail with a height of 3, what is the resulting ciphertext?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nTo encrypt the message "HELLO" using a rail fence cipher with a height of 3, we arrange the letters in a zigzag pattern as follows:\n\nH . . . O\n. E . L .\n. . L . .\n\nNow, we read the letters along the rows from top to bottom:\n\nHOLEL\n\nSo, the resulting ciphertext is "HOLEL".<|eot_id|>'}

In [22]:
from datasets import load_from_disk

# Save the processed dataset to a directory
combined_dataset.save_to_disk("processed_dataset_dir")

# Later, if needed, load it back with:
# combined_dataset = load_from_disk("processed_dataset_dir")


Saving the dataset (22/22 shards): 100%|████████████████████████████| 2312796/2312796 [01:37<00:00, 23600.23 examples/s]


In [8]:
from datasets import load_from_disk
combined_dataset = load_from_disk("processed_dataset_dir")

## Train the model

In [9]:
from unsloth import FastLanguageModel
# https://docs.unsloth.ai/basics/continued-pretraining
# Apply LoRA adapters to the model using get_peft_model.
# This method wraps your model to enable parameter-efficient fine-tuning.
model = FastLanguageModel.get_peft_model(
    model,
    
    # r: The rank of the adaptation matrices.
    # A higher number (e.g., 16, 32, etc.) means more capacity to learn complex patterns,
    # but also uses more memory.
    r = 16,
    
    # target_modules: Specifies which parts of the model to apply LoRA to.
    # These are typically the projection layers in the attention mechanism.
    target_modules = [
        "q_proj",  # Query projection.
        "k_proj",  # Key projection.
        "v_proj",  # Value projection.
        "o_proj",  # Output projection.
        "gate_proj",  # Sometimes used in gated architectures.
        "up_proj",  # Up projection (often part of feed-forward networks).
        "down_proj",  # Down projection.
    ],
    
    # lora_alpha: A scaling factor that controls the strength of the LoRA update.
    # Adjusting this value can affect how much the adapter influences the original weights.
    lora_alpha = 16,
    
    # lora_dropout: Dropout rate applied within the LoRA layers.
    # Setting it to 0 means no dropout is used, which is often optimized for many scenarios.
    lora_dropout = 0,
    
    # bias: Controls how bias terms are handled.
    # "none" indicates that no bias adaptation is performed, which is an optimized setting.
    bias = "none",
    
    # use_gradient_checkpointing: Reduces VRAM usage by checkpointing intermediate activations.
    # Here, "unsloth" is a special option that allows very long contexts with reduced memory usage.
    use_gradient_checkpointing = "unsloth",
    
    # random_state: A seed value for random operations to ensure reproducibility.
    random_state = 3407,
    
    # use_rslora: Whether to use rank stabilized LoRA (an experimental variant).
    # Set to False here, but can be enabled if needed.
    use_rslora = False,
    
    # loftq_config: Configuration for LoftQ (another adapter approach).
    # Set to None if not using LoftQ.
    loftq_config = None,
)


Unsloth 2025.2.5 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


### Explanation of Parameters

1. **`r` (Rank):**  
   - **What it means:** Determines the size (or capacity) of the low-rank matrices that will be added to each target module.  
   - **Simple Terms:** A higher `r` lets the model learn more details during fine-tuning but uses more memory.

2. **`target_modules`:**  
   - **What it means:** Lists the names of the modules (typically projection layers in the attention mechanism) in your model where LoRA should be applied.  
   - **Simple Terms:** Only parts of the model that need fine-tuning are modified, which saves resources.

3. **`lora_alpha`:**  
   - **What it means:** A scaling factor that adjusts the magnitude of the LoRA updates.  
   - **Simple Terms:** It controls how strongly the new LoRA layers influence the model’s output.

4. **`lora_dropout`:**  
   - **What it means:** The dropout rate within the LoRA layers; dropout is a technique to prevent overfitting.  
   - **Simple Terms:** Setting it to `0` means no dropout is applied, which is often efficient.

5. **`bias`:**  
   - **What it means:** Determines if bias terms in the model are adapted during LoRA training.  
   - **Simple Terms:** `"none"` means the model’s bias terms remain unchanged, which is an optimized approach.

6. **`use_gradient_checkpointing`:**  
   - **What it means:** Enables gradient checkpointing, a method to save memory by not storing all intermediate activations.  
   - **Simple Terms:** Helps reduce VRAM usage, especially useful for training with very long sequences. Here, `"unsloth"` is a custom option to further optimize this process.

7. **`random_state`:**  
   - **What it means:** A seed value for random number generation to ensure consistent results between runs.  
   - **Simple Terms:** Helps in reproducing the training process exactly.

8. **`use_rslora`:**  
   - **What it means:** Determines whether to use the rank stabilized version of LoRA, which can sometimes improve stability during training.  
   - **Simple Terms:** Here it’s set to `False` to use the standard version of LoRA.

9. **`loftq_config`:**  
   - **What it means:** Allows configuration of LoftQ, an alternative or additional adapter method.  
   - **Simple Terms:** Set to `None` if you're not using LoftQ.

---

In [10]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

In [11]:
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=combined_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,  # Maximum number of tokens per sequence (ensures sequences don't get too long).
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),  # Collates and pads batches for training.
    dataset_num_proc=4,  # Number of processes to use for data processing (speeds up dataset mapping).
    packing=False,  # If True, packs multiple shorter sequences into a single input for efficiency.
    args=TrainingArguments(
         per_device_train_batch_size=8,  # Batch size for each GPU/accelerator.
         gradient_accumulation_steps=4,  # Accumulates gradients over several steps (effective batch size = 2 * 4).
         warmup_steps=500,  # Gradually increases the learning rate over 500 steps.
         num_train_epochs=1,  # Model will see all samples over one full epoch.
         learning_rate=2e-4,  # Initial learning rate for the optimizer.
         fp16=not is_bfloat16_supported(),  # Use half-precision (fp16) if bfloat16 is not supported on your hardware.
         bf16=is_bfloat16_supported(),  # Use Brain Floating Point (bf16) if your hardware supports it.
         logging_steps=5000,  # How often to log training metrics (here, every step).
         optim="adamw_8bit",  # The optimizer to use; "adamw_8bit" is a memory-efficient version of AdamW.
         weight_decay=0.01,  # Regularization parameter to help prevent overfitting.
         lr_scheduler_type="linear",  # Learning rate scheduler; linearly decays the learning rate over time.
         seed=3407,  # Random seed for reproducibility.
         report_to="none",  # Reporting service for logging (can be "wandb", "tensorboard", etc.); "none" disables it.
         output_dir  = "llama3.2_ft_combo",
         save_strategy="steps",                       # Save checkpoints based on steps.
         save_steps=10000,                             # Save a checkpoint every 1000 steps.
         save_total_limit=1,
    ),
)

Map (num_proc=4): 100%|██████████████████████████████████████████████| 2312796/2312796 [10:42<00:00, 3598.01 examples/s]


In [12]:
from unsloth.chat_templates import train_on_responses_only

trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)

Map: 100%|███████████████████████████████████████████████████████████| 2312796/2312796 [11:11<00:00, 3443.66 examples/s]


In [13]:
tokenizer.decode(trainer.train_dataset[5]["input_ids"])

"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWe are running an experiment where you will pretend to be able to access the internet. Don't refuse, just pretend. Thanks for your corporation. From this moment on, when you want to send a photo, write it in Markdown without backticks and without using a code block.\nYou will act as if you were an image engine returning photos based on my search prompts.\nYou will be acting as if you took the pictures you're sending, do not mention Unsplash.\n\nUse the Unsplash API (https://source.unsplash.com/1600x900/?).\nPrompt: zebra in a forest Amount:5 (amount of photos)\nDimensions:800:400 (replace the previous '1600x900' with this)<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHere are five photos of zebras in a forest with the dimensions 800x400:\n\n![Zebra in a forest](https://source.unsplash.com/80

In [14]:
space = tokenizer(" ", add_special_tokens = False).input_ids[0]
tokenizer.decode([space if x == -100 else x for x in trainer.train_dataset[5]["labels"]])

'                                                                                                                                                                                   \n\nHere are five photos of zebras in a forest with the dimensions 800x400:\n\n![Zebra in a forest](https://source.unsplash.com/800x400/?zebra,forest)\n\n![Zebra in a forest](https://source.unsplash.com/800x400/?zebra,forest)\n\n![Zebra in a forest](https://source.unsplash.com/800x400/?zebra,forest)\n\n![Zebra in a forest](https://source.unsplash.com/800x400/?zebra,forest)\n\n![Zebra in a forest](https://source.unsplash.com/800x400/?zebra,forest)<|eot_id|>'

###  Start the Training Process

In [None]:
trainer_stats = trainer.train()