# Package Installation and Imports
Install required packages including unsloth and flash-attention, and import necessary libraries for the KTO finetuning process.

In [None]:
# Install required packages
%%capture
!pip install unsloth
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

# Install Flash Attention 2 for softcapping support
import torch
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install --no-deps packaging ninja einops "flash-attn>=2.6.3"

# Import necessary libraries
from unsloth import FastLanguageModel, is_bfloat16_supported
import torch
import os
import re
from typing import List, Literal, Optional
from datasets import load_dataset
from trl import KTOConfig, KTOTrainer

# Model Loading and Configuration
Load the pre-trained model and tokenizer using FastLanguageModel, and configure basic parameters like sequence length and quantization settings.

In [None]:
# Model Loading and Configuration

# --- Tunable settings -------------------------------------------------------
load_in_4bit = True  # 4-bit quantization lowers memory usage; may be set to False.
dtype = None  # None = auto-detect. Float16 for Tesla T4 / V100, Bfloat16 for Ampere+.
max_seq_length = 4096  # Any value works; RoPE scaling is handled internally.

# --- Fetch the pre-trained model together with its tokenizer -----------------
model, tokenizer = FastLanguageModel.from_pretrained(
    load_in_4bit=load_in_4bit,
    dtype=dtype,
    max_seq_length=max_seq_length,
    model_name="unsloth/Qwen2.5-1.5B-Instruct",  # Any model works, e.g. mistralai/Mistral-7B-Instruct-v0.2
    # token="hf_...",  # needed for gated models such as meta-llama/Llama-2-7b-hf
)

# --- Install a fallback chat template only when the tokenizer ships none -----
if tokenizer.chat_template is None:
    # Adjacent literals are concatenated by Python; one literal per template line.
    DEFAULT_CHAT_TEMPLATE = (
        "{% for message in messages %}\n"
        "{% if message['role'] == 'user' %}\n"
        "{{ '<|user|>\n' + message['content'] + eos_token }}\n"
        "{% elif message['role'] == 'system' %}\n"
        "{{ '<|system|>\n' + message['content'] + eos_token }}\n"
        "{% elif message['role'] == 'assistant' %}\n"
        "{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n"
        "{% endif %}\n"
        "{% if loop.last and add_generation_prompt %}\n"
        "{{ '<|assistant|>' }}\n"
        "{% endif %}\n"
        "{% endfor %}"
    )
    tokenizer.chat_template = DEFAULT_CHAT_TEMPLATE

==((====))==  Unsloth 2024.12.1: Fast Qwen2 patching. Transformers:4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


# Dataset Preparation and Processing
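Define a helper that renders conversations with the tokenizer's chat template (it is defined here but not called in this cell), then load the KTO dataset and select the first 1,000 training examples for training.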

In [None]:
# Dataset Preparation and Processing


# Function to apply chat template
def apply_chat_template(
    example,
    tokenizer,
    task: Literal["sft", "generation", "rm", "dpo", "kto"] = "sft",
    assistant_prefix="<|assistant|>\n",
):
    """Render the conversations stored in ``example`` with the tokenizer's chat template.

    The rendered strings are written back into ``example`` (which is also returned):

    * ``"sft"`` / ``"generation"``: reads ``example["messages"]`` and writes
      ``example["text"]``. An empty system message is prepended when the first
      message is not a system message; ``"generation"`` additionally asks the
      tokenizer for a generation prompt.
    * ``"rm"``: reads ``example["chosen"]`` / ``example["rejected"]`` and writes
      ``text_chosen`` / ``text_rejected`` (each with a leading system message).
    * ``"dpo"``: writes ``text_prompt`` (system + first user message, with a
      generation prompt) and ``text_chosen`` / ``text_rejected`` (everything
      after the first message, with a leading ``assistant_prefix`` removed).
    * ``"kto"``: writes ``text_chosen`` / ``text_rejected`` built from the first
      user message plus all assistant messages, preceded by
      ``example["system"]`` when that key exists. ``assistant_prefix`` is only
      removed if the rendered text starts with it.

    Args:
        example: Mapping holding the conversation(s) as lists of
            ``{"role": ..., "content": ...}`` dicts.
        tokenizer: Object exposing ``apply_chat_template(messages, tokenize=...,
            add_generation_prompt=...)``.
        task: Which formatting scheme to apply (see above).
        assistant_prefix: Prefix stripped from the start of rendered
            completions in the ``dpo`` and ``kto`` branches.

    Returns:
        The same ``example`` mapping with the text fields added. For ``sft``,
        ``generation`` and ``rm`` the message fields are replaced by copies that
        include the system message; the caller's original lists are not mutated.

    Raises:
        ValueError: If ``task`` is unknown, or the ``chosen`` / ``rejected``
            keys required by ``rm`` / ``dpo`` / ``kto`` are missing.
    """

    def _strip_prefix(s, pattern):
        # re.escape keeps special characters in the prefix (e.g. "|") literal.
        return re.sub(f"^{re.escape(pattern)}", "", s)

    def _with_system(messages):
        # Work on a shallow copy so the caller's list is never modified in place.
        messages = list(messages)
        if messages[0]["role"] != "system":
            messages.insert(0, {"role": "system", "content": ""})
        return messages

    if task in ["sft", "generation"]:
        messages = _with_system(example["messages"])
        # Keep the returned example consistent with the rendered text.
        example["messages"] = messages
        example["text"] = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=(task == "generation")
        )
    elif task == "rm":
        if all(k in example.keys() for k in ("chosen", "rejected")):
            chosen_messages = _with_system(example["chosen"])
            rejected_messages = _with_system(example["rejected"])
            example["chosen"] = chosen_messages
            example["rejected"] = rejected_messages
            example["text_chosen"] = tokenizer.apply_chat_template(chosen_messages, tokenize=False)
            example["text_rejected"] = tokenizer.apply_chat_template(rejected_messages, tokenize=False)
        else:
            raise ValueError(
                f"Could not format example as dialogue for `rm` task! Require `[chosen, rejected]` keys but found {list(example.keys())}"
            )
    elif task == "dpo":
        if all(k in example.keys() for k in ("chosen", "rejected")):
            # Compared to reward modeling, we filter out the prompt, so the text is everything after the last assistant token
            prompt_messages = [[msg for msg in example["chosen"] if msg["role"] == "user"][0]]
            # Insert system message
            if example["chosen"][0]["role"] != "system":
                prompt_messages.insert(0, {"role": "system", "content": ""})
            else:
                prompt_messages.insert(0, example["chosen"][0])
            # TODO: handle case where chosen/rejected also have system messages
            chosen_messages = example["chosen"][1:]
            rejected_messages = example["rejected"][1:]
            example["text_chosen"] = tokenizer.apply_chat_template(chosen_messages, tokenize=False)
            example["text_rejected"] = tokenizer.apply_chat_template(rejected_messages, tokenize=False)
            example["text_prompt"] = tokenizer.apply_chat_template(
                prompt_messages, tokenize=False, add_generation_prompt=True
            )
            example["text_chosen"] = _strip_prefix(example["text_chosen"], assistant_prefix)
            example["text_rejected"] = _strip_prefix(example["text_rejected"], assistant_prefix)
        else:
            raise ValueError(
                f"Could not format example as dialogue for `dpo` task! Require `[chosen, rejected]` keys but found {list(example.keys())}"
            )
    elif task == "kto":
        if all(k in example.keys() for k in ("chosen", "rejected")):
            prompt_messages = [[msg for msg in example["chosen"] if msg["role"] == "user"][0]]
            # List concatenation builds new lists, so the inserts below are local.
            chosen_messages = prompt_messages + [msg for msg in example["chosen"] if msg["role"] == "assistant"]
            rejected_messages = prompt_messages + [msg for msg in example["rejected"] if msg["role"] == "assistant"]
            if "system" in example:
                chosen_messages.insert(0, {"role": "system", "content": example["system"]})
                rejected_messages.insert(0, {"role": "system", "content": example["system"]})
            example["text_chosen"] = _strip_prefix(
                tokenizer.apply_chat_template(chosen_messages, tokenize=False), assistant_prefix
            )
            example["text_rejected"] = _strip_prefix(
                tokenizer.apply_chat_template(rejected_messages, tokenize=False), assistant_prefix
            )
        else:
            raise ValueError(
                f"Could not format example as dialogue for `kto` task! Require `[chosen, rejected]` keys but found {list(example.keys())}"
            )
    else:
        raise ValueError(
            f"Task {task} not supported, please ensure that the provided task is one of {['sft', 'generation', 'rm', 'dpo', 'kto']}"
        )
    return example

# Load the KTO dataset
raw_datasets = load_dataset("trl-lib/kto-mix-14k")
train_dataset = raw_datasets["train"]

# Number of training examples to keep for this quick run; increase it (or train
# on `train_dataset` directly) for a full run.
NUM_TRAIN_EXAMPLES = 1000

# Take the first NUM_TRAIN_EXAMPLES rows. The count is clamped to the dataset
# size so `select` is never handed an out-of-range index.
train_subset = train_dataset.select(range(min(NUM_TRAIN_EXAMPLES, len(train_dataset))))

# Model Training Setup
Configure the LoRA adapters and set up the KTO trainer with appropriate training arguments.

In [None]:
# Model Training Setup

# --- LoRA adapters -----------------------------------------------------------
# Modules that receive LoRA adapters.
lora_target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=lora_target_modules,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    random_state=3407,
    use_gradient_checkpointing="unsloth",
)

# --- KTO training arguments --------------------------------------------------
# Pick the mixed-precision mode once: bf16 when supported, otherwise fp16.
use_bf16 = is_bfloat16_supported()
kto_args = KTOConfig(
    output_dir="outputs",
    seed=42,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    # NOTE(review): the loss logged by the training cell stays at roughly 0.5 for
    # the first 10 steps; worth confirming this learning rate is intended.
    learning_rate=5e-7,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    weight_decay=0.01,
    optim="adamw_8bit",
    bf16=use_bf16,
    fp16=not use_bf16,
    logging_steps=1,
    report_to="none",  # Use this for WandB etc
)

# --- Trainer -----------------------------------------------------------------
# Trains on the subset; pass `train_dataset` instead to use the full split.
kto_trainer = KTOTrainer(
    model=model,
    args=kto_args,
    train_dataset=train_subset,
    processing_class=tokenizer,
)


# Training Execution
Execute the training process with the configured trainer and monitor the training progress.

In [None]:
#@title Show current memory stats
# Bytes per GiB (1024 * 1024 * 1024); used to convert the byte counts below.
BYTES_PER_GIB = 1024 ** 3

gpu_stats = torch.cuda.get_device_properties(0)
# Total memory of device 0, in GiB rounded to three decimals.
max_memory = round(gpu_stats.total_memory / BYTES_PER_GIB, 3)
# Peak memory reserved so far, recorded as the pre-training baseline.
start_gpu_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GIB, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.748 GB.
1.709 GB of memory reserved.


In [None]:
# Run KTO fine-tuning with the trainer configured above. The trainer is set to
# log every step, so the cell output lists the training loss per step.
kto_trainer.train()

Step,Training Loss
1,0.5
2,0.5
3,0.4996
4,0.5016
5,0.4995
6,0.4994
7,0.5012
8,0.4993
9,0.5002
10,0.5022


# Model Saving and Export
Save the trained model in different formats including LoRA adapters and merged model.

In [None]:
# Model Saving and Export

# Opt-in switches for the optional export steps below (all off by default).
SAVE_MERGED_LOCALLY = False  # Set to True to save
PUSH_MERGED_TO_HUB = False  # Set to True to save

# Always save the model and tokenizer locally, into the same directory.
LORA_DIR = "lora_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(LORA_DIR)
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving

# Optionally save the merged model as float16 or int4.
if SAVE_MERGED_LOCALLY:
    model.save_pretrained_merged("merged_model", tokenizer, save_method = "merged_16bit")
    # model.save_pretrained_merged("merged_model", tokenizer, save_method = "merged_4bit")
    # model.save_pretrained_merged("merged_model", tokenizer, save_method = "lora")

# Optionally push the merged model to the HuggingFace Hub.
# save_method can be "merged_16bit", "merged_4bit", or "lora"
if PUSH_MERGED_TO_HUB:
    model.push_to_hub_merged("your_name/model", tokenizer, save_method = "merged_16bit", token = "...")

# Save to GGUF format (for llama.cpp)
if False: # Set to True to save
    from transformers import AutoTokenizer
    model.save_pretrained_merged("merged_model", tokenizer, save_method = "merged_16bit")
    !git clone https://github.com/ggerganov/llama.cpp
    !cd llama.cpp && make
    !python3 llama.cpp/convert.py merged_model/ --outfile model-unsloth.gguf
    # Also supports quantization
    !./llama.cpp/quantize model-unsloth.gguf model-unsloth-Q4_K_M.gguf Q4_K_M

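Now run inference with the fine-tuned model. The cell below uses the model that is still in memory; it does not reload the LoRA adapters saved above, and there is no `False`/`True` switch to set: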

In [None]:
from unsloth.chat_templates import get_chat_template

# Key/role names used by the message dicts passed to the tokenizer.
chatml_mapping = {"role": "role", "content": "content", "user": "user", "assistant": "assistant"}

# Replace the tokenizer with one configured for the "chatml" template.
tokenizer = get_chat_template(
    tokenizer,
    mapping = chatml_mapping,
    chat_template = "chatml",
)

# Prepare the model for generation via Unsloth's `for_inference` helper.
FastLanguageModel.for_inference(model)

# Imported ahead of the function: an unindented import in the middle of the
# body ends the function early and makes the rest of it a syntax error.
from transformers import TextStreamer

def generate_response(message):
    """Print ``message`` and stream the model's reply to stdout.

    Uses the notebook-level ``tokenizer`` and ``model``. The prompt is sent as a
    single user turn with a generation prompt appended, and the input ids are
    moved to the ``"cuda"`` device.

    Args:
        message: The user prompt text.

    Returns:
        The value returned by ``model.generate`` for this prompt.
    """
    print("\n" + "="*50 + "\nQUESTION:\n" + "="*50)
    print(message + "\n")
    print("-"*50 + "\nRESPONSE:\n" + "-"*50)

    messages = [{"content": message, "role": "user"}]
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize = True,
        add_generation_prompt = True,
        return_tensors = "pt"
    ).to("cuda")

    # Stream decoded text as it is generated, without echoing the prompt.
    text_streamer = TextStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True)
    outputs = model.generate(
        input_ids = inputs,
        streamer = text_streamer,
        temperature = 0.1,
        max_new_tokens = 1024,
        use_cache = True
    )
    return outputs

# Test questions
questions = [
    "Q:Question: how old julio cesar chavez when he fought de la hoya I found the following answer on Google: He holds records for most successful consecutive defenses of world titles (27), most title fights (37), most title-fight victories (31) and he is after Joe Louis with (23) for most title defenses won by knockout (21). Is that a correct answer? Yes or no.\nA:",
    # One prompt, written as adjacent string literals (Python concatenates them)
    # because a double-quoted literal cannot span several physical lines.
    (
        "Q:Information: - The Assistant Secretary of Defense for Health Affairs (ASD(HA)) is chartered under United States Department of Defense Directive (DoDD) 5136.1 in 1994. This DoDD states that the ASD(HA) is the principal advisor to the U.S. Secretary of Defense on all \"DoD health policies, programs and activities.\" In addition to exercising oversight of all DoD health resources, ASD(HA) serves as director of the Tricare Management Activity. - The Department of the Air Force (DAF) is one of the three Military Departments within the Department of Defense of the United States of America. The Department of the Air Force was formed on September 18, 1947, per the National Security Act of 1947 and it includes all elements and units of the United States Air Force (USAF). - The Surgeon General of the Air Force is the senior-most Medical Service officer in the United States Department of the Air Force. In recent times, this has been a Lieutenant General who serves as head of the United States Air Force Medical Service (AFMS). The Surgeon General is usually the senior Medical Corps officer, but acting surgeons general have been from other branches of the medical service. - Lieutenant general, lieutenant-general and similar (abbrev Lt Gen, LTG and similar) is a three-star military rank (NATO code OF-8) used in many countries. The rank traces its origins to the Middle Ages, where the title of lieutenant general was held by the second in command on the battlefield, who was normally subordinate to a captain general. - The United States Air Force (USAF) is the aerial warfare service branch of the United States Armed Forces and one of the seven American uniformed services. Initially part of the United States Army, the USAF was formed as a separate branch of the military on 18 September 1947 under the National Security Act of 1947. It is the most recent branch of the U.S. military to be formed, and is the largest and one of the world's most technologically advanced air forces. "
        "The USAF articulates its core functions as Nuclear Deterrence Operations, Special Operations, Air Superiority, Global Integrated ISR, Space Superiority, Command and Control, Cyberspace Superiority, Personnel Recovery, Global Precision Attack, Building Partnerships, Rapid Global Mobility and Agile Combat Support. - Lieutenant General James Gordon Roudebush , USAF , ( born February 24 , 1948 ) was the 19th Surgeon General of the United States Air Force , Headquarters U.S. Air Force , Washington , D.C. General Roudebush served as functional manager of the U.S. Air Force Medical Service . In this capacity , he advised the Secretary of the Air Force and Air Force Chief of Staff , as well as the Assistant Secretary of Defense for Health Affairs on matters pertaining to the medical aspects of the air expeditionary force and the health of Air Force people . General Roudebush had authority to commit resources worldwide for the Air Force Medical Service , to make decisions affecting the delivery of medical services , and to develop plans , programs and procedures to support worldwide medical service missions . He exercised direction , guidance and technical management of more than 42,400 people assigned to 74 medical facilities worldwide . A native of Gering , Nebraska , Roudebush entered the Air Force in 1975 after receiving a Bachelor of Medicine degree from the University of Nebraska at Lincoln , and a Doctor of Medicine degree from the University of Nebraska College of Medicine . He completed residency training in family practice at the Wright - Patterson Air Force Medical Center , Ohio , in 1978 , and aerospace medicine at Brooks Air Force Base , Texas , in 1984 . He commanded a wing clinic and wing hospital before becoming Deputy Commander of the Air Force Materiel Command Human Systems Center . He has served as Command Surgeon for U.S. Central Command , Pacific Air Forces , U.S. Transportation Command and Headquarters Air Mobility Command . "
        "Prior to his selection as the 19th Surgeon General , he served as the Deputy Surgeon General of the U.S. Air Force . He retired from the U.S. Air Force on October 1 , 2009 . After reading the paragraphs above, choose the best answer for the entity that related to 'james g. roudebush' with the relationship of 'occupation'. Choices: - advisor - army - captain - general - lieutenant - military - officer - secretary - surgeon - united states of america\nA:"
    ),
    "If But slowly and doggedly he went on sawing to and fro., can we conclude that \"It was difficult to keep sawing.\"?",
    "You are given a list of queries separated by new line. Your job is to answer with the query that is the most well-formed or well-structured query in terms of grammar, punctuations, or spelling errors.\nQ: How do you set the alarm on the prospirit watch ?\nThe allies tried to regain access to the battle of Gallipoli ?\nWhat is scooter smith real phone number not a fake one ?\nLaw of Supply and Demand defined ?\nA:",
    "How does the sentence end? See options at the end\n\nThe woman tried to put the books on the couches but the \n\nAvailable options: - couches were too large. - books were too large.",
]
# Generate responses: each call prints the question and streams the reply.
for question in questions:
    generate_response(question)