# Importing Libraries

In [1]:
import os
from dotenv import load_dotenv
import random
import numpy as np
import matplotlib.pyplot as plt
from dataclasses import dataclass

# PyTorch
import torch

# Huggingface
import huggingface_hub
from transformers import TextStreamer
from datasets import load_dataset, load_from_disk
from trl import SFTTrainer, SFTConfig

# Weights & Biases
import wandb

# Unsloth
from unsloth import FastLanguageModel, FastVisionModel, is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


# Configuration

In [28]:
@dataclass
class CONFIG:
    """Notebook-wide settings, read and written as class attributes.

    The cells in this notebook never instantiate the class: they read
    ``CONFIG.<name>`` and assign to it directly (e.g. ``CONFIG.device = ...``).
    Every default expression below is evaluated once, while the class body
    executes, so derived values such as ``report_to`` are NOT recomputed if
    ``debug`` is reassigned afterwards.
    """
    debug: bool = True

    # Model
    model_id: str = "meta-llama/Llama-3.2-3B-Instruct"
    model_type: str = "language"  # vision | language

    # HuggingFace Hub
    username: str = "PathFinderKR"
    model_name: str = "KHU-Llama-3.2-3B-Instruct-SFT"

    # Data
    dataset_id: str = "mlabonne/FineTome-100k"  # "yahma/alpaca-cleaned"
    dataset_template: str = "chat"  # alpaca | chat

    # Training
    ## Paths
    output_dir: str = "./results"
    logging_dir: str = "./logs"
    save_strategy: str = "epoch"
    logging_strategy: str = "steps"
    logging_steps: int = 10
    save_total_limit: int = 1
    # "none" switches experiment tracking off for debug runs. None must not be
    # used for that purpose: transformers' TrainingArguments treats
    # report_to=None as "use the default", which reports to every installed
    # integration (wandb included).
    report_to: str = "none" if debug else "wandb"
    ## Hyperparameters
    num_train_epochs: int = 1
    per_device_train_batch_size: int = 2
    gradient_accumulation_steps: int = 4
    # Exactly one of fp16 / bf16 is enabled, depending on hardware support.
    fp16: bool = not is_bf16_supported()
    bf16: bool = is_bf16_supported()
    dtype: torch.dtype = torch.bfloat16 if is_bf16_supported() else torch.float16
    load_in_4bit: bool = True
    learning_rate: float = 2e-5
    lr_scheduler_type: str = "cosine"
    warmup_ratio: float = 0.1
    optim: str = "adamw_8bit"
    weight_decay: float = 0.01
    max_seq_length: int = 2048
    dataset_num_proc: int = 2
    packing: bool = True
    ### LoRA
    lora: bool = True
    # This `if` runs at class-definition time: with `lora = False` above, none
    # of the attributes in this branch exist on CONFIG.
    if lora:
        r: int = 16
        # Left unannotated, so this is a plain class attribute rather than a
        # dataclass field.
        target_modules = ["q_proj", "k_proj", "v_proj", "up_proj", "down_proj", "o_proj", "gate_proj"]
        lora_alpha: int = 32
        lora_dropout: float = 0
        bias: str = "none"
        use_gradient_checkpointing: str = "unsloth"
        use_rslora: bool = False
        loftq_config: str = None
        save_method: str = "merged_16bit"

    # Inference
    max_new_tokens: int = 2048
    do_sample: bool = True
    temperature: float = 0.7
    top_p: float = 0.9
    repetition_penalty: float = 1.1

    # Device (filled in later by the device-configuration cell)
    device: torch.device = None

    # Seed
    seed: int = 42

## Reproducibility

In [3]:
def set_seed(seed):
    """Seed every RNG the notebook uses and put cuDNN in deterministic mode.

    Args:
        seed: Integer seed applied to ``random``, NumPy and PyTorch (CPU and
            every CUDA device).

    Side effects:
        Sets ``PYTHONHASHSEED`` in ``os.environ``, changes the global cuDNN
        flags, and prints the seed.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all CUDA devices, not only the current one; ignored without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN auto-tune and pick different kernels from run
    # to run, which defeats reproducibility, so it must be off here.
    torch.backends.cudnn.benchmark = False
    print(f"Seed: {seed}")
    
# Apply the configured seed once, before any model or data work happens.
set_seed(CONFIG.seed)

Seed: 42


## Device

In [4]:
def configure_device():
    """Pick the compute device, preferring CUDA, then MPS, then CPU.

    Prints which backend was selected (plus the GPU count for CUDA).

    Returns:
        torch.device: ``cuda``, ``mps`` or ``cpu``.
    """
    # Guard clauses, ordered from most to least preferred backend.
    if torch.cuda.is_available():
        selected = torch.device("cuda")
        gpu_count = torch.cuda.device_count()
        print("> Running on GPU", end=' | ')
        print("Num of GPUs: ", gpu_count)
        return selected

    if torch.backends.mps.is_available():
        selected = torch.device("mps")
        print("> Running on MPS")
        return selected

    selected = torch.device("cpu")
    print("> Running on CPU")
    return selected

# Store the selected device on CONFIG; the generate_* helpers read CONFIG.device.
CONFIG.device = configure_device()

> Running on GPU | Num of GPUs:  1


## Debugging

In [5]:
# Debug runs are pinned to a single training epoch.
if CONFIG.debug:
    CONFIG.num_train_epochs = 1

## HuggingFace

In [6]:
# Load environment variables via python-dotenv, then authenticate with the
# Hugging Face Hub using the HUGGINGFACE_TOKEN variable.
load_dotenv()
huggingface_hub.login(
    token=os.getenv("HUGGINGFACE_TOKEN"),  # os.getenv returns None when the variable is unset
    add_to_git_credential=True
)

## Weights & Biases

In [7]:
# Experiment tracking is only started for non-debug runs; the W&B project is
# named after the model being trained.
if not CONFIG.debug:
    wandb.login(
        key=os.getenv("WANDB_API_KEY")  # os.getenv returns None when the variable is unset
    )
    wandb.init(
        project=CONFIG.model_name
    )

# Utility Functions

In [21]:
# Prompt templates.
# Both templates are filled positionally with str.format and have three slots.

# Alpaca layout: slots are (instruction, input, response).
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# Llama 3 instruct layout: slots are (system, user, assistant). The template
# contains no begin-of-text token and no terminator after the assistant slot;
# callers that need them add them around the formatted string.
llama_3_instruct_prompt = """<|start_header_id|>system<|end_header_id|>

{}<|eot_id|><|start_header_id|>user<|end_header_id|>

{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{}"""

# Dataset formatting functions (used with batched `Dataset.map`)
def apply_alpaca_template(examples):
    """Render a batch of Alpaca-style rows into training strings.

    Args:
        examples: Mapping with parallel ``"instruction"``, ``"input"`` and
            ``"output"`` sequences.

    Returns:
        dict: ``{"text": [...]}`` with one string per row, each wrapped in the
        tokenizer's BOS and EOS tokens.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    rendered = [
        tokenizer.bos_token + alpaca_prompt.format(task, context, answer) + tokenizer.eos_token
        for task, context, answer in rows
    ]
    return {"text": rendered}

def apply_llama_template(examples):
    """Render a batch of conversations into Llama 3 instruct training strings.

    Each conversation is a sequence of messages, every message being a dict
    with a ``'from'`` role (``'system'``, ``'human'`` or ``'gpt'``) and a
    ``'value'`` text. The last message seen for each role fills the matching
    template slot, so a multi-turn conversation is reduced to its final
    system / human / gpt messages.

    Args:
        examples: Either the batch mapping that ``Dataset.map(batched=True)``
            passes (conversations are read from its ``"conversations"``
            column) or a plain list/tuple of conversations.

    Returns:
        dict: ``{"text": [...]}`` with one rendered string per conversation.

    Raises:
        ValueError: If ``CONFIG.model_type`` is neither ``"language"`` nor
            ``"vision"``.
    """
    # Iterating a batch mapping yields its column names, not its rows, so the
    # conversations column has to be selected explicitly.
    if isinstance(examples, (list, tuple)):
        conversations = examples
    else:
        conversations = examples["conversations"]

    texts = []
    for conversation in conversations:
        slots = {"system": "", "human": "", "gpt": ""}
        for message in conversation:
            role = message['from']
            if role in slots:  # messages with any other role are ignored
                slots[role] = message['value']
        body = llama_3_instruct_prompt.format(slots["system"], slots["human"], slots["gpt"])
        if CONFIG.model_type == "language":
            text = tokenizer.bos_token + body + tokenizer.eos_token
        elif CONFIG.model_type == "vision":
            text = processor.bos_token + body + processor.eos_token
        else:
            raise ValueError("Invalid model type")
        texts.append(text)
    return {"text": texts}

In [9]:
# Raw-prompt generation (the prompt is passed to the model as-is)
def generate_text(prompt):
    """Generate a continuation for a raw prompt string.

    Puts the model in Unsloth inference mode, tokenizes the prompt as a
    one-element batch, moves it to ``CONFIG.device`` and generates with the
    sampling settings from ``CONFIG``, passing a ``TextStreamer`` as streamer.

    Args:
        prompt: Text handed to the tokenizer unchanged.

    Returns:
        The result of ``tokenizer.batch_decode`` on the generated ids, with
        special tokens kept.
    """
    FastLanguageModel.for_inference(model)

    encoded = tokenizer([prompt], return_tensors="pt")
    encoded = encoded.to(CONFIG.device)

    generation_kwargs = {
        "max_new_tokens": CONFIG.max_new_tokens,
        "do_sample": CONFIG.do_sample,
        "temperature": CONFIG.temperature,
        "top_p": CONFIG.top_p,
        "repetition_penalty": CONFIG.repetition_penalty,
        "use_cache": True,
        "streamer": TextStreamer(tokenizer),
    }
    generated = model.generate(**encoded, **generation_kwargs)
    return tokenizer.batch_decode(generated, skip_special_tokens=False)

# Chat-style generation (prompt built from the Llama 3 instruct template)
def generate_response(system, user):
    """Generate an assistant reply for one system/user message pair.

    The prompt is ``llama_3_instruct_prompt`` filled with the two messages and
    an empty assistant slot. The model is put in Unsloth inference mode and
    generation uses the sampling settings from ``CONFIG``, with a
    ``TextStreamer`` passed as streamer.

    Args:
        system: System message text.
        user: User message text.

    Returns:
        The result of ``tokenizer.batch_decode`` on the generated ids, with
        special tokens kept.
    """
    FastLanguageModel.for_inference(model)

    chat_prompt = llama_3_instruct_prompt.format(system, user, "")
    encoded = tokenizer([chat_prompt], return_tensors="pt").to(CONFIG.device)

    generation_kwargs = {
        "max_new_tokens": CONFIG.max_new_tokens,
        "do_sample": CONFIG.do_sample,
        "temperature": CONFIG.temperature,
        "top_p": CONFIG.top_p,
        "repetition_penalty": CONFIG.repetition_penalty,
        "use_cache": True,
        "streamer": TextStreamer(tokenizer),
    }
    generated = model.generate(**encoded, **generation_kwargs)
    return tokenizer.batch_decode(generated, skip_special_tokens=False)

# Generate vision model
def generate_vision(system, user):
    """Generate a text-only reply from the vision model for one system/user pair.

    The prompt is ``llama_3_instruct_prompt`` filled with the two messages and
    an empty assistant slot; no image is supplied. Generation uses the
    sampling settings from ``CONFIG``.

    Args:
        system: System message text.
        user: User message text.

    Returns:
        The result of ``processor.batch_decode`` on the generated ids, with
        special tokens kept.
    """
    FastVisionModel.for_inference(model)
    input_text = [
        llama_3_instruct_prompt.format(
            system,
            user,
            ""
        )
    ]
    # Hugging Face processors take the prompt through the `text` keyword.
    inputs = processor(
        images=None,
        text=input_text,
        return_tensors="pt"
    ).to(CONFIG.device)
    # The model-loading cell binds `tokenizer` only for language models; a
    # vision run has `processor` instead, so streaming and decoding go through
    # it rather than through an undefined `tokenizer`.
    outputs = model.generate(
        **inputs,
        max_new_tokens=CONFIG.max_new_tokens,
        do_sample=CONFIG.do_sample,
        temperature=CONFIG.temperature,
        top_p=CONFIG.top_p,
        repetition_penalty=CONFIG.repetition_penalty,
        use_cache=True,
        streamer=TextStreamer(processor)
    )
    return processor.batch_decode(outputs, skip_special_tokens=False)

In [10]:
def plot_token_length(fields):
    """Plot and summarise token-length distributions for dataset columns.

    For every column name in ``fields``, each non-empty value in the global
    ``dataset`` is encoded with the global ``tokenizer``; the token counts are
    shown as a histogram and their max, min, mean and standard deviation are
    printed.

    Args:
        fields: Iterable of column names to analyse.
    """
    for field in fields:
        # Empty strings are excluded from the statistics.
        token_lengths = []
        for example in dataset:
            if example[field] != "":
                token_lengths.append(len(tokenizer.encode(example[field])))

        label = field.capitalize()
        plt.figure(figsize=(10, 5))
        plt.hist(token_lengths, bins=50, color='skyblue', edgecolor='black')
        plt.xlabel(f'{label} Length')
        plt.ylabel('Frequency')
        plt.title(f'{label} Token Length Distribution')
        plt.show()

        longest = max(token_lengths)
        print(f"Max {field} token length: {longest}")
        shortest = min(token_lengths)
        print(f"Min {field} token length: {shortest}")
        average = np.mean(token_lengths)
        print(f"Mean {field} token length: {average:.2f}")
        spread = np.std(token_lengths)
        print(f"Standard deviation of {field} token length: {spread:.2f}")

# Model

In [11]:
# Load the base model through Unsloth. 4-bit loading is only requested when
# LoRA is enabled; otherwise load_in_4bit is forced to False.
if CONFIG.model_type == "language":
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=CONFIG.model_id,
        max_seq_length=CONFIG.max_seq_length,
        dtype=CONFIG.dtype,
        load_in_4bit=CONFIG.load_in_4bit if CONFIG.lora else False
    )
elif CONFIG.model_type == "vision":
    model, processor = FastVisionModel.from_pretrained(
        model_name=CONFIG.model_id,
        max_seq_length=CONFIG.max_seq_length,
        dtype=CONFIG.dtype,
        load_in_4bit=CONFIG.load_in_4bit if CONFIG.lora else False
    )
    # The trainer, save and push cells refer to `tokenizer` regardless of the
    # model type; alias it to the processor so a vision run on a fresh kernel
    # does not fail with a NameError.
    tokenizer = processor
else:
    raise ValueError("Invalid model type")

==((====))==  Unsloth 2024.11.8: Fast Llama patching. Transformers = 4.46.3.
   \\   /|    GPU: NVIDIA GeForce RTX 4080 SUPER. Max memory: 15.992 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1. CUDA = 8.9. CUDA Toolkit = 12.4.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!




In [12]:
#print(f"Vocabulary size: {tokenizer.vocab_size}")
#print(f"Special tokens: {tokenizer.all_special_tokens}")

In [13]:
# Show the module tree and the parameter count (reported in billions).
print(model)
print(f"Number of parameters: {model.num_parameters() / 1e9:.2f}B")

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072, padding_idx=128004)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (rotary_emb): LlamaExtendedRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): Llam

In [14]:
# Debug-only check: generate one reply with the model as loaded, before any
# fine-tuning cell has run, using the helper that matches the model type.
if CONFIG.debug:
    sample_system = "You are a helpful assistant."
    sample_user = "What is the capital of France?"
    if CONFIG.model_type == "language":
        sample_response = generate_response(sample_system, sample_user)
        print(sample_response)
        #print(tokenizer.tokenize(sample_response[0]))
    elif CONFIG.model_type == "vision":
        sample_response = generate_vision(sample_system, sample_user)
        print(sample_response)
    else:
        raise ValueError("Invalid model type")

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>

What is the capital of France?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

The capital of France is Paris.<|eot_id|>
['<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat is the capital of France?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nThe capital of France is Paris.<|eot_id|>']


# Dataset

In [29]:
# Load the train split of the dataset named in CONFIG.dataset_id.
dataset = load_dataset(CONFIG.dataset_id, split="train")

In [16]:
#dataset = load_dataset('json', data_files='KUHrious_SFT_Dataset_transformed.jsonl', split='train')

In [30]:
dataset

Dataset({
    features: ['conversations', 'source', 'score'],
    num_rows: 100000
})

In [27]:
# Debug-only peek at the raw data: the three Alpaca columns of the first row,
# or the first two rows as-is for chat-style datasets.
if CONFIG.debug:
    if CONFIG.dataset_template == "alpaca":
        print(f"instruction: {dataset[0]['instruction']}")
        print(f"input: {dataset[0]['input']}")
        print(f"output: {dataset[0]['output']}")
    elif CONFIG.dataset_template == "chat":
        #print(f"conversations: {dataset[0]['features']}")
        #print(f"source: {dataset[0]['source']}")
        #print(f"score: {dataset[0]['score']}")
        print(dataset[0])
        print(dataset[1])

{'from': 'human', 'value': 'What are the key dates for course registration in the first semester of 2024?', 'text': '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat are the key dates for course registration in the first semester of 2024?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n<|eot_id|>'}
{'from': 'gpt', 'value': 'The key dates for course registration in the first semester of 2024 are as follows:\\n\\n1. **Course Registration Confirmation and Changes:** March 4 to March 8, 2024. During this period, students can confirm their course selections and make any necessary changes.\\n2. **Credit Withdrawal Applications:** This is also available from March 4 to March 8, allowing students to withdraw from courses without penalty.\\n3. **Double Majors and Minors Application Period:** From March 11 to March 13, 2024, students can apply for double majors and minors.\\n4. **Course Withdrawal Applications:** 

## Preprocessing

In [24]:
def apply_llama_template(examples):
    """Render a batch of chat data into Llama 3 instruct training strings.

    Two batch layouts are accepted:

    * a ``"conversations"`` column, where every entry is a list of messages
      (dicts with ``'from'`` and ``'value'``); the last message seen for each
      of the roles ``'system'``, ``'human'`` and ``'gpt'`` fills the matching
      template slot;
    * flat ``'from'`` / ``'value'`` columns, where every row is one message;
      each row is rendered on its own, so only the slot for that row's role
      is filled and the other two stay empty.

    Args:
        examples: Batch mapping as passed by ``Dataset.map(batched=True)``.

    Returns:
        dict: ``{"text": [...]}`` with one rendered string per input row.

    Raises:
        ValueError: If ``CONFIG.model_type`` is neither ``"language"`` nor
            ``"vision"``.
    """
    def render(slots):
        # Wrap the filled template in the BOS/EOS tokens of the active model type.
        body = llama_3_instruct_prompt.format(slots["system"], slots["human"], slots["gpt"])
        if CONFIG.model_type == "language":
            return tokenizer.bos_token + body + tokenizer.eos_token
        elif CONFIG.model_type == "vision":
            return processor.bos_token + body + processor.eos_token
        raise ValueError("Invalid model type")

    texts = []
    if "conversations" in examples:
        # Nested layout: without this branch the function fails with a
        # KeyError on 'from' for datasets that only have this column.
        for conversation in examples["conversations"]:
            slots = {"system": "", "human": "", "gpt": ""}
            for message in conversation:
                if message['from'] in slots:
                    slots[message['from']] = message['value']
            texts.append(render(slots))
    else:
        # Flat layout: one message per row.
        # NOTE(review): each rendered text therefore carries a single role's
        # content; confirm this is intended for training data.
        for from_value, value in zip(examples['from'], examples['value']):
            slots = {"system": "", "human": "", "gpt": ""}
            if from_value in slots:
                slots[from_value] = value
            texts.append(render(slots))
    return {"text": texts}

In [25]:
# Pick the formatter that matches the dataset layout, then add a "text" column.
if CONFIG.dataset_template == "alpaca":
    formatting_func = apply_alpaca_template
elif CONFIG.dataset_template == "chat":
    formatting_func = apply_llama_template
else:
    raise ValueError("Invalid dataset template")

# Rebinds `dataset`: re-running this cell maps the already-mapped dataset again.
dataset = dataset.map(formatting_func, batched=True)

Map:   0%|          | 0/13986 [00:00<?, ? examples/s]

In [26]:
# Debug-only: show the rendered training string of the first row.
if CONFIG.debug:
    print(dataset[0]["text"])
    #print(tokenizer.tokenize(dataset[0]["text"]))

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

<|eot_id|><|start_header_id|>user<|end_header_id|>

What are the key dates for course registration in the first semester of 2024?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

<|eot_id|>


In [None]:
# Debug-only: token-length histogram and statistics for the rendered "text"
# column (plot_token_length encodes every non-empty row of the dataset).
if CONFIG.debug:
    plot_token_length(["text"])

# Supervised Fine-Tuning (LoRA)

In [None]:
# Wrap the loaded model with LoRA adapters using the LoRA settings from CONFIG.
# Rebinds `model`, so this cell is not idempotent: re-running it passes the
# already-wrapped model to get_peft_model again.
if CONFIG.lora:
    if CONFIG.model_type == "language":
        model = FastLanguageModel.get_peft_model(
            model,
            r=CONFIG.r,
            target_modules=CONFIG.target_modules,
            lora_alpha=CONFIG.lora_alpha,
            lora_dropout=CONFIG.lora_dropout,
            bias=CONFIG.bias,
            use_gradient_checkpointing=CONFIG.use_gradient_checkpointing,
            use_rslora=CONFIG.use_rslora,
            loftq_config=CONFIG.loftq_config,
            random_state=CONFIG.seed
        )
    elif CONFIG.model_type == "vision":
        # Same LoRA settings as the language branch, plus flags selecting
        # which parts of the vision model are fine-tuned.
        model = FastVisionModel.get_peft_model(
            model,
            finetune_vision_layers     = False, # False if not finetuning vision layers
            finetune_language_layers   = True, # False if not finetuning language layers
            finetune_attention_modules = True, # False if not finetuning attention layers
            finetune_mlp_modules       = True, # False if not finetuning MLP layers
            
            r=CONFIG.r,
            target_modules=CONFIG.target_modules,
            lora_alpha=CONFIG.lora_alpha,
            lora_dropout=CONFIG.lora_dropout,
            bias=CONFIG.bias,
            use_gradient_checkpointing=CONFIG.use_gradient_checkpointing,
            use_rslora=CONFIG.use_rslora,
            loftq_config=CONFIG.loftq_config,
            random_state=CONFIG.seed
        )

In [None]:
# Report how many parameters are trainable after the LoRA wrapping.
if CONFIG.lora:
    model.print_trainable_parameters()

In [None]:
# Build the supervised fine-tuning trainer over the rendered "text" column.
# All training arguments are taken from CONFIG.
# NOTE(review): this cell reads `tokenizer`; confirm that name is bound when
# CONFIG.model_type == "vision".
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=CONFIG.max_seq_length,
    dataset_num_proc=CONFIG.dataset_num_proc,
    packing=CONFIG.packing,
    # A custom collator is only supplied for vision models; None otherwise.
    data_collator=UnslothVisionDataCollator(model, tokenizer) if CONFIG.model_type == "vision" else None,
    args=SFTConfig(
        output_dir=CONFIG.output_dir,
        logging_dir=CONFIG.logging_dir,
        save_strategy=CONFIG.save_strategy,
        logging_strategy=CONFIG.logging_strategy,
        logging_steps=CONFIG.logging_steps,
        save_total_limit=CONFIG.save_total_limit,
        report_to=CONFIG.report_to,
        num_train_epochs=CONFIG.num_train_epochs,
        per_device_train_batch_size=CONFIG.per_device_train_batch_size,
        gradient_accumulation_steps=CONFIG.gradient_accumulation_steps,
        fp16=CONFIG.fp16,
        bf16=CONFIG.bf16,
        learning_rate=CONFIG.learning_rate,
        lr_scheduler_type=CONFIG.lr_scheduler_type,
        warmup_ratio=CONFIG.warmup_ratio,
        optim=CONFIG.optim,
        weight_decay=CONFIG.weight_decay
    )
)

In [None]:
# Run the fine-tuning loop with the trainer configured above.
trainer.train()

In [None]:
# Non-debug runs only: close the W&B run and save model + tokenizer locally.
# LoRA runs are saved under "<model_name>-LoRA", other runs under "<model_name>".
if not CONFIG.debug:
    wandb.finish()
    if CONFIG.lora:
        model.save_pretrained(CONFIG.model_name + "-LoRA")
        tokenizer.save_pretrained(CONFIG.model_name + "-LoRA")
    else:
        model.save_pretrained(CONFIG.model_name)
        tokenizer.save_pretrained(CONFIG.model_name)

# Inference

In [None]:
# Sample one reply from the fine-tuned model (LoRA runs only) and print both
# the decoded text and its tokenization.
if CONFIG.lora:
    sample_system = "You are a helpful assistant."
    sample_user = "What is the capital of France?"
    if CONFIG.model_type == "language":
        sample_response = generate_response(sample_system, sample_user)
    elif CONFIG.model_type == "vision":
        sample_response = generate_vision(sample_system, sample_user)
    else:
        raise ValueError("Invalid model type")
    print(sample_response)
    # NOTE(review): relies on `tokenizer` exposing .tokenize — confirm for vision runs.
    print(tokenizer.tokenize(sample_response[0]))

# Save

In [None]:
# Non-debug runs only: publish the result.
# LoRA runs save and push a merged model using CONFIG.save_method; other runs
# push model and tokenizer to "<username>/<model_name>".
if not CONFIG.debug:
    if CONFIG.lora:
        model.save_pretrained_merged(
            CONFIG.model_name,
            tokenizer,
            save_method=CONFIG.save_method
        )
        # NOTE(review): the merged push is given CONFIG.model_name without the
        # username prefix used in the non-LoRA branch — confirm the target repo.
        model.push_to_hub_merged(
            CONFIG.model_name,
            tokenizer,
            save_method=CONFIG.save_method
        )
    else:
        model.push_to_hub(
            repo_id=CONFIG.username + "/" + CONFIG.model_name,
            use_temp_dir=False
        )
        tokenizer.push_to_hub(
            repo_id=CONFIG.username + "/" + CONFIG.model_name,
            use_temp_dir=False
        )