# Importing Libraries

In [1]:
import os
from dotenv import load_dotenv
import random
import numpy as np
import matplotlib.pyplot as plt
from dataclasses import dataclass

# PyTorch
import torch

# Huggingface
import huggingface_hub
from transformers import TextStreamer
from datasets import load_dataset
from trl import SFTTrainer, SFTConfig

# Weights & Biases
import wandb

# Unsloth
from unsloth import FastLanguageModel, is_bf16_supported

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


# Configuration

In [2]:
@dataclass
class CONFIG:
    debug: bool = False
    
    # Model
    model_id: str = "PathFinderKR/KHU-Llama-3.2-11B"
    
    # HuggingFace Hub
    username: str = "PathFinderKR"
    model_name: str = f"KHU-Llama-3.2-11B-Instruct"
    
    # Data
    dataset_id: str = "mlabonne/FineTome-100k"  # "yahma/alpaca-cleaned"
    dataset_template: str = "chat"  # alpaca | chat
    
    # Training
    ## Paths
    output_dir: str = "./results"
    logging_dir: str = "./logs"
    save_strategy: str = "epoch"
    logging_strategy: str = "steps"
    logging_steps: int = 10
    save_total_limit: int = 1
    report_to: str = "wandb" if not debug else None
    ## Hyperparameters
    num_train_epochs: int = 1
    per_device_train_batch_size: int = 2
    gradient_accumulation_steps: int = 4
    fp16: bool = not is_bf16_supported()
    bf16: bool = is_bf16_supported()
    dtype: torch.dtype = torch.bfloat16 if is_bf16_supported() else torch.float16
    load_in_4bit: bool = True
    learning_rate: float = 2e-5
    lr_scheduler_type: str = "cosine"
    warmup_ratio: float = 0.1
    optim: str = "adamw_8bit"
    weight_decay: float = 0.01
    max_seq_length: int = 2048
    dataset_num_proc: int = 2
    packing: bool = True
    ### LoRA
    lora: bool = False
    if lora:
        r: int = 16
        target_modules = ["q_proj", "k_proj", "v_proj", "up_proj", "down_proj", "o_proj", "gate_proj"]
        lora_alpha: int = 32
        lora_dropout: float = 0
        bias: str = "none"
        use_gradient_checkpointing: str = "unsloth"
        use_rslora: bool = False
        loftq_config: str = None
        save_method: str = "merged_16bit"
    
    # Inference
    max_new_tokens: int = 2048
    do_sample: bool = True
    temperature: float = 0.7
    top_p: float = 0.9
    repetition_penalty: float = 1.1
    
    # Device
    device: torch.device = None
    
    # Seed
    seed: int = 42

## Reproducibility

In [3]:
def set_seed(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
    print(f"Seed: {seed}")
    
set_seed(CONFIG.seed)

Seed: 42


## Device

In [4]:
def configure_device():
    if torch.cuda.is_available():
        device = torch.device("cuda")
        num_gpu = torch.cuda.device_count()
        print("> Running on GPU", end=' | ')
        print("Num of GPUs: ", num_gpu)
    elif torch.backends.mps.is_available():
        device = torch.device("mps")
        print("> Running on MPS")
    else:
        device = torch.device("cpu")
        print("> Running on CPU")
    return device

CONFIG.device = configure_device()

> Running on GPU | Num of GPUs:  1


## Debugging

In [5]:
if CONFIG.debug:
    CONFIG.num_train_epochs = 1

## HuggingFace

In [6]:
load_dotenv()
huggingface_hub.login(
    token=os.getenv("HUGGINGFACE_TOKEN"),
    add_to_git_credential=True
)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /home/pathfinder/.cache/huggingface/token
Login successful


## Weights & Biases

In [7]:
if not CONFIG.debug:
    wandb.login(
        key=os.getenv("WANDB_API_KEY")
    )
    wandb.init(
        project=CONFIG.model_name
    )

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mpathfinderkr[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/pathfinder/.netrc


# Utility Functions

In [8]:
# Template
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

llama_3_instruct_prompt = """<|start_header_id|>system<|end_header_id|>

{}<|eot_id|><|start_header_id|>user<|end_header_id|>

{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{}"""

# Formatting functions
def apply_alpaca_template(examples):
    texts = []
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]
    for instruction, input, output in zip(instructions, inputs, outputs):
        text = tokenizer.bos_token + alpaca_prompt.format(instruction, input, output) + tokenizer.eos_token
        texts.append(text)
    return {"text": texts}

def apply_llama_template(examples):
    texts = []
    for conversation in examples['conversations']:
        system = ""
        user = ""
        assistant = ""
        for message in conversation:
            if message['from'] == 'system':
                system = message['value']
            elif message['from'] == 'human':
                user = message['value']
            elif message['from'] == 'gpt':
                assistant = message['value']
        text = tokenizer.bos_token + llama_3_instruct_prompt.format(system, user, assistant) + tokenizer.eos_token
        texts.append(text)
    return {"text": texts}

In [10]:
# Generate base model
def generate_text(prompt):
    FastLanguageModel.for_inference(model)
    inputs = tokenizer(
    [
        prompt
    ], return_tensors = "pt").to(CONFIG.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=CONFIG.max_new_tokens,
        do_sample=CONFIG.do_sample,
        temperature=CONFIG.temperature,
        top_p=CONFIG.top_p,
        repetition_penalty=CONFIG.repetition_penalty,
        use_cache=True,
        streamer=TextStreamer(tokenizer)
    )
    return tokenizer.batch_decode(outputs, skip_special_tokens=False)

# Generate instruction model
def generate_response(system, user):
    FastLanguageModel.for_inference(model)
    inputs = tokenizer(
    [
        llama_3_instruct_prompt.format(
            system,
            user,
            ""
        )
    ], return_tensors = "pt").to(CONFIG.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=CONFIG.max_new_tokens,
        do_sample=CONFIG.do_sample,
        temperature=CONFIG.temperature,
        top_p=CONFIG.top_p,
        repetition_penalty=CONFIG.repetition_penalty,
        use_cache=True,
        streamer=TextStreamer(tokenizer)
    )
    return tokenizer.batch_decode(outputs, skip_special_tokens=False)

In [11]:
def plot_token_length(fields):
    for field in fields:
        token_lengths = [len(tokenizer.encode(example[field])) for example in dataset if example[field] != ""]
        
        plt.figure(figsize=(10, 5))
        plt.hist(token_lengths, bins=50, color='skyblue', edgecolor='black')
        plt.xlabel(f'{field.capitalize()} Length')
        plt.ylabel('Frequency')
        plt.title(f'{field.capitalize()} Token Length Distribution')
        plt.show()
        
        print(f"Max {field} token length: {max(token_lengths)}")
        print(f"Min {field} token length: {min(token_lengths)}")
        print(f"Mean {field} token length: {np.mean(token_lengths):.2f}")
        print(f"Standard deviation of {field} token length: {np.std(token_lengths):.2f}")

# Model

In [12]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=CONFIG.model_id,
    max_seq_length=CONFIG.max_seq_length,
    dtype=CONFIG.dtype,
    load_in_4bit=CONFIG.load_in_4bit if CONFIG.lora else False
)

==((====))==  Unsloth 2024.11.6: Fast Llama patching. Transformers = 4.46.1.
   \\   /|    GPU: NVIDIA GeForce RTX 4080 SUPER. Max memory: 15.992 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1. CUDA = 8.9. CUDA Toolkit = 12.4.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!




In [15]:
print(f"Vocabulary size: {tokenizer.vocab_size}")
print(f"Special tokens: {tokenizer.all_special_tokens}")

Vocabulary size: 128000
Special tokens: ['<|begin_of_text|>', '<|end_of_text|>', '<|finetune_right_pad_id|>']


In [16]:
print(model)
print(f"Number of parameters: {model.num_parameters() / 1e9:.2f}B")

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128257, 2048, padding_idx=128004)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaExtendedRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
 

In [17]:
if CONFIG.debug:
    sample_system = "You are a helpful assistant."
    sample_user = "What is the capital of France?"
    sample_response = generate_response(sample_system, sample_user)
    print(sample_response)
    print(tokenizer.tokenize(sample_response[0]))

# Dataset

In [18]:
dataset = load_dataset(CONFIG.dataset_id, split="train")

In [19]:
dataset

Dataset({
    features: ['conversations', 'source', 'score'],
    num_rows: 100000
})

In [20]:
if CONFIG.debug:
    if CONFIG.dataset_template == "alpaca":
        print(f"instruction: {dataset[0]['instruction']}")
        print(f"input: {dataset[0]['input']}")
        print(f"output: {dataset[0]['output']}")
    elif CONFIG.dataset_template == "chat":
        print(f"conversations: {dataset[0]['conversations']}")
        print(f"source: {dataset[0]['source']}")
        print(f"score: {dataset[0]['score']}")

## Preprocessing

In [21]:
if CONFIG.dataset_template == "alpaca":
    formatting_func = apply_alpaca_template
elif CONFIG.dataset_template == "chat":
    formatting_func = apply_llama_template
else:
    raise ValueError("Invalid dataset template")

dataset = dataset.map(formatting_func, batched=True)

In [23]:
if CONFIG.debug:
    print(dataset[0]["text"])
    print(tokenizer.tokenize(dataset[0]["text"]))

In [24]:
if CONFIG.debug:
    plot_token_length(["text"])

# Supervised Fine-Tuning

In [25]:
if CONFIG.lora:
    model = FastLanguageModel.get_peft_model(
        model,
        r=CONFIG.r,
        target_modules=CONFIG.target_modules,
        lora_alpha=CONFIG.lora_alpha,
        lora_dropout=CONFIG.lora_dropout,
        bias=CONFIG.bias,
        use_gradient_checkpointing=CONFIG.use_gradient_checkpointing,
        use_rslora=CONFIG.use_rslora,
        loftq_config=CONFIG.loftq_config,
        random_state=CONFIG.seed
    )

In [26]:
if CONFIG.lora:
    model.print_trainable_parameters()

In [27]:
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=CONFIG.max_seq_length,
    dataset_num_proc=CONFIG.dataset_num_proc,
    packing=CONFIG.packing,
    args=SFTConfig(
        output_dir=CONFIG.output_dir,
        logging_dir=CONFIG.logging_dir,
        save_strategy=CONFIG.save_strategy,
        logging_strategy=CONFIG.logging_strategy,
        logging_steps=CONFIG.logging_steps,
        save_total_limit=CONFIG.save_total_limit,
        report_to=CONFIG.report_to,
        num_train_epochs=CONFIG.num_train_epochs,
        per_device_train_batch_size=CONFIG.per_device_train_batch_size,
        gradient_accumulation_steps=CONFIG.gradient_accumulation_steps,
        fp16=CONFIG.fp16,
        bf16=CONFIG.bf16,
        learning_rate=CONFIG.learning_rate,
        lr_scheduler_type=CONFIG.lr_scheduler_type,
        warmup_ratio=CONFIG.warmup_ratio,
        optim=CONFIG.optim,
        weight_decay=CONFIG.weight_decay
    )
)

Generating train split: 0 examples [00:00, ? examples/s]

In [28]:
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 32,456 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 16,228
 "-____-"     Number of trainable parameters = 973,146,112


Step,Training Loss
10,6.7967
20,6.9761
30,6.8067
40,6.3749
50,6.1583
60,5.6492
70,5.1749
80,5.0518
90,5.2756
100,5.3665


TrainOutput(global_step=16228, training_loss=4.647371123334825, metrics={'train_runtime': 8085.5188, 'train_samples_per_second': 4.014, 'train_steps_per_second': 2.007, 'total_flos': 3.8810947843365274e+17, 'train_loss': 4.647371123334825, 'epoch': 1.0})

In [29]:
if not CONFIG.debug:
    wandb.finish()
    if CONFIG.lora:
        model.save_pretrained(CONFIG.model_name + "-LoRA")
        tokenizer.save_pretrained(CONFIG.model_name + "-LoRA")
    else:
        model.save_pretrained(CONFIG.model_name)
        tokenizer.save_pretrained(CONFIG.model_name)

VBox(children=(Label(value='0.016 MB of 0.016 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇████
train/global_step,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇█████
train/grad_norm,█▆▄▃▅▃▃▂▃▂▃▂▂▂▂▂▂▂▂▂▂▂▁▂▁▁▁▁▂▁▂▁▂▁▁▂▂▂▂▁
train/learning_rate,▂▃▅▆██████████▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▃▃▃▂▁▁▁▁▁▁
train/loss,▂▆▆▆██▇▂▇▃▆▅▃▄▇▂▇▇▆▇▆▄▃▇▆▄▆▅▄▁▆▂▄▄▆▃▃▄▂▄

0,1
total_flos,3.881094784336528e+17
train/epoch,1.0
train/global_step,16228.0
train/grad_norm,1.5625
train/learning_rate,0.0
train/loss,4.7473
train_loss,4.64737
train_runtime,8085.5188
train_samples_per_second,4.014
train_steps_per_second,2.007


# Save

In [30]:
if not CONFIG.debug:
    if CONFIG.lora:
        model.save_pretrained_merged(
            CONFIG.model_name,
            tokenizer,
            save_method=CONFIG.save_method
        )
        model.push_to_hub_merged(
            CONFIG.model_name,
            tokenizer,
            save_method=CONFIG.save_method
        )
    else:
        model.push_to_hub(
            repo_id=CONFIG.username + "/" + CONFIG.model_name,
            use_temp_dir=False
        )
        tokenizer.push_to_hub(
            repo_id=CONFIG.username + "/" + CONFIG.model_name,
            use_temp_dir=False
        )

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

Saved model to https://huggingface.co/PathFinderKR/Llama-3.2-1B-Instruct-Pause_Token


No files have been modified since last commit. Skipping to prevent empty commit.
