# Fine-Tuning Llama Model on Arxiv Dataset

## 1. Setup

In [1]:
# !pip install -r requirements.txt

In [2]:
# Check that PyTorch can see a CUDA device before any model is loaded.
import torch
print(torch.cuda.is_available())

True


In [3]:
# import kagglehub

# # Download latest version
# path = kagglehub.dataset_download("Cornell-University/arxiv", path='Cornell-University/arxiv')

# print("Path to dataset files:", path)

## 2. Load necessary modules and configuration

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, BitsAndBytesConfig
from transformers import DataCollatorWithPadding
from peft import LoraConfig, get_peft_model
from datasets import load_dataset, Dataset
import bitsandbytes as bnb
import torch
import os

# Paths and run-wide settings
BASE_MODEL = "./models/Llama-3.2-1B"  # local directory of the base checkpoint
DATASET = "./data/Cornell-University/arxiv"  # handed to load_dataset() in the preprocessing cell
DATASET_MAP = "./data/Cornell-University/arxiv_tokenized_dataset"  # prefix of the on-disk tokenized cache
OUTPUT_DIR = "./fine_tuned_models/"  # parent of each run's TrainingArguments.output_dir
SUBSET_RATIO = 0.01  # use 1% of the data

# Load the tokenizer only; the models are loaded later, one per fine-tuning section.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

  from .autonotebook import tqdm as notebook_tqdm


## 3. Data preprocessing

In [5]:
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer
import os

# Set pad token to eos token.
# Padding with padding="max_length" needs a pad token; when the tokenizer has
# none, reuse its EOS token for that purpose.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # use the <eos> token as the pad token

# Load the raw dataset.
# NOTE(review): this runs on every execution, even when the tokenized cache
# already exists and `dataset` is therefore never used afterwards.
dataset = load_dataset(DATASET, split="train")



# Preprocess function
def preprocess_function(examples):
    """Tokenize arXiv records into model inputs plus loss labels.

    Args:
        examples: Either a batch (as passed by ``Dataset.map(..., batched=True)``,
            where ``examples["abstract"]`` and ``examples["title"]`` are lists of
            strings) or a single record whose two fields are plain strings.

    Returns:
        The tokenizer output for the abstract(s) with an extra ``"labels"``
        entry holding the title token ids, where every padding position has
        been replaced by -100.
    """
    inputs = tokenizer(
        examples["abstract"],
        truncation=True,
        padding="max_length",
        max_length=256 # Default: 512
    )
    outputs = tokenizer(
        examples["title"],
        truncation=True,
        padding="max_length",
        max_length=256 # Default: 512
    )

    # Ignore padding when the loss is computed. Labels are padded to the same
    # max_length as the inputs, so both have the same length.
    #
    # Why -100? In the Hugging Face Trainer and most loss functions, -100 is
    # the special value marking a position that must be ignored.
    pad_id = tokenizer.pad_token_id

    def _mask_padding(token_ids):
        """Replace pad ids with -100 in one sequence of token ids."""
        return [(token_id if token_id != pad_id else -100) for token_id in token_ids]

    title_ids = outputs["input_ids"]
    if title_ids and isinstance(title_ids[0], int):
        # Single (unbatched) example: one flat list of token ids.
        inputs["labels"] = _mask_padding(title_ids)
    else:
        # Batched call: one list of token ids per example. Comparing a whole
        # list against pad_id (as the previous version did) is always unequal,
        # so the mask has to be applied token by token.
        inputs["labels"] = [_mask_padding(ids) for ids in title_ids]

    # NOTE(review): input_ids hold abstract tokens while labels hold title
    # tokens at the same positions. For a causal LM, confirm this pairing is
    # intended; concatenating "abstract + title" and masking the abstract part
    # is the more usual setup.
    # NOTE(review): a tokenized dataset already saved to disk keeps its old,
    # unmasked labels until that cache directory is deleted.

    return inputs

# Tokenize a random subset once, then reuse the copy saved on disk on later runs.
tokenized_cache_dir = f"{DATASET_MAP}_{SUBSET_RATIO*100}_percent"

if os.path.exists(tokenized_cache_dir):
    tokenized_dataset = Dataset.load_from_disk(tokenized_cache_dir)
    print("Tokenized dataset loaded from disk.")
else:
    # Draw a random subset (fixed seed, so the same rows are picked each time).
    subset_size = int(len(dataset) * SUBSET_RATIO)
    subset_dataset = dataset.shuffle(seed=42).select(range(subset_size))

    tokenized_dataset = subset_dataset.map(preprocess_function, batched=True)
    tokenized_dataset.save_to_disk(tokenized_cache_dir)
    print("Tokenized dataset saved to disk.")

print(f"使用的資料集大小: {len(tokenized_dataset)} 個樣本")

Tokenized dataset loaded from disk.
使用的資料集大小: 26431 個樣本


## 4. FineTuning

### 4.1 SFT (Standard Fine-Tuning)

In [6]:
# Define the data collator shared by the Trainer instances in the fine-tuning cells.
# preprocess_function already pads every example to max_length=256, so all
# rows reach the collator with the same length.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

In [7]:
# # Load model for SFT
# sft_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, device_map="auto")
# sft_model.gradient_checkpointing_enable()


# # Set model config
# sft_training_args = TrainingArguments(
#     output_dir=os.path.join(OUTPUT_DIR, "sft"),
#     per_device_train_batch_size=1,
#     gradient_accumulation_steps=8,  # 累積梯度
#     num_train_epochs=3,
#     logging_dir="./logs",
#     save_strategy="epoch",
#     fp16=True,
#     push_to_hub=False
# )

# # Initialize Trainer
# sft_trainer = Trainer(
#     model=sft_model,
#     args=sft_training_args,
#     train_dataset=tokenized_dataset,
#     data_collator=data_collator  # 動態填充
# )

# # Train model
# sft_trainer.train()
# ## SFT keeps running out of memory (OOM) and cannot be trained on this machine, so go straight to LoRA


### 4.2 LoRA

In [36]:
# Reload the base model for LoRA and enable gradient checkpointing on it.
# This must happen before the model is wrapped by get_peft_model() below.
lora_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, device_map="auto")
lora_model.gradient_checkpointing_enable()

# Set LoRA config: rank-4 adapters on the attention query/key/value projections.
lora_config = LoraConfig(
    r=4, # default: 8
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

# Apply LoRA to the base model (rebinds lora_model to the PEFT-wrapped model)
lora_model = get_peft_model(lora_model, lora_config)

# Set LoRA training args.
# Effective batch size per device = 2 (batch) x 4 (accumulation) = 8 samples per optimizer step.
lora_training_args = TrainingArguments(
    output_dir=os.path.join(OUTPUT_DIR, "lora"),
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # accumulate gradients
    num_train_epochs=3,
    logging_dir="./logs",
    save_strategy="epoch",
    fp16=True,
    push_to_hub=False
)

# Initialize LoRA Trainer.
# The optimizers tuple is (optimizer, lr_scheduler); None leaves the scheduler to the Trainer.
lora_trainer = Trainer(
    model=lora_model,
    args=lora_training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,  # dynamic padding
    optimizers=(bnb.optim.Adam8bit(lora_model.parameters(), lr=1e-4), None)  # use the 8-bit Adam optimizer to reduce memory usage (the default is AdamW)
)

# Train LoRA model
lora_trainer.train()


Step,Training Loss
500,3.7702
1000,2.0996
1500,2.0723
2000,2.0585
2500,2.0413
3000,2.0445
3500,2.0636
4000,2.0415
4500,2.0222
5000,2.0178


TrainOutput(global_step=9912, training_loss=2.12113493272044, metrics={'train_runtime': 2835.5627, 'train_samples_per_second': 27.964, 'train_steps_per_second': 3.496, 'total_flos': 1.1859524132850893e+17, 'train_loss': 2.12113493272044, 'epoch': 3.0})

### 4.3 QLoRA

In [None]:
from peft import prepare_model_for_kbit_training

# Use one precision everywhere: bf16 when the GPU supports it, fp16 otherwise.
# The previous version computed in bfloat16 but trained with fp16=True.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
compute_dtype = torch.bfloat16 if use_bf16 else torch.float16

# Define quantization config for QLoRA (4-bit NF4 with double quantization)
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=compute_dtype
)

# Load 4-bit QLoRA model
qlora_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    torch_dtype=compute_dtype,
    quantization_config=quantization_config
)

# Let PEFT prepare the quantized model: it freezes the base weights and, with
# use_gradient_checkpointing=True, enables gradient checkpointing together
# with the input-gradient hook that checkpointing needs on a frozen model.
# No separate gradient_checkpointing_enable() call is required.
qlora_model = prepare_model_for_kbit_training(qlora_model, use_gradient_checkpointing=True)

# Set QLoRA config
qlora_config = LoraConfig(
    r=64,  # Typically higher for QLoRA
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

# Apply QLoRA to the base model. get_peft_model() leaves only the adapter
# weights trainable. Do NOT flip requires_grad on every floating-point
# parameter afterwards: that unfreezes the embeddings, norms and LM head and
# turns the run into (partial) full fine-tuning.
qlora_model = get_peft_model(qlora_model, qlora_config)

# Set QLoRA training args
qlora_training_args = TrainingArguments(
    output_dir=os.path.join(OUTPUT_DIR, "qlora"),
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # accumulate gradients
    num_train_epochs=3,
    logging_dir="./logs",
    save_strategy="epoch",
    bf16=use_bf16,
    fp16=not use_bf16,
    push_to_hub=False
)

# Only the adapter weights need optimizer state.
qlora_trainable_params = [param for param in qlora_model.parameters() if param.requires_grad]

# Initialize QLoRA Trainer
qlora_trainer = Trainer(
    model=qlora_model,
    args=qlora_training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,  # dynamic padding
    optimizers=(bnb.optim.Adam8bit(qlora_trainable_params, lr=1e-4), None)  # use the 8-bit Adam optimizer to reduce memory usage (the default is AdamW)
)

# Train QLoRA model
qlora_trainer.train()


## 5. Evaluation

In [None]:
from evaluate import load
metric = load("rouge")

# ROUGE scores text against text, so generate a title for each abstract and
# compare it with the reference title. The SFT cell is commented out, so
# sft_trainer does not exist; the LoRA model is evaluated instead.
EVAL_SAMPLES = 100      # small sample so generation finishes quickly
MAX_TITLE_TOKENS = 32   # upper bound on the length of a generated title

# tokenized_dataset still carries the raw "abstract" and "title" columns
# because map() was called without remove_columns.
# NOTE(review): these rows come from the training subset (the notebook has no
# held-out split), so the scores will be optimistic.
eval_rows = tokenized_dataset.select(range(min(EVAL_SAMPLES, len(tokenized_dataset))))

lora_model.eval()
predictions, references = [], []
for row in eval_rows:
    prompt_inputs = tokenizer(
        row["abstract"],
        return_tensors="pt",
        truncation=True,
        max_length=256,
    ).to(lora_model.device)
    with torch.no_grad():
        generated = lora_model.generate(
            **prompt_inputs,
            max_new_tokens=MAX_TITLE_TOKENS,
            do_sample=False,  # greedy decoding keeps the scores reproducible
            pad_token_id=tokenizer.pad_token_id,
        )
    # The returned sequence starts with the prompt; keep only the new tokens.
    new_tokens = generated[0][prompt_inputs["input_ids"].shape[1]:]
    predictions.append(tokenizer.decode(new_tokens, skip_special_tokens=True))
    references.append(row["title"])

results = metric.compute(predictions=predictions, references=references)
print("LoRA Model Performance:", results)


####
# from evaluate import load
# metric = load("rouge")

# # Evaluate Summarization
# predictions = ["Quantum mechanics explains nanoscopic phenomena."]
# references = ["Quantum mechanics describes physical phenomena on small scales."]
# results = metric.compute(predictions=predictions, references=references)
# print("ROUGE scores:", results)


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# 1. Load the model and tokenizer
def load_model_and_tokenizer(model_name):
    """Load a causal LM in float16 together with its tokenizer.

    Args:
        model_name: Identifier or path passed to both ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    model_options = {"device_map": "auto", "torch_dtype": torch.float16}
    loaded_model = AutoModelForCausalLM.from_pretrained(model_name, **model_options)
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_name)
    return loaded_model, loaded_tokenizer

# 2. Generate a response
def generate_response(model, tokenizer, prompt, max_length=100):
    """Sample one completion for ``prompt`` and return it as text.

    Args:
        model: Object exposing ``device`` and ``generate(**kwargs)``.
        tokenizer: Callable that encodes the prompt and offers ``decode``.
        prompt: Text to encode and feed to the model.
        max_length: Forwarded to ``generate`` as ``max_length``.

    Returns:
        The first generated sequence decoded with special tokens skipped.
    """
    encoded_prompt = tokenizer(prompt, return_tensors="pt").to(model.device)
    sampling_options = {
        "max_length": max_length,
        "temperature": 0.7,
        "top_p": 0.9,
        "do_sample": True,
    }
    generated_sequences = model.generate(**encoded_prompt, **sampling_options)
    return tokenizer.decode(generated_sequences[0], skip_special_tokens=True)

# 3. 测试函数
def test_model(model_name, prompt):
    model, tokenizer = load_model_and_tokenizer(model_name)
    print(f"Testing model: {model_name}")
    response = generate_response(model, tokenizer, prompt)
    print(f"Prompt: {prompt}")
    print(f"Response: {response}")


In [None]:
# Compare the base checkpoint with a fine-tuned one on the same prompt.
prompt = "Explain quantum mechanics in simple terms."
# Use the checkpoint this notebook actually fine-tunes (BASE_MODEL, a 1B model)
# rather than an unrelated 3B hub id.
test_model(BASE_MODEL, prompt)  # Base model
# NOTE(review): placeholder repo id. Replace it with the repository the
# fine-tuned model was pushed to, or with a local checkpoint directory under
# OUTPUT_DIR, before running this line.
test_model("your_hf_username/llama-3.2-3b-qlora", prompt)  # Fine-tuned model


In [None]:
# Push models to the Hugging Face Hub
# The Hub account comes from the environment so that no username is hardcoded
# and the cell fails with a clear message instead of a NameError.
HF_USERNAME = os.environ.get("HF_USERNAME")
if not HF_USERNAME:
    raise RuntimeError("Set the HF_USERNAME environment variable before pushing models to the Hub.")

# Push each variant that was actually trained in this session. The SFT cell is
# commented out (it ran out of memory), so sft_model normally does not exist.
# NOTE(review): the repository names say "3b" although BASE_MODEL is a 1B
# checkpoint; they are kept unchanged here so existing references still match.
for variant in ("sft", "lora", "qlora"):
    trained_model = globals().get(f"{variant}_model")
    if trained_model is None:
        print(f"Skipping {variant}: {variant}_model is not defined in this session.")
        continue
    trained_model.push_to_hub(f"{HF_USERNAME}/llama-3.2-3b-{variant}")
