In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# pip installs
# Installs the third-party packages used by the cells below (-q keeps pip output quiet).
# NOTE(review): versions are unpinned, so a fresh runtime may resolve different
# transformers/trl/peft releases than the ones this notebook was written against.
# NOTE(review): wandb is imported below but is not in this list — presumably it is
# already present in the Colab image; confirm.

!pip install -q datasets peft requests torch bitsandbytes transformers trl accelerate sentencepiece matplotlib

In [None]:
# imports

import os
import re
import math
from tqdm import tqdm
from google.colab import userdata
from huggingface_hub import login
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, set_seed, BitsAndBytesConfig
from datasets import load_dataset, Dataset, DatasetDict
import wandb
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig
from datetime import datetime
import matplotlib.pyplot as plt

In [None]:
# set check point tracker
# Path to store the checkpoint tracking information
def create_checkpoint_tracker():
    """Write the helper module ``checkpoint_tracker.py`` into the working directory.

    The generated module exposes two functions that persist training progress in
    the text file ``latest_step.txt`` (also in the working directory):

    * ``get_latest_step()``: returns the stored step count as an int, or 0 when
      the file is missing, unreadable, or does not contain an integer.
    * ``save_latest_step(step)``: overwrites the file with ``str(step)``.

    NOTE(review): both files are written relative to the current working
    directory, not under the mounted Drive path — presumably they do not survive
    a recycled Colab runtime; confirm whether that is intended.
    """
    checkpoint_file = "checkpoint_tracker.py"

    # Only I/O and parse failures mean "no progress recorded yet". A bare
    # ``except:`` in the generated reader would also swallow KeyboardInterrupt
    # and SystemExit, so the handler is limited to OSError and ValueError.
    tracker_source = """
def get_latest_step():
    try:
        with open("latest_step.txt", "r", encoding="utf-8") as f:
            return int(f.read().strip())
    except (OSError, ValueError):
        return 0

def save_latest_step(step):
    with open("latest_step.txt", "w", encoding="utf-8") as f:
        f.write(str(step))
"""

    with open(checkpoint_file, "w", encoding="utf-8") as f:
        f.write(tracker_source)

# Generate checkpoint_tracker.py on disk, then import the two helpers from it.
create_checkpoint_tracker()
from checkpoint_tracker import get_latest_step, save_latest_step

#Setup SFT-Config

In [None]:
# resume training function
def _load_training_model(latest_step, base_model_name, hf_model_name, quant_config):
    """Load the quantized model for this session: the Hub repo when resuming, else the base model."""
    if latest_step > 0:
        print(f"Resuming from checkpoint at step {latest_step}")
        try:
            # NOTE(review): hf_model_name is the repo this function pushes the trained
            # model to. Loading it through AutoModelForCausalLM and then handing
            # peft_config to SFTTrainer again may stack or re-initialise adapters —
            # verify, and consider PeftModel.from_pretrained(..., is_trainable=True).
            return AutoModelForCausalLM.from_pretrained(
                hf_model_name,
                quantization_config=quant_config,
                device_map="auto",
            )
        except Exception as e:
            # Deliberate best-effort: fall back to the base model. The recorded step
            # count is NOT reset, so progress keeps counting from latest_step.
            print(f"Error loading model, starting fresh: {e}")
    else:
        print("Starting training from base model")

    return AutoModelForCausalLM.from_pretrained(
        base_model_name,
        quantization_config=quant_config,
        device_map="auto",
    )


def train_or_resume(
    base_model_name,  # Base model to fine-tune (e.g. "meta-llama/Llama-3-8B")
    hf_model_name,  # Hugging Face repo name the results are pushed to
    train_dataset,
    lora_config,  # LoRA (Low-Rank Adaptation) configuration
    steps_per_session=500,  # Training steps per session
    max_total_steps=1000,  # Maximum total number of training steps
    batch_size=1,
    grad_accum_steps=16,  # Gradient accumulation steps (accumulate gradients to save VRAM)
    save_steps=100  # Save a checkpoint every this many steps
):
    """
    Train a model for one session, or resume from the model stored on Hugging Face.

    Progress is tracked through get_latest_step()/save_latest_step(). When the
    recorded step count is greater than zero the model is loaded from
    ``hf_model_name``; otherwise (or if that load fails) from ``base_model_name``.

    Args:
        base_model_name: Original model to fine-tune; also used for the tokenizer
        hf_model_name: HF repo name to save model to (username/model-name)
        train_dataset: Dataset to train on
        lora_config: LoRA configuration
        steps_per_session: How many steps to train in this session
        max_total_steps: Maximum number of steps to train overall
        batch_size: Batch size for training
        grad_accum_steps: Gradient accumulation steps
        save_steps: How often to save checkpoints

    Returns:
        The cumulative step count recorded after this call. If training had
        already reached ``max_total_steps``, the recorded count is returned
        unchanged and no training is run.
    """
    # Get the latest step we've trained to
    latest_step = get_latest_step()

    # Check if we've already reached the max steps
    if latest_step >= max_total_steps:
        print(f"Training already completed! Reached {latest_step}/{max_total_steps} steps")
        # Return the count (not None) so callers can always report progress.
        return latest_step

    # Calculate how many steps to train in this session
    steps_this_session = min(steps_per_session, max_total_steps - latest_step)
    print(f"Training for {steps_this_session} steps (total progress: {latest_step}/{max_total_steps})")

    # Set up tokenizer
    # NOTE(review): this tokenizer is configured here but is not passed to the
    # SFTTrainer below; confirm whether it should be.
    tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    # Configure quantization -> QLoRA 4-bit
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4"
    )

    # Resume from the Hub repo when progress exists, otherwise start from the base model
    base_model = _load_training_model(latest_step, base_model_name, hf_model_name, quant_config)

    # Configure training parameters (SFTConfig = Supervised Fine-Tuning config).
    # A fresh config/trainer is built on every call with max_steps set to this
    # session's step budget.
    train_params = SFTConfig(
        output_dir="./checkpoints",  # Directory where the model and checkpoints are stored temporarily during training.
        num_train_epochs=1,  # Could be raised (e.g. to 4).
        max_steps=steps_this_session,  # Caps the number of training steps in the current session.
        per_device_train_batch_size=batch_size,  # Actual batch size (samples fed to the GPU per step).
        gradient_accumulation_steps=grad_accum_steps,  # Steps of gradient accumulation before each weight update.
        optim="paged_adamw_32bit",  # Paged (memory-optimised) AdamW, suited to training large models with LoRA and 4-bit quantization.
        save_steps=save_steps,  # Checkpoint interval, from the argument above.
        logging_steps=20,  # Log (loss, learning rate, ...) every 20 steps.
        learning_rate=1e-4,  # Learning rate = 0.0001: higher risks unstable learning, lower learns more slowly (1e-4 or 5e-5).
        weight_decay=0.001,  # Weight-decay coefficient: penalises large weights to limit overfitting.
        fp16=False,  # Training precision.
        bf16=True,  # Training precision.
        max_grad_norm=0.3,  # Gradient clipping threshold.
        warmup_ratio=0.03,  # Learning rate ramps up from 0 to 1e-4 over the first 3% of training.
        group_by_length=True,  # Batch together samples of similar length.
        lr_scheduler_type="cosine",  # Decay the learning rate along a cosine curve.
        push_to_hub=True,  # Automatically push the model to the Hugging Face Hub after training.
        hub_model_id=hf_model_name,  # Repository name on Hugging Face (e.g. "theanhtran/bookbot-lora-v1").
        hub_private_repo=True  # Keep the repo private.
    )

    # Create trainer
    trainer = SFTTrainer(
        model=base_model,
        train_dataset=train_dataset,
        peft_config=lora_config,
        args=train_params,
    )

    # Train the model
    trainer.train()

    # Push to Hugging Face Hub
    trainer.model.push_to_hub(hf_model_name, private=True)

    # Update and save the latest step count
    completed_steps = latest_step + steps_this_session
    save_latest_step(completed_steps)

    print(f"Completed training session ({completed_steps}/{max_total_steps} steps)")
    print(f"Model saved to HuggingFace: {hf_model_name}")

    return completed_steps

#Fine-tune, save the model to Hugging Face, and log with wandb

In [None]:


# Login to Hugging Face
# The token is read from Colab's userdata secret store rather than hard-coded.
hf_token = userdata.get('HF_TOKEN')
login(hf_token, add_to_git_credential=True)
LOG_TO_WANDB = True
# Log in to Weights & Biases
wandb_api_key = userdata.get('WANDB_API_KEY')  # https://wandb.ai/site/ -> used to inspect the training run in detail
os.environ["WANDB_API_KEY"] = wandb_api_key
wandb.login()

# Configure Weights & Biases to record against our project
os.environ["WANDB_PROJECT"] = "llms_fine_tune"
os.environ["WANDB_LOG_MODEL"] = "checkpoint" if LOG_TO_WANDB else "end"
os.environ["WANDB_WATCH"] = "gradients"

# Model and repository names
BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B"
HF_USER = "tta1301"
PROJECT_NAME = "pricer"
HUB_MODEL_NAME = f"{HF_USER}/{PROJECT_NAME}"

# Load your dataset
# The train split is shuffled with a fixed seed and capped at 50,000 rows.
from datasets import load_dataset
dataset = load_dataset(f"{HF_USER}/bookpricer-data-clone")
train_data = dataset['train'].shuffle(seed=123).select(range(min(50000, len(dataset['train']))))


# LoRA goes here
# LoRA configuration
lora_parameters = LoraConfig(
    lora_alpha=16, # = 2 * r
    lora_dropout=0.1,  # Dropout
    r=8,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
)

# Create a tokenizer and a causal-LM (mlm=False) data collator.
# NOTE(review): no response template is built here, and `collator` is not passed
# to train_or_resume below — confirm whether it is still needed.
from transformers import DataCollatorForLanguageModeling
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Train or resume
current_step = train_or_resume(
    base_model_name=BASE_MODEL,
    hf_model_name=HUB_MODEL_NAME,
    train_dataset=train_data,
    lora_config=lora_parameters,
    steps_per_session=300,  # Train for 300 steps at a time (adjust as needed)
    max_total_steps=1000,   # Maximum total steps
    batch_size=1,
    grad_accum_steps=16,
    save_steps=100          # Save every 100 steps
)

# The "1000" in this message mirrors max_total_steps above; keep the two in sync.
print(f"Current training progress: {current_step}/1000 steps")

#Test our fine-tuned model

In [None]:
# Constants
# These names configure the evaluation cells below; BASE_MODEL, PROJECT_NAME and
# HF_USER are re-assigned here after the training cell defined them.

BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B"
PROJECT_NAME = "pricer"
HF_USER = "Timi1511" # your HF name here! Or use mine if you just want to reproduce my results.
# NOTE(review): the training cell uses HF_USER = "tta1301" and pushes to
# "<user>/pricer", whereas FINETUNED_MODEL below resolves to
# "<user>/pricer-<RUN_NAME>" under a different user — confirm this is the
# adapter you intend to evaluate.

# The run itself

RUN_NAME = "2025-04-16_12.53.07"
PROJECT_RUN_NAME = f"{PROJECT_NAME}-{RUN_NAME}"
REVISION = "71dc08b7f8c91e521ed510a5752f97187e415a2e" # or REVISION = None
FINETUNED_MODEL = f"{HF_USER}/{PROJECT_RUN_NAME}"

# Data
DATASET_NAME = f"{HF_USER}/bookpricer-data"

# Hyperparameters for QLoRA
# True selects the 4-bit quantization config in the cell below; False selects 8-bit.

QUANT_4_BIT = True

%matplotlib inline

# Used for writing to output in color
# ANSI escape sequences; COLOR_MAP keys match the names returned by Tester.color_for.

GREEN = "\033[92m"
YELLOW = "\033[93m"
RED = "\033[91m"
RESET = "\033[0m"
COLOR_MAP = {"red":RED, "orange": YELLOW, "green": GREEN}

In [None]:
# Login to Hugging face
# Reads the HF_TOKEN secret from Colab's userdata store and logs in with it.

hf_token = userdata.get('HF_TOKEN')
login(hf_token, add_to_git_credential=True)

In [None]:
# Load the evaluation dataset named by DATASET_NAME and keep handles to both splits.
dataset = load_dataset(DATASET_NAME)
train = dataset['train']
test = dataset['test']

In [None]:
# Display the first test record to inspect its fields.
test[0]

In [None]:
# pick the right quantization
# QUANT_4_BIT selects the 4-bit NF4 setup; otherwise the 8-bit setup is used.
# NOTE(review): bnb_8bit_compute_dtype does not appear among the documented
# BitsAndBytesConfig parameters — confirm that it has any effect.

quant_config = (
    BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4",
    )
    if QUANT_4_BIT
    else BitsAndBytesConfig(
        load_in_8bit=True,
        bnb_8bit_compute_dtype=torch.bfloat16,
    )
)

In [None]:
# Load the Tokenizer and the Model
from peft import PeftModel

# Tokenizer: pad on the right and reuse the EOS token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# Quantized base model; generation uses the tokenizer's pad token id.
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    quantization_config=quant_config,
)
base_model.generation_config.pad_token_id = tokenizer.pad_token_id

# Load the fine-tuned model with PEFT.
# `revision` is only forwarded when REVISION is set (truthy).
fine_tuned_model = PeftModel.from_pretrained(
    base_model,
    FINETUNED_MODEL,
    **({"revision": REVISION} if REVISION else {}),
)

print(f"Memory footprint: {fine_tuned_model.get_memory_footprint() / 1e6:.1f} MB")

In [None]:
# Display the loaded model object to inspect its structure.
fine_tuned_model

In [None]:
def extract_price(s):
    """Parse the first number that follows the marker "Price is $" in ``s``.

    Only the text between the first and (if present) second occurrence of the
    marker is searched. Commas are stripped before matching, so "1,234.50" is
    read as 1234.5.

    Args:
        s: Text to search, e.g. a decoded model response.

    Returns:
        float: The parsed number, or 0.0 when the marker is absent or no number
        follows it. The result is always a float.
    """
    marker = "Price is $"
    if marker not in s:
        return 0.0
    contents = s.split(marker)[1].replace(',', '')
    match = re.search(r"[-+]?\d*\.\d+|\d+", contents)
    return float(match.group()) if match else 0.0

In [None]:
# Sanity check: no number directly follows the marker, so the first later number (999) is parsed.
extract_price("Price is $abc fand $999")

In [None]:
# Original prediction function: generate a few tokens and parse the price from the decoded text

def model_predict(prompt, device="cuda"):
    """Predict a price by generating up to 3 new tokens and parsing the decoded text.

    Args:
        prompt: Prompt text to encode and feed to the fine-tuned model.
        device: Device for the input tensors; defaults to "cuda", matching
            improved_model_predict.

    Returns:
        The value extract_price() parses from the decoded output sequence.
    """
    # Fixed seed so repeated calls with the same prompt give the same result.
    set_seed(123)
    inputs = tokenizer.encode(prompt, return_tensors="pt").to(device)
    # Single unpadded prompt, so every position is attended to.
    attention_mask = torch.ones(inputs.shape, device=device)
    outputs = fine_tuned_model.generate(inputs, attention_mask=attention_mask, max_new_tokens=3, num_return_sequences=1)
    response = tokenizer.decode(outputs[0])
    return extract_price(response)

In [None]:
# An improved prediction function takes a weighted average of the top 3 choices
# This code would be more complex if we couldn't take advantage of the fact
# That Llama generates 1 token for any 3 digit number
import torch
import torch.nn.functional as F

top_K = 3

def improved_model_predict(prompt, device="cuda"):
    """Estimate a price as the probability-weighted mean of the top-K next tokens.

    The model is run once on the prompt. The ``top_K`` most probable next tokens
    are decoded, and those that parse as a positive, finite number are averaged
    using their probabilities, renormalised over the kept candidates.

    Args:
        prompt: Prompt text to encode and feed to the fine-tuned model.
        device: Device for the input tensors; defaults to "cuda".

    Returns:
        float: The weighted average, or 0.0 when none of the top-K tokens is a
        usable number.
    """
    set_seed(123)
    inputs = tokenizer.encode(prompt, return_tensors="pt").to(device)
    attention_mask = torch.ones(inputs.shape, device=device)

    with torch.no_grad():
        outputs = fine_tuned_model(inputs, attention_mask=attention_mask)
        # Logits at the last position, i.e. the distribution over the next token.
        next_token_logits = outputs.logits[:, -1, :].to('cpu')

    next_token_probs = F.softmax(next_token_logits, dim=-1)
    top_prob, top_token_id = next_token_probs.topk(top_K)

    prices, weights = [], []
    for i in range(top_K):
        predicted_token = tokenizer.decode(top_token_id[0][i])
        try:
            candidate = float(predicted_token)
        except ValueError:
            continue
        # float() also accepts strings such as "inf" and "nan"; keep real prices only.
        if candidate > 0 and math.isfinite(candidate):
            prices.append(candidate)
            weights.append(top_prob[0][i].item())

    if not prices:
        # Return a single float, as on the success path. The earlier tuple
        # (0.0, 0.0) broke callers that compute abs(guess - truth).
        return 0.0

    total = sum(weights)
    return sum(price * weight / total for price, weight in zip(prices, weights))

In [None]:
class Tester:
    """Run a price predictor over the first ``size`` datapoints and report accuracy.

    Each datapoint is indexed as ``data[i]`` and must provide a ``"text"`` prompt
    and a ``"price"`` ground truth. For every datapoint the absolute error and
    the squared log error are recorded, a colour-coded line is printed, and at
    the end a scatter chart of estimates against ground truth is shown.
    """

    def __init__(self, predictor, data, title=None, size=300):
        """Store the predictor and data; ``size`` is capped at ``len(data)``."""
        self.predictor = predictor
        self.data = data
        self.title = title or predictor.__name__.replace("_", " ").title()
        # Never ask for more datapoints than exist, otherwise run() would raise
        # IndexError part-way through on small datasets.
        self.size = min(size, len(data))
        self.guesses = []
        self.truths = []
        self.errors = []
        self.sles = []
        self.colors = []

    def color_for(self, error, truth):
        """Classify an error as "green", "orange" or "red" by absolute or relative size."""
        # A zero truth has no meaningful relative error; treat the ratio as infinite
        # so only the absolute thresholds apply instead of raising ZeroDivisionError.
        relative_error = error / truth if truth else float("inf")
        if error < 40 or relative_error < 0.2:
            return "green"
        elif error < 80 or relative_error < 0.4:
            return "orange"
        else:
            return "red"

    def run_datapoint(self, i):
        """Predict datapoint ``i``, record its metrics, and print a coloured summary line."""
        datapoint = self.data[i]
        guess = self.predictor(datapoint["text"])
        truth = datapoint["price"]
        error = abs(guess - truth)
        log_error = math.log(truth+1) - math.log(guess+1)
        sle = log_error ** 2
        color = self.color_for(error, truth)
        # The label is the second blank-line-separated section of the prompt; fall
        # back to the first section when the text has no such separator.
        sections = datapoint["text"].split("\n\n")
        title = (sections[1] if len(sections) > 1 else sections[0])[:20] + "..."
        self.guesses.append(guess)
        self.truths.append(truth)
        self.errors.append(error)
        self.sles.append(sle)
        self.colors.append(color)
        print(f"{COLOR_MAP[color]}{i+1}: Guess: ${guess:,.2f} Truth: ${truth:,.2f} Error: ${error:,.2f} SLE: {sle:,.2f} Item: {title}{RESET}")

    def chart(self, title):
        """Scatter the estimates against ground truth, with the y = x line for reference."""
        plt.figure(figsize=(12, 8))
        max_val = max(max(self.truths), max(self.guesses))
        plt.plot([0, max_val], [0, max_val], color='deepskyblue', lw=2, alpha=0.6)
        plt.scatter(self.truths, self.guesses, s=3, c=self.colors)
        plt.xlabel('Ground Truth')
        plt.ylabel('Model Estimate')
        plt.xlim(0, max_val)
        plt.ylim(0, max_val)
        plt.title(title)
        plt.show()

    def report(self):
        """Summarise mean error, RMSLE and green-hit rate in the chart title."""
        # Average over what was actually evaluated, and skip the chart when
        # nothing was evaluated (avoids division by zero and max() of an empty list).
        evaluated = len(self.errors)
        if evaluated == 0:
            print(f"{self.title}: no datapoints were evaluated")
            return
        average_error = sum(self.errors) / evaluated
        rmsle = math.sqrt(sum(self.sles) / evaluated)
        hits = sum(1 for color in self.colors if color=="green")
        title = f"{self.title} Error=${average_error:,.2f} RMSLE={rmsle:,.2f} Hits={hits/evaluated*100:.1f}%"
        self.chart(title)

    def run(self):
        """Evaluate the first ``self.size`` datapoints, then produce the report."""
        # Kept for backward compatibility; this attribute is not read in this class.
        self.error = 0
        for i in range(self.size):
            self.run_datapoint(i)
        self.report()

    @classmethod
    def test(cls, function, data, title=None, size=300):
        """Build a Tester for ``function`` on ``data`` and run it."""
        cls(function, data, title=title, size=size).run()

In [None]:
# Evaluate the weighted top-K predictor on the test split using Tester's defaults.
Tester.test(improved_model_predict, test)