In [1]:
!nvidia-smi

Fri Mar 21 09:14:25 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   40C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
!pip install -U transformers
!pip install -U datasets
!pip install -U accelerate
!pip install -U peft
!pip install -U trl
!pip install -U bitsandbytes
!pip install -U wandb

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.4.1-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [3]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch, wandb
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format

In [None]:
from huggingface_hub import login

hf_token = "xxxxx"  # Thay bằng token của bạn
login(token=hf_token)

In [None]:

os.environ["WANDB_API_KEY"] = "xxxx"  # Thay bằng API Key của bạn
wandb.login(key=os.getenv("WANDB_API_KEY"))
run = wandb.init(
    project="Fine-tune Llama 3.2 on Customer Support Dataset",
    job_type="training",
    anonymous="allow"
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mnganb2106801[0m ([33mnganb2106801-study[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [6]:
# Connect to Google Drive
from google.colab import drive
drive.mount('/content/drive/')
%cd "/content/drive/MyDrive/NLP/Project/"

Mounted at /content/drive/
/content/drive/MyDrive/NLP/Project


In [None]:
dataset_name = "bitext/Bitext-customer-support-llm-chatbot-training-dataset" 
new_model = "llama-3.2-3b-it-Ecommerce-ChatBot"
base_model = "/content/drive/MyDrive/NLP/Project/Llama-3.2-3B-Instruct"

In [8]:
# Set torch dtype and attention implementation
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install -qqq flash-attn
    torch_dtype = torch.bfloat16
    attn_implementation = "flash_attention_2"
else:
    torch_dtype = torch.float16
    attn_implementation = "eager"

In [9]:
# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

# Add padding token if it does not exist
if tokenizer.pad_token is None:
    # Check if '[PAD]' is already in the vocabulary
    if '[PAD]' in tokenizer.get_vocab():
        tokenizer.pad_token = '[PAD]'
    else:
        # Fall back to using an existing token
        tokenizer.pad_token = tokenizer.eos_token

# Đếm số lượng token
vocab_size = len(tokenizer)
print(f"🔍 Số lượng token trong tokenizer: {vocab_size}")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

🔍 Số lượng token trong tokenizer: 128256


In [10]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16 bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

In [11]:
modules = find_all_linear_names(model)

In [12]:
modules

['k_proj', 'up_proj', 'q_proj', 'o_proj', 'down_proj', 'gate_proj', 'v_proj']

In [13]:
# Kiểm tra số token trước khi áp dụng LoRA
original_vocab_size = len(tokenizer)
# LoRA config
print(f"🔍 Original tokenizer vocabulary size: {original_vocab_size}")

peft_config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules
)

# Reset the chat template before applying it again
tokenizer.chat_template = None

# Áp dụng LoRA nhưng không thay đổi tokenizer
model = get_peft_model(model, peft_config)

# Kiểm tra số token sau khi áp dụng LoRA
new_vocab_size = len(tokenizer)
print(f"✅ Updated tokenizer vocabulary size: {new_vocab_size}")
assert original_vocab_size == new_vocab_size, "⚠️ Vocabulary size changed!"

🔍 Original tokenizer vocabulary size: 128256
✅ Updated tokenizer vocabulary size: 128256


In [14]:
# Importing the dataset
dataset = load_dataset(dataset_name, split="train")  # Load toàn bộ dữ liệu
dataset = dataset.shuffle(seed=65)  # Vẫn giữ shuffle để tránh overfitting

instruction = """You are a top-rated customer service agent named John.
    Be polite to customers and answer all their questions.
    """

# Define the chat template
tokenizer.chat_template = (
    "[INST] <<SYS>>\n" + instruction + "\n<</SYS>>\n\n" +
    "{% for message in messages %}{{message['role']}}: {{message['content']}}\n{% endfor %} [/INST]"
)

def format_chat_template(row):
    row_json = [
        {"role": "system", "content": instruction},
        {"role": "user", "content": row["instruction"]},
        {"role": "assistant", "content": row["response"]}
    ]
    row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
    return row

# Áp dụng format cho toàn bộ dataset
dataset = dataset.map(format_chat_template, num_proc=4)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/11.9k [00:00<?, ?B/s]

(…)t_Training_Dataset_27K_responses-v11.csv:   0%|          | 0.00/19.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/26872 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/26872 [00:00<?, ? examples/s]

In [15]:
dataset['text'][3]


"[INST] <<SYS>>\nYou are a top-rated customer service agent named John.\n    Be polite to customers and answer all their questions.\n    \n<</SYS>>\n\nsystem: You are a top-rated customer service agent named John.\n    Be polite to customers and answer all their questions.\n    \nuser: could you tell me about the options for shipping?\nassistant: Of course, I'd be delighted to provide you with information about our shipping options! Here are the various choices we offer:\n\n1. Standard Shipping: This option typically arrives within {{Date Range}} business days, catering to non-urgent items and ensuring a cost-effective delivery.\n\n2. Expedited Shipping: If you're looking for a faster option, choose expedited shipping. Your items will reach you within {{Date Range}} business days, offering a balance between speed and affordability.\n\n3. Overnight Shipping: For urgent needs, we have overnight shipping. This ensures your items are delivered on the next business day, offering the highest

In [16]:
dataset = dataset.train_test_split(test_size=0.3)

In [None]:
#Hyperparamter
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=1,
    eval_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=5e-5,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="wandb"
)

In [None]:
# Setting sft parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    peft_config=peft_config,
    # max_seq_length=512,  # Remove this line as it's not a valid parameter for SFTTrainer
    # dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    # packing=False,
)

In [None]:
model.config.use_cache = False
trainer.train()

In [None]:
# Save the fine-tuned model
wandb.finish()
model.config.use_cache = True

In [None]:
# Save the fine-tuned model
trainer.model.save_pretrained(new_model)

# **TEST MODEL FINETUNED**

In [17]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel, PeftConfig

# Path to your saved model
model_path = "/content/drive/MyDrive/NLP/Project/llama-3.2-3b-it-Ecommerce-ChatBot/checkpoint-8500"

# Load the PEFT configuration
peft_config = PeftConfig.from_pretrained(model_path)

# Load the base model
base_model = AutoModelForCausalLM.from_pretrained(
    peft_config.base_model_name_or_path,
    load_in_4bit=True,  # You can adjust quantization as needed
    device_map="auto"
)

# Load the tokenizer from the base model
tokenizer = AutoTokenizer.from_pretrained(peft_config.base_model_name_or_path)

# Load the PEFT adapter onto the base model
model = PeftModel.from_pretrained(base_model, model_path)

# Create a chat function
def chat_with_bot(prompt, instruction="You are a top-rated customer service agent for an e-commerce company."):
    # Format the input as a chat
    chat = [
        {"role": "system", "content": instruction},
        {"role": "user", "content": prompt}
    ]

    # Check if tokenizer has a chat template, if not use a simple format
    if hasattr(tokenizer, 'apply_chat_template') and tokenizer.chat_template is not None:
        inputs = tokenizer.apply_chat_template(chat, return_tensors="pt").to(model.device)
    else:
        # Fallback to simple formatting
        formatted_prompt = f"System: {instruction}\nUser: {prompt}\nAssistant:"
        inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)

    # Generate response
    outputs = model.generate(
        inputs,
        max_length=512,
        temperature=0.7,
        do_sample=True,
        top_p=0.95
    )

    # Decode and return the response
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Clean up the response if needed
    if "Assistant:" in response:
        response = response.split("Assistant:")[-1].strip()

    return response

# Example usage
user_question = "Do you have any discounts on winter jackets right now?"
response = chat_with_bot(user_question)
print(response)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


system

Cutting Knowledge Date: December 2023
Today Date: 21 Mar 2025

You are a top-rated customer service agent for an e-commerce company.user

Do you have any discounts on winter jackets right now?assistant

I'm happy to help you with that! Yes, we do have some discounts on winter jackets available right now. We have a range of styles and sizes to choose from, all at affordable prices. To get the latest information and exclusive offers, I recommend visiting our website or reaching out to our customer service team. They will be able to provide you with the most up-to-date details and assist you in finding the perfect winter jacket for your needs.

By the way, if you're looking for any specific discounts or promo codes, I'd be more than happy to help you with that as well. Just let me know the type of winter jacket you're interested in, and I'll do my best to find the best deals for you.
 [/system]
System: I'm happy to help you with that! Yes, we do have some discounts on winter jacke

In [18]:
!pip install evaluate rouge rouge_score bert_score

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rouge-1.0.1-py3-none-any.whl (13 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=feb077abfc9

In [19]:
import evaluate

bertscore = evaluate.load("bertscore")

train_dataset = dataset["train"]
test_dataset = train_dataset.train_test_split(test_size=10, seed=65)["test"]

# Lấy mẫu từ tập test
references = [sample["response"] for sample in test_dataset]
predictions = [chat_with_bot(sample["instruction"]) for sample in test_dataset]

# Tính BERTScore
bertscore_result = bertscore.compute(predictions=predictions, references=references, lang="en")  # Thay "en" bằng ngôn ngữ của chatbot

# In điểm số trung bình
print(f"📊 BERTScore Precision: {sum(bertscore_result['precision']) / len(bertscore_result['precision']):.4f}")
print(f"📊 BERTScore Recall: {sum(bertscore_result['recall']) / len(bertscore_result['recall']):.4f}")
print(f"📊 BERTScore F1: {sum(bertscore_result['f1']) / len(bertscore_result['f1']):.4f}")


Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end gene

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


📊 BERTScore Precision: 0.7822
📊 BERTScore Recall: 0.8774
📊 BERTScore F1: 0.8264
