# Classification: Food, Service, Delivery, Ambience, Price

In [None]:
# Mount Google Drive inside the Colab VM; a later cell reloads a fine-tuned
# checkpoint from a path under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install iterative-stratification

In [None]:
from transformers import Trainer, TrainingArguments
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import AutoTokenizer
from torch.utils.data import Dataset
from transformers import AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
import torch
import pandas as pd
import os
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [None]:
import ast

# Load the labelled comments (columns used below: "comment" and "labels").
df = pd.read_csv("/content/label.csv")

# The "labels" column stores each label list as a string such as "['food', 'price']".
# ast.literal_eval only accepts Python literals, so a malformed or malicious cell
# cannot execute arbitrary code the way the built-in eval() would.
df["labels"] = df["labels"].apply(ast.literal_eval)

# Multi-hot encode the label lists: one column per distinct label.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df["labels"])

# Raw comment strings, in the same row order as y.
texts = df["comment"].astype(str).tolist()

In [None]:
# PhoBERT tokenizer; use_fast=False requests the slow (Python) implementation.
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)

# truncation=True on its own does nothing for this tokenizer: it declares no
# predefined maximum length, so the library warns and falls back to "no truncation".
# PhoBERT accepts at most 256 sub-word tokens, so cap the length explicitly.
encodings = tokenizer(
    texts,
    truncation=True,
    padding=True,
    max_length=256,
    return_tensors="pt",
)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
class CommentDataset(Dataset):
    """Map-style dataset pairing tokenizer output with multi-label targets.

    Args:
        encodings: mapping of field name -> indexable batch (e.g. tensors);
            each field is indexed with the sample index in ``__getitem__``.
        labels: array-like of per-sample label rows; converted once to a
            float tensor.
    """

    def __init__(self, encodings, labels):
        self.labels = torch.tensor(labels, dtype=torch.float)
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return one sample: every encoding field at ``idx`` plus its ``labels`` row."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample["labels"] = self.labels[idx]
        return sample

    def __len__(self):
        """Number of samples, i.e. the number of label rows."""
        return len(self.labels)

# Dataset over the full corpus (all comments with their multi-hot labels).
# NOTE(review): the training cells below build train_dataset / eval_dataset from a
# stratified fold instead; `dataset` is not referenced by them — confirm it is needed.
dataset = CommentDataset(encodings, y)

In [None]:
# PhoBERT encoder with a new classification head: one output per column of y.
# problem_type="multi_label_classification" selects the multi-label setup, so each
# output is an independent label (predict_labels applies a per-label sigmoid).
model = AutoModelForSequenceClassification.from_pretrained(
    "vinai/phobert-base",
    num_labels=y.shape[1],
    problem_type="multi_label_classification"
)

pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Training configuration for the aspect classifier.
# NOTE(review): no evaluation strategy is configured here, so the eval set passed to
# the Trainer is presumably only used if trainer.evaluate() is called — confirm.
training_args = TrainingArguments(
    output_dir="./results",                # where checkpoints are written
    num_train_epochs=10,                    # number of epochs
    per_device_train_batch_size=8,        # training batch size
    per_device_eval_batch_size=8,         # evaluation batch size
    logging_dir="./logs",                  # where logs are written
    logging_steps=100,                     # log every 100 steps
    save_total_limit=1,                    # keep only the most recent checkpoint
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
# Multi-label stratified 5-fold splitter; the fixed seed keeps the split reproducible.
mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Only the first fold is used, giving a single ~80/20 train/eval split.
train_idx, val_idx = next(iter(mskf.split(texts, y)))
X_train = [texts[i] for i in train_idx]
X_eval = [texts[i] for i in val_idx]
y_train, y_eval = y[train_idx], y[val_idx]

# Tokenize each split separately. max_length is required for truncation to take
# effect: this tokenizer declares no default maximum, so truncation=True alone is a
# no-op, while PhoBERT accepts at most 256 sub-word tokens.
train_encodings = tokenizer(
    X_train, padding=True, truncation=True, max_length=256, return_tensors="pt"
)
eval_encodings = tokenizer(
    X_eval, padding=True, truncation=True, max_length=256, return_tensors="pt"
)

# Wrap encodings and labels for the Trainer.
train_dataset = CommentDataset(train_encodings, y_train)
eval_dataset = CommentDataset(eval_encodings, y_eval)

In [None]:
# Use the GPU when one is visible to torch, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)

In [None]:
# Trainer for the aspect classifier, wired to the fold-0 train/eval datasets.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset  # <-- required if eval_strategy != "no"
)

In [None]:
# Fine-tune the aspect classifier; returns a TrainOutput with the final step count and loss.
trainer.train()

Step,Training Loss
100,0.4731
200,0.2819
300,0.166
400,0.1099
500,0.0799


TrainOutput(global_step=510, training_loss=0.21914206965296876, metrics={'train_runtime': 3299.1153, 'train_samples_per_second': 1.231, 'train_steps_per_second': 0.155, 'total_flos': 106408676911320.0, 'train_loss': 0.21914206965296876, 'epoch': 10.0})

In [None]:
def predict_labels(texts, threshold=0.5, batch_size=32, max_length=256):
    """Predict multi-label aspect indicators for one or more texts.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        texts: a single string or an iterable of strings.
        threshold: a label is predicted when its sigmoid probability is strictly
            greater than this value (default 0.5, the original behaviour).
        batch_size: number of texts sent through the model per forward pass, so
            memory use stays bounded for large inputs.
        max_length: token cap used for truncation; without an explicit value
            ``truncation=True`` has no effect for a tokenizer that declares no
            maximum length.

    Returns:
        Integer NumPy array of shape ``(len(texts), num_labels)`` holding 0/1 flags.
        Empty input yields an array of shape ``(0, num_labels)``.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    # A bare string must become a one-element batch; slicing it below would
    # otherwise split it into characters.
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    if not texts:
        # The tokenizer cannot encode an empty batch; return an empty result instead.
        return torch.empty((0, model.config.num_labels), dtype=torch.int32).numpy()

    chunks = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            inputs = tokenizer(
                batch,
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors="pt",
            )
            inputs = {k: v.to(device) for k, v in inputs.items()}
            probs = torch.sigmoid(model(**inputs).logits)  # independent per-label probabilities
            chunks.append((probs > threshold).int().cpu())
    return torch.cat(chunks).numpy()

In [None]:
# Rebind `model` to a previously saved checkpoint on Google Drive; predict_labels
# reads the global `model`, so it uses this checkpoint from here on.
# NOTE(review): "checkpoint-9201" does not correspond to the 510-step run shown
# above, so this presumably comes from a different (longer) training run — confirm.
# The commented arguments are left disabled; presumably the saved checkpoint config
# already carries num_labels / problem_type — verify.
model = AutoModelForSequenceClassification.from_pretrained(
    "/content/drive/MyDrive/GRAB/results/checkpoint-9201",
    # num_labels=5,
    # problem_type="multi_label_classification"
)

In [None]:
# Load the raw reviews; the "review_text" column is segmented into clauses below.
reviews = pd.read_csv("/content/reviews.csv")

  reviews = pd.read_csv("/content/reviews.csv")


In [None]:
import re
import unicodedata

# Vietnamese connectives on which a review is split into clauses.
# Duplicate entries from the original list ("bởi vì", "do đó", "tuy nhiên") were removed.
# NOTE(review): "dù chơ" looks like a typo for "dù cho" — confirm before changing it.
conjs = [
    "nhưng", "mà", "bởi vì", "tuy nhiên", "và", "hay", "hoặc", "hoặc là",
    "cũng như", "vì vậy", "do đó", "thế nhưng", "thế mà", "dù sao đi nữa",
    "cũng như là", "lẫn", "cùng", "ngoài ra", "vì", "nên", "vậy", "nếu",
    "trừ phi", "trừ khi", "hễ", "hơn", "bằng", "như", "mặc dù", "dù chơ",
    "sau khi", "khi", "để nhằm", "mục đích", "ngược lại",
]

# Regex alternation takes the first alternative that matches at a position, so
# longer connectives must come before their prefixes ("hoặc là" before "hoặc",
# "cũng như là" before "cũng như"); otherwise the tail ("là") leaks into the next
# segment. Literals are NFC-normalised to match the normalised input text.
_conj_alternatives = sorted(
    {unicodedata.normalize("NFC", c) for c in conjs},
    key=lambda c: (-len(c), c),
)

# Separator: a run of punctuation OR a whole-word connective, plus trailing whitespace.
pattern = r"(?:[\.!\?;,]+|\b(?:" + "|".join(map(re.escape, _conj_alternatives)) + r")\b)\s*"


def segment_sentences(text):
    """Split a review into clause-like segments.

    The text is cut at punctuation runs (. ! ? ; ,) and at the connectives listed
    in ``conjs``; the separators themselves are dropped.

    Args:
        text: the review; non-string values are converted with ``str``.
            ``None`` and float NaN (missing values) produce no segments.

    Returns:
        List of non-empty, stripped segments in NFC-normalised form.
    """
    # Missing values would otherwise be stringified to "None"/"nan" and kept as a segment.
    if text is None or (isinstance(text, float) and text != text):
        return []
    # Decomposed (NFD) input would never match the precomposed connective literals.
    normalized = unicodedata.normalize("NFC", str(text))
    pieces = (piece.strip() for piece in re.split(pattern, normalized))
    return [piece for piece in pieces if piece]

# Example: segment the first ten reviews; each element of `test` is a list of segments.
test = reviews["review_text"].iloc[:10].apply(segment_sentences)

In [None]:
# Flatten the per-review segment lists into one list of segments to classify.
# Note: this rebinds `texts`, which earlier held the training comments.
texts = [component for components in test for component in components]

In [None]:
# Sample inputs kept for quick manual checks:
# test_comments = [
#     "Món ăn dở, giao hàng rất chậm",
#     "Giá cao mà phục vụ thì tệ",
#     "Đóng gói đẹp, nhân viên thân thiện"
# ]

# Label name per output column.
# NOTE(review): assumed to match the column order the model was trained with
# (e.g. mlb.classes_) — confirm.
label_names = ["ambience", "delivery", "food", "price", "service"]

predictions = predict_labels(texts)

# Print every segment followed by the names of the labels flagged for it.
for text, pred in zip(texts, predictions):
    labels = tuple(
        label_names[pos] for pos in range(len(pred)) if pred[pos] == 1
    )
    print(f"\n📝 {text}\nLabels: {labels}")


📝 nói chung là quá dỡ quá dơ
Labels: ('ambience',)

📝 đặt gà bơ tỏi nhưng lại giao gà thường
Labels: ('delivery', 'service')

📝 Đóng gói chưa tốt
Labels: ('service',)

📝 Sai/Thiếu món
Labels: ('food', 'service')

📝 nước đâu
Labels: ('food',)

📝 lúc nhìn đơn nên để ý chút nha :)
Labels: ('service',)

📝 gà ngon
Labels: ('food',)

📝 đến nơi vẫn còn nóng
Labels: ('ambience',)

📝 không có dưa chua
Labels: ('service',)

📝 chỉ có mỗi túi tương cà
Labels: ('service',)

📝 có đặt thêm phần cải thìa nhưng ko thấy đâu
Labels: ('service',)

📝 nhà hàng làm món quá lau mình đợi hơn 1 tieng đong hồ mới có món an
Labels: ('food',)

📝 đói mà cào ruot lun
Labels: ('delivery',)

📝 com thì khẩu phan ít cục bò nhỏ xíu như cho e bé an
Labels: ('food', 'price')

📝 Không hợp khẩu vị
Labels: ('food',)

📝 thức ăn nhạt
Labels: ('food',)

📝 hình như không có mắm tỏi ăn nhạt toẹt
Labels: ('delivery', 'food', 'service')

📝 quá nhiều dầu
Labels: ('food',)

📝 đồ chua chưa đủ chua
Labels: ('delivery', 'food')

📝 23k đ

# Classification: Positive, Negative, Neutral

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
import pandas as pd
import torch
import os
# Switch off Weights & Biases reporting for the Trainer runs.
# NOTE(review): the training cell prints a warning that this variable is deprecated
# and will be removed in v5, pointing to the report_to setting instead.
os.environ["WANDB_DISABLED"] = "true"


In [None]:
# 1. Load and prepare the data (columns: 'components', 'sentiment').
df = pd.read_csv("label_sentiment.csv")
df = df.dropna(subset=["sentiment"])  # rows without a sentiment cannot be used

# Class ids for the 3-way sentiment head.
# NOTE(review): the fine-tuned checkpoint loaded below may ship its own id2label
# order — confirm this mapping is the intended one.
label_map = {"NEG": 0, "NEU": 1, "POS": 2}

# Series.map turns any value missing from label_map into NaN, which would silently
# make label_id a float column and corrupt training; fail loudly instead.
unknown = df.loc[~df["sentiment"].isin(list(label_map)), "sentiment"].unique().tolist()
if unknown:
    raise ValueError(f"Unexpected sentiment values (not in label_map): {unknown}")

df["label_id"] = df["sentiment"].map(label_map).astype("int64")

In [None]:
# Display the labelled frame to eyeball the sentiment -> label_id mapping.
df

Unnamed: 0,components,sentiment,label_id
0,Ngon xỉu,POS,2
1,Đáng đồng tiền,POS,2
2,Sạch sẽ,POS,2
3,Đóng gói tốt,POS,2
5,Làm món nhanh,POS,2
...,...,...,...
1097,Tuy nhiên cần chú ý đến ghi chú của khách,NEU,1
1098,Bánh hơi béo,NEU,1
1099,hơi đắt 1 tí,NEU,1
1100,do ăn hơi đắt nha so với quảng cáo 1 khoanh cá...,NEU,1


In [None]:
# 2. Train/validation split: 80/20, stratified on label_id so both parts keep the
# same class proportions; random_state fixes the shuffle for reproducibility.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["components"].tolist(),
    df["label_id"].tolist(),
    test_size=0.2,
    stratify=df["label_id"],
    random_state=42
)

In [None]:
# 3. Tokenizer
# Hub id of the sentiment checkpoint used as the starting point; the same id is
# used to load the model below. This rebinds the global `tokenizer`.
model_id = "wonrax/phobert-base-vietnamese-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/285 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/17.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [None]:
# 4. Dataset class
class SentimentDataset(Dataset):
    """Map-style dataset of tokenized texts with integer sentiment labels.

    All texts are tokenized once at construction with the module-level
    ``tokenizer`` (padded, truncated to 128 tokens); tensors are created
    lazily, one sample at a time.
    """

    def __init__(self, texts, labels):
        self.labels = labels
        self.encodings = tokenizer(texts, padding=True, truncation=True, max_length=128)

    def __len__(self):
        """Number of samples (one label per text)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as tensors: each encoding field plus ``labels``."""
        sample = {}
        for name, column in self.encodings.items():
            sample[name] = torch.tensor(column[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Tokenize each split once; these rebind train_dataset from the aspect-classification section.
train_dataset = SentimentDataset(train_texts, train_labels)
val_dataset = SentimentDataset(val_texts, val_labels)

In [None]:
# 5. Load model
# Three outputs, one per entry of label_map (NEG / NEU / POS). This rebinds the
# global `model`, which predict_labels also reads.
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=3)

In [None]:
pwd

'/content'

In [None]:
# 6. Training
# NOTE(review): output_dir is the same "./results" used by the aspect-classifier
# run; with save_total_limit=1 the two runs may rotate each other's checkpoints —
# confirm, or give this run its own directory.
training_args = TrainingArguments(
    output_dir="./results",                # where checkpoints are written
    num_train_epochs=10,                    # number of epochs
    per_device_train_batch_size=8,        # training batch size
    per_device_eval_batch_size=8,         # evaluation batch size
    logging_dir="./logs",                  # where logs are written
    logging_steps=100,                     # log every 100 steps
    save_total_limit=1,                    # keep only the most recent checkpoint
)

# NOTE(review): no evaluation strategy is set above, so val_dataset is presumably
# only used if trainer.evaluate() is called explicitly — confirm.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss
100,0.6433
200,0.2895
300,0.2305
400,0.1167
500,0.0699
600,0.0492
700,0.0425
800,0.045
900,0.0106
1000,0.0254


TrainOutput(global_step=1100, training_loss=0.14025829141790216, metrics={'train_runtime': 5833.8028, 'train_samples_per_second': 1.508, 'train_steps_per_second': 0.189, 'total_flos': 140190117883200.0, 'train_loss': 0.14025829141790216, 'epoch': 10.0})