In [None]:
# One-time dependency installation for the notebook session.
# A marker file under /kaggle/working records that the install step has already run,
# so re-executing this cell does not trigger pip again.
import os
if not os.path.exists('/kaggle/working/install_done.txt'):
    # Message (Vietnamese): "First run, installing the required libraries..."
    print("--- Lần đầu chạy, đang cài đặt các thư viện cần thiết...")
    # Every package is pinned to an exact version.
    # NOTE(review): the shell escape's exit status is not checked, so the marker below is
    # written even if pip fails; a failed install would then be skipped on the next run.
    # NOTE(review): `%pip` is the usual way to target the kernel's own environment — consider switching.
    !pip install --quiet \
        "torch==2.3.1" "torchvision==0.18.1" "torchaudio==2.3.1" \
        "lightning[pytorch-extra]==2.2.5" \
        "scikit-learn==1.4.2" \
        "torchmetrics==0.11.4" \
        "transformers==4.42.3"
    # Drop the marker file so later executions take the `else` branch.
    with open('/kaggle/working/install_done.txt', 'w') as f:
        f.write('OK')
    # Message (Vietnamese): "Installation complete."
    print("--- Cài đặt hoàn tất.")
else:
    # Message (Vietnamese): "Libraries already installed. Starting the main program..."
    print("--- Thư viện đã được cài đặt. Bắt đầu chạy chương trình chính...")

In [None]:
import json, itertools, torch
from torch.utils.data import Dataset, DataLoader
from torch.nn import Linear, CrossEntropyLoss
from torch.optim import AdamW
import lightning.pytorch as pl
from lightning.pytorch import Trainer
from lightning.pytorch.callbacks import ModelCheckpoint, EarlyStopping
from torchmetrics.classification import MulticlassF1Score
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, jaccard_score, accuracy_score, hamming_loss
from lightning.pytorch.loggers import CSVLogger

ENTITY_CLASSES = ["kol", "product"]
ATTRIBUTE_CLASSES = ["color", "performance", "packaging", "texture", "price", "ingredients", "personality", "appearance", "skill", "authenticity", "brand_collaboration", "null"]
SENTIMENT_CLASSES = ["positive", "negative", "neutral"]
_classes = list(itertools.product(ENTITY_CLASSES, ATTRIBUTE_CLASSES, SENTIMENT_CLASSES))
ID2LABEL = {index: "#".join(_class) for index, _class in enumerate(_classes)}
LABEL2ID = {v: k for k, v in ID2LABEL.items()}
NUM_CLASSES = len(ID2LABEL)

In [None]:
class XLMRobertaCommentClassifier(pl.LightningModule):
    """Lightning module that predicts a fixed number of class labels per comment.

    The "uitnlp/CafeBERT" encoder produces a representation of the input; a single
    linear head maps the hidden state at sequence position 0 to
    ``num_classes * num_predictions`` logits, which are then treated as
    ``num_predictions`` independent ``num_classes``-way classifications.

    Args:
        num_classes: Number of classes for each prediction slot.
        num_predictions: Number of prediction slots per input.
        lr: Learning rate passed to AdamW.
    """

    def __init__(self, num_classes: int, num_predictions: int = 5, lr: float = 2e-5):
        super().__init__()
        # Stores num_classes / num_predictions / lr on self.hparams (and in checkpoints).
        self.save_hyperparameters()
        self.lm = AutoModel.from_pretrained("uitnlp/CafeBERT")
        # Size the head from the encoder's own config instead of a hard-coded 1024,
        # so the layer always matches the width of `last_hidden_state`.
        hidden_size = self.lm.config.hidden_size
        self.cls = Linear(hidden_size, self.hparams.num_classes * self.hparams.num_predictions)
        # Targets equal to -1 are excluded from the loss and from both F1 metrics.
        self.criterion = CrossEntropyLoss(ignore_index=-1)
        self.val_f1 = MulticlassF1Score(num_classes=self.hparams.num_classes, average='micro', ignore_index=-1)
        self.test_f1 = MulticlassF1Score(num_classes=self.hparams.num_classes, average='micro', ignore_index=-1)

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Encode the inputs and return the raw logits of the linear head.

        Returns:
            Tensor whose last dimension is ``num_classes * num_predictions``.
        """
        # Keep only the hidden state at sequence position 0 of every example.
        x = self.lm(input_ids=input_ids, attention_mask=attention_mask)["last_hidden_state"][:, 0, :]
        x = self.cls(x)
        return x

    def _process_batch(self, batch):
        """Run the model on a batch and compute the cross-entropy loss.

        Reads ``batch["input_ids"]``, ``batch["attention_mask"]`` and ``batch["labels"]``.

        Returns:
            Tuple ``(loss, logits, labels)`` where logits are reshaped to
            ``(-1, num_classes)`` and labels are flattened to one dimension.
        """
        logits = self.forward(batch["input_ids"], batch["attention_mask"])
        # One row per prediction slot, so each slot is scored as its own classification.
        # Assumes batch["labels"] holds num_predictions targets per example — TODO confirm.
        reshaped_logits = logits.view(-1, self.hparams.num_classes)
        reshaped_labels = batch["labels"].view(-1)
        loss = self.criterion(reshaped_logits, reshaped_labels)
        return loss, reshaped_logits, reshaped_labels

    def training_step(self, batch, batch_idx):
        """Compute, log (per step and per epoch) and return the training loss."""
        loss, _, _ = self._process_batch(batch)
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Update the validation F1 metric and log ``val_loss`` / ``val_f1``."""
        loss, logits, labels = self._process_batch(batch)
        self.val_f1(logits, labels)
        self.log("val_loss", loss, on_epoch=True, prog_bar=True)
        # The metric object itself is logged, not a computed value.
        self.log("val_f1", self.val_f1, on_epoch=True, prog_bar=True)

    def test_step(self, batch, batch_idx):
        """Update the test F1 metric and log ``test_loss`` / ``test_f1``."""
        loss, logits, labels = self._process_batch(batch)
        self.test_f1(logits, labels)
        self.log("test_loss", loss, on_epoch=True, prog_bar=True)
        self.log("test_f1", self.test_f1, on_epoch=True, prog_bar=True)

    def configure_optimizers(self):
        """Return an AdamW optimizer over all parameters with learning rate ``hparams.lr``."""
        return AdamW(self.parameters(), lr=self.hparams.lr)

In [None]:
class CommentDataset(Dataset):
    """Dataset over a JSON file of comments, yielding tokenized text and fixed-length label ids.

    The file at ``data_path`` is parsed once in the constructor. Each record is read
    through its ``"text"`` and ``"labels"`` keys; the last element of every label entry
    is lower-cased and looked up in ``label2id``.
    """

    def __init__(self, data_path: str, tokenizer: AutoTokenizer, label2id: dict, null_label_id: int, num_predictions: int = 5, max_len: int = 256):
        """Load the JSON records and remember tokenization / label settings.

        Args:
            data_path: Path of the UTF-8 JSON file to load.
            tokenizer: Callable tokenizer applied to each record's text.
            label2id: Mapping from label string to class id.
            null_label_id: Class id used to fill unused label slots.
            num_predictions: Number of label slots returned per record.
            max_len: Token length every encoding is padded/truncated to.
        """
        with open(data_path, "r", encoding="utf-8") as handle:
            self.data = json.load(handle)
        self.tokenizer, self.label2id = tokenizer, label2id
        self.null_label_id, self.num_predictions = null_label_id, num_predictions
        self.max_len = max_len

    def __len__(self):
        """Return the number of loaded records."""
        return len(self.data)

    def __getitem__(self, index: int):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` tensors for one record."""
        record = self.data[index]
        text = record["text"]
        slots = self.num_predictions

        # Map every annotation to its class id, then force exactly `slots` entries:
        # surplus ids are cut off, missing ones are filled with the null label id.
        targets = [self.label2id[annotation[-1].lower()] for annotation in record["labels"]]
        targets = targets[:slots]
        targets += [self.null_label_id] * (slots - len(targets))

        encoded = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # The tokenizer output is flattened to one dimension for each field.
        input_ids = encoded['input_ids'].flatten()
        attention_mask = encoded['attention_mask'].flatten()
        labels = torch.tensor(targets, dtype=torch.long)
        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}

In [None]:
def main_pipeline():
    config_params = {
        "BATCH_SIZE": 8, "MAX_LEN": 128, "LEARNING_RATE": 2e-5, "EPOCHS": 15,
        "NUM_PREDICTIONS": 5, "ORIGINAL_DATA_FILE": "/kaggle/input/phuc-data/comment_price.json"
    }
    torch.cuda.empty_cache()

    # Chia dữ liệu 
    with open(config_params["ORIGINAL_DATA_FILE"], "r", encoding="utf-8") as f:
        full_data = json.load(f)
    train_data, temp_data = train_test_split(full_data, test_size=0.2, random_state=42)
    dev_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)
    # Lưu vào thư mục gốc như code của bạn
    json.dump(train_data, open("train.json", "w")); json.dump(dev_data, open("dev.json", "w")); json.dump(test_data, open("test.json", "w"))

    # Tạo tokenizer, dataset, dataloader 
    tokenizer = AutoTokenizer.from_pretrained("uitnlp/CafeBERT", use_fast=True)
    null_label_id = LABEL2ID.get("product#null#neutral")
    train_dataset = CommentDataset("train.json", tokenizer, LABEL2ID, null_label_id, config_params["NUM_PREDICTIONS"], config_params["MAX_LEN"])
    dev_dataset = CommentDataset("dev.json", tokenizer, LABEL2ID, null_label_id, config_params["NUM_PREDICTIONS"], config_params["MAX_LEN"])
    test_dataset = CommentDataset("test.json", tokenizer, LABEL2ID, null_label_id, config_params["NUM_PREDICTIONS"], config_params["MAX_LEN"])
    train_loader = DataLoader(train_dataset, batch_size=config_params["BATCH_SIZE"], shuffle=True, num_workers=2)
    dev_loader = DataLoader(dev_dataset, batch_size=config_params["BATCH_SIZE"], num_workers=2)
    test_loader = DataLoader(test_dataset, batch_size=config_params["BATCH_SIZE"], num_workers=2)

    # Khởi tạo model và trainer 
    model = XLMRobertaCommentClassifier(num_classes=NUM_CLASSES, num_predictions=config_params["NUM_PREDICTIONS"], lr=config_params["LEARNING_RATE"])
    checkpoint_callback = ModelCheckpoint(monitor='val_loss', dirpath='checkpoints', filename='best-model', save_top_k=1, mode='min')
    early_stopping_callback = EarlyStopping(monitor='val_loss', patience=3, mode='min')
    
    # Khai báo CSVLogger tường minh 
    csv_logger = CSVLogger(save_dir="logs/", name="my_model_logs")
    
    trainer = Trainer(max_epochs=config_params["EPOCHS"], accelerator='auto', devices=1,
                      callbacks=[checkpoint_callback, early_stopping_callback], 
                      logger=csv_logger, 
                      precision="16-mixed", accumulate_grad_batches=2)

    # Huấn luyện 
    trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=dev_loader)