Đọc file train_data.json

In [1]:
import json
import pandas as pd

# Đọc train data
with open("/kaggle/input/ai-1904-dpl-302-m-topic-sentiment-classification/train data.json", "r", encoding="utf-8") as f:
    train_data = json.load(f)

# Tách text, sentiment, topic
texts, sentiments, topics = [], [], []
for item in train_data:
    text = item["data"]["text"]
    sent, topic_list = None, []

    for ann in item["annotations"]:
        if ann["from_name"] == "sentiment":
            sent = ann["value"]["choices"][0]
        elif ann["from_name"] == "topic":
            topic_list.extend(ann["value"]["choices"])

    if sent:
        texts.append(text)
        sentiments.append(sent)
        topics.append(topic_list)

print(f"Số mẫu huấn luyện: {len(texts)}")

# Đọc test
test_df = pd.read_csv("/kaggle/input/ai-1904-dpl-302-m-topic-sentiment-classification/test.csv")
test_texts = test_df["text"].tolist()


Số mẫu huấn luyện: 1684


 2. Khởi tạo PhoBERT + Tokenizer

In [2]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)


config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

bpe.codes: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

3. Xử lý label

In [3]:
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer

# Cảm xúc: Tiêu cực, Trung tính, Tích cực
le_sent = LabelEncoder()
sent_labels = le_sent.fit_transform(sentiments)

# Chủ đề: multi-label
mlb_topic = MultiLabelBinarizer()
topic_labels = mlb_topic.fit_transform(topics)
topic_names = mlb_topic.classes_


4. Dataset dùng chung

In [4]:
import torch
from torch.utils.data import Dataset

class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, is_multilabel=False, max_len=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.is_multilabel = is_multilabel
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(self.labels[idx], dtype=torch.float if self.is_multilabel else torch.long)
        }


 5. Model Sentiment và Topic

In [5]:
import torch.nn as nn
from transformers import AutoModel

class PhoBERT_Classifier(nn.Module):
    def __init__(self, num_labels, is_multilabel=False):
        super().__init__()
        self.phobert = AutoModel.from_pretrained("vinai/phobert-base")
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(self.phobert.config.hidden_size, num_labels)
        self.is_multilabel = is_multilabel

    def forward(self, input_ids, attention_mask):
        output = self.phobert(input_ids=input_ids, attention_mask=attention_mask)
        cls_token = output.last_hidden_state[:, 0, :]
        cls_token = self.dropout(cls_token)
        logits = self.classifier(cls_token)
        return logits


6. Huấn luyện mô hình sentiment

In [6]:
from torch.utils.data import DataLoader
import torch.nn.functional as F
import numpy as np
import copy

# Dataset và mô hình
sent_dataset = TextDataset(texts, sent_labels, tokenizer)
sent_loader = DataLoader(sent_dataset, batch_size=16, shuffle=True)
model_sent = PhoBERT_Classifier(num_labels=3).to("cuda")

# Huấn luyện
optimizer = torch.optim.AdamW(model_sent.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss()
best_loss, patience = float("inf"), 0

for epoch in range(20):
    model_sent.train()
    total_loss = 0
    for batch in sent_loader:
        input_ids = batch["input_ids"].to("cuda")
        attention_mask = batch["attention_mask"].to("cuda")
        labels = batch["label"].to("cuda")

        optimizer.zero_grad()
        logits = model_sent(input_ids, attention_mask)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg = total_loss / len(sent_loader)
    print(f"[Sentiment Epoch {epoch+1}] Loss: {avg:.4f}")
    if avg < best_loss:
        best_loss = avg
        best_sent = copy.deepcopy(model_sent.state_dict())
        patience = 0
    else:
        patience += 1
        if patience >= 3: break

model_sent.load_state_dict(best_sent)


2025-07-18 15:00:43.366820: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752850843.548487      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752850843.605791      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

[Sentiment Epoch 1] Loss: 0.9289
[Sentiment Epoch 2] Loss: 0.7327
[Sentiment Epoch 3] Loss: 0.5957
[Sentiment Epoch 4] Loss: 0.4264
[Sentiment Epoch 5] Loss: 0.3036
[Sentiment Epoch 6] Loss: 0.2087
[Sentiment Epoch 7] Loss: 0.1451
[Sentiment Epoch 8] Loss: 0.0903
[Sentiment Epoch 9] Loss: 0.0741
[Sentiment Epoch 10] Loss: 0.0825
[Sentiment Epoch 11] Loss: 0.0549
[Sentiment Epoch 12] Loss: 0.0500
[Sentiment Epoch 13] Loss: 0.0539
[Sentiment Epoch 14] Loss: 0.0289
[Sentiment Epoch 15] Loss: 0.0392
[Sentiment Epoch 16] Loss: 0.0458
[Sentiment Epoch 17] Loss: 0.0740


<All keys matched successfully>

7. Huấn luyện mô hình topic (multi-label)

In [7]:
topic_dataset = TextDataset(texts, topic_labels, tokenizer, is_multilabel=True)
topic_loader = DataLoader(topic_dataset, batch_size=16, shuffle=True)
model_topic = PhoBERT_Classifier(num_labels=len(topic_names), is_multilabel=True).to("cuda")

optimizer = torch.optim.AdamW(model_topic.parameters(), lr=2e-5)
loss_fn = nn.BCEWithLogitsLoss()
best_loss, patience = float("inf"), 0

for epoch in range(20):
    model_topic.train()
    total_loss = 0
    for batch in topic_loader:
        input_ids = batch["input_ids"].to("cuda")
        attention_mask = batch["attention_mask"].to("cuda")
        labels = batch["label"].to("cuda")

        optimizer.zero_grad()
        logits = model_topic(input_ids, attention_mask)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg = total_loss / len(topic_loader)
    print(f"[Topic Epoch {epoch+1}] Loss: {avg:.4f}")
    if avg < best_loss:
        best_loss = avg
        best_topic = copy.deepcopy(model_topic.state_dict())
        patience = 0
    else:
        patience += 1
        if patience >= 3: break

model_topic.load_state_dict(best_topic)


[Topic Epoch 1] Loss: 0.3753
[Topic Epoch 2] Loss: 0.2903
[Topic Epoch 3] Loss: 0.2656
[Topic Epoch 4] Loss: 0.2502
[Topic Epoch 5] Loss: 0.2322
[Topic Epoch 6] Loss: 0.2109
[Topic Epoch 7] Loss: 0.1929
[Topic Epoch 8] Loss: 0.1768
[Topic Epoch 9] Loss: 0.1617
[Topic Epoch 10] Loss: 0.1481
[Topic Epoch 11] Loss: 0.1355
[Topic Epoch 12] Loss: 0.1252
[Topic Epoch 13] Loss: 0.1149
[Topic Epoch 14] Loss: 0.1054
[Topic Epoch 15] Loss: 0.0972
[Topic Epoch 16] Loss: 0.0897
[Topic Epoch 17] Loss: 0.0834
[Topic Epoch 18] Loss: 0.0766
[Topic Epoch 19] Loss: 0.0715
[Topic Epoch 20] Loss: 0.0667


<All keys matched successfully>

 8. Dự đoán test + Xuất file CSV

In [8]:
model_sent.eval()
model_topic.eval()

# Sentiment
sent_preds = []
with torch.no_grad():
    for i in range(0, len(test_texts), 16):
        enc = tokenizer(test_texts[i:i+16], padding=True, truncation=True, max_length=256, return_tensors="pt")
        input_ids = enc["input_ids"].to("cuda")
        attention_mask = enc["attention_mask"].to("cuda")
        logits = model_sent(input_ids, attention_mask)
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        sent_preds.extend(preds)

pred_sentiments = le_sent.inverse_transform(sent_preds)

# Topic
topic_preds = []
with torch.no_grad():
    for i in range(0, len(test_texts), 16):
        enc = tokenizer(test_texts[i:i+16], padding=True, truncation=True, max_length=256, return_tensors="pt")
        input_ids = enc["input_ids"].to("cuda")
        attention_mask = enc["attention_mask"].to("cuda")
        logits = model_topic(input_ids, attention_mask)
        probs = torch.sigmoid(logits).cpu().numpy()
        labels = (probs > 0.4).astype(int)
        for row in labels:
            selected_topics = [topic_names[i] for i in range(len(topic_names)) if row[i] == 1]
            if selected_topics:
              topics_str = ";".join(selected_topics)
            else:
              topics_str = "unknown"
            topic_preds.append(topics_str)


# Gộp và xuất
submission = pd.DataFrame({
    "id": test_df["id"],
    "sentiment": pred_sentiments,
    "topic": topic_preds
})
submission.to_csv("submission187.csv", index=False, encoding="utf-8-sig")
print("dei")





dei
