In [1]:
# %pip install transformers
# %pip install torch
# %pip install scikit-learn
# %pip install numpy
# %pip install pandas

Collecting transformers
  Using cached transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting filelock (from transformers)
  Using cached filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Using cached huggingface_hub-0.30.2-py3-none-any.whl.metadata (13 kB)
Collecting numpy>=1.17 (from transformers)
  Using cached numpy-2.2.5-cp311-cp311-win_amd64.whl.metadata (60 kB)
Collecting pyyaml>=5.1 (from transformers)
  Using cached PyYAML-6.0.2-cp311-cp311-win_amd64.whl.metadata (2.1 kB)
Collecting regex!=2019.12.17 (from transformers)
  Using cached regex-2024.11.6-cp311-cp311-win_amd64.whl.metadata (41 kB)
Collecting requests (from transformers)
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Using cached tokenizers-0.21.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Using cached safetensors-0.5.3-

In [1]:
# BERT + RCNN với PyTorch cho bài toán phân loại văn bản FOODY

import os
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 1. Load data
# The dataset folder can be overridden through the NTC_SV_DIR environment variable so the
# notebook is not tied to one machine; when it is unset the original local path is used.
data_dir = os.environ.get("NTC_SV_DIR", "H:/SentimentAnalystComment/BERT-embedding-CNN/NTC_SV")
train_path = f"{data_dir}/NTC_SV_train.csv"
test_path = f"{data_dir}/NTC_SV_test.csv"
# Rows containing any missing value are dropped right after reading.
df = pd.read_csv(train_path).dropna()
df_test = pd.read_csv(test_path).dropna()

In [3]:

class FoodyDataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    Args:
        texts: Sequence of raw texts (list, tuple, pandas Series, ...).
        labels: Sequence of class ids, aligned position-by-position with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=..., padding='max_length',
            truncation=True, return_tensors='pt')`` and returning a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors that carry a leading batch axis.
        max_len: Length passed to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len=256):
        # Materialize as lists so __getitem__ always indexes by POSITION. A pandas Series
        # is indexed by label, and after dropna() the labels are no longer 0..n-1, which
        # would raise KeyError (or silently pick another row) for DataLoader indices.
        self.texts = list(texts)
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx``.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (leading batch axis removed)
            and ``'label'`` as a ``torch.long`` scalar tensor.
        """
        # Coerce to str: cells parsed from CSV can be numeric, which the tokenizer rejects.
        text = str(self.texts[idx])
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            # squeeze(0) drops the batch axis added by return_tensors='pt';
            # the DataLoader re-adds it when collating.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [4]:
# 4. BERT + RCNN model
class BERT_RCNN(nn.Module):
    """Text classifier: frozen BERT encoder -> BiGRU -> Conv1d -> max-over-time -> linear.

    BERT is used purely as a feature extractor: ``forward`` runs it under
    ``torch.no_grad()``, so only the GRU, the convolution and the linear head learn.

    Args:
        bert_model_name: Name or path handed to ``BertModel.from_pretrained``.
        hidden_size: Input feature size of the GRU; must match the encoder's hidden size.
        rnn_hidden: Hidden size of each GRU direction.
        num_classes: Number of output logits.
    """

    def __init__(self, bert_model_name='bert-base-multilingual-cased', hidden_size=768, rnn_hidden=128, num_classes=3):
        super(BERT_RCNN, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        # The encoder never receives gradients (see forward), so mark it frozen explicitly.
        self.bert.requires_grad_(False)
        self.bi_gru = nn.GRU(hidden_size, rnn_hidden, bidirectional=True, batch_first=True)
        self.conv1d = nn.Conv1d(2 * rnn_hidden, 128, kernel_size=3, padding=1)
        self.relu = nn.ReLU()
        self.fc = nn.Linear(128, num_classes)

    def train(self, mode=True):
        """Set train/eval mode on the head while keeping the frozen encoder in eval mode.

        ``torch.no_grad()`` only disables gradient tracking; it does not turn off dropout.
        Without this override ``model.train()`` would re-enable BERT's dropout, feeding
        noisy features to the head during training and making train and eval differ.
        """
        super(BERT_RCNN, self).train(mode)
        self.bert.eval()
        return self

    def forward(self, input_ids, attention_mask):
        """Return class logits of shape (batch, num_classes)."""
        # Frozen feature extraction: no autograd graph is built for the encoder.
        with torch.no_grad():
            outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state      # (batch, seq, hidden_size)
        rnn_out, _ = self.bi_gru(sequence_output)        # (batch, seq, 2 * rnn_hidden)
        rnn_out = rnn_out.permute(0, 2, 1)               # Conv1d expects (batch, channels, seq)
        conv_out = self.relu(self.conv1d(rnn_out))       # (batch, 128, seq)
        pooled = torch.max(conv_out, dim=2)[0]           # max over the sequence axis -> (batch, 128)
        return self.fc(pooled)

In [5]:
# 5. Initialize the tokenizer, datasets and dataloaders
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Both splits share the same tokenizer; reviews and labels are handed over as plain lists.
train_dataset = FoodyDataset(
    texts=df['review'].tolist(),
    labels=df['label'].tolist(),
    tokenizer=tokenizer,
)
test_dataset = FoodyDataset(
    texts=df_test['review'].tolist(),
    labels=df_test['label'].tolist(),
    tokenizer=tokenizer,
)

# Only the training loader shuffles; the test loader keeps dataset order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=16)

In [6]:
# 6. Build the model, loss function and optimizer
# Use the GPU when one is visible to PyTorch, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# One output logit per distinct label value found in the training frame.
model = BERT_RCNN(num_classes=len(set(df['label'])))
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=2e-5)

In [7]:
# 7. Training configuration
num_epochs, save_every = 10, 2
save_dir = "checkpoints"

# Make sure the checkpoint folder exists before training starts.
os.makedirs(name=save_dir, exist_ok=True)

In [8]:
for epoch in range(num_epochs):
    model.train()
    # Running statistics, reset at the start of every epoch.
    total_loss = 0
    correct = total = 0
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")

    for batch in loop:
        # Move the three batch tensors onto the same device as the model.
        input_ids, attention_mask, labels = (
            batch[field].to(device)
            for field in ('input_ids', 'attention_mask', 'label')
        )

        # One optimization step: clear grads, forward, loss, backward, update.
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Read the scalar loss once; it feeds both the epoch sum and the progress bar.
        batch_loss = loss.item()
        total_loss += batch_loss

        # Predicted class = index of the largest logit.
        preds = torch.argmax(outputs, dim=1)
        correct += int(preds.eq(labels).sum())
        total += len(labels)

        loop.set_postfix(loss=batch_loss)

    acc = correct / total
    print(f"\n✅ Epoch {epoch+1} — Loss: {total_loss/len(train_loader):.4f} — Accuracy: {acc:.2%}")

    # Write the model weights every `save_every` epochs.
    is_checkpoint_epoch = (epoch + 1) % save_every == 0
    if is_checkpoint_epoch:
        ckpt_path = os.path.join(save_dir, f"bert_rcnn_epoch{epoch+1}.pth")
        torch.save(model.state_dict(), ckpt_path)
        print(f"💾 Đã lưu mô hình tại: {ckpt_path}")

Epoch 1/10:   1%|          | 19/2548 [01:58<4:23:02,  6.24s/it, loss=0.701]


KeyboardInterrupt: 

In [None]:
# Output folder for the final exported artifacts; created up front if missing.
SAVE_PATH = "./bertrcnn_checkpoints"
os.makedirs(name=SAVE_PATH, exist_ok=True)

In [None]:
# `model` is a plain nn.Module (BERT_RCNN), not a Hugging Face PreTrainedModel, so it has
# no `save_pretrained`; calling it raises AttributeError. Persist the weights via state_dict.
final_dir = f"{SAVE_PATH}/final_model"
# torch.save does not create missing parent folders.
os.makedirs(final_dir, exist_ok=True)
torch.save(model.state_dict(), os.path.join(final_dir, "bert_rcnn.pth"))
# The tokenizer is a Hugging Face object and keeps its own save format.
tokenizer.save_pretrained(final_dir)

In [None]:
# Switch the model to evaluation mode before scoring the test set
# (nn.Module.eval() is defined as train(False)).
model.train(False)

In [None]:
# Count correct predictions over the whole test loader; no gradients are needed here.
correct = total = 0
with torch.no_grad():
    for batch in test_loader:
        # Move the three batch tensors onto the model's device.
        input_ids, attention_mask, labels = (
            batch[field].to(device)
            for field in ('input_ids', 'attention_mask', 'label')
        )
        outputs = model(input_ids, attention_mask)
        # Predicted class = index of the largest logit.
        preds = torch.argmax(outputs, dim=1)
        correct += int(preds.eq(labels).sum())
        total += len(labels)

print(f"\n📊 Độ chính xác trên tập test: {correct / total:.2%}")