In [1]:
# %pip install torch
# %pip install transformers
# %pip install pandas
# %pip install numpy
# %pip install scikit-learn
# %pip install gradio
# %pip install safetensors

# Import libraries

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd
from safetensors.torch import save_file
from tqdm import tqdm
import os

2025-04-26 01:39:11.994875: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745631552.189016      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745631552.253464      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [4]:
SEQ_LEN = 256
BATCH_SIZE = 16
EPOCHS = 20
LR = 2e-5
SAVE_EVERY = 5

In [5]:
train_df = pd.read_csv("/kaggle/input/ntc-sv/NTC_SV/NTC_SV_train.csv").dropna()
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
le = LabelEncoder()
labels = le.fit_transform(train_df["label"])

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [6]:
class FoodyDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "label": torch.tensor(self.labels[idx], dtype=torch.long)
        }

In [7]:
X_train, X_val, y_train, y_val = train_test_split(train_df['review'], labels, test_size=0.15, random_state=42)
train_dataset = FoodyDataset(X_train.tolist(), y_train, tokenizer, SEQ_LEN)
val_dataset = FoodyDataset(X_val.tolist(), y_val, tokenizer, SEQ_LEN)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

In [8]:
test_df = pd.read_csv("/kaggle/input/ntc-sv/NTC_SV/NTC_SV_test.csv").dropna()
test_texts = test_df['review'].tolist()
test_labels = le.transform(test_df['label'])
test_dataset = FoodyDataset(test_texts, test_labels, tokenizer, SEQ_LEN)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [9]:
class BERT_RCNN(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-multilingual-cased")
        self.conv1 = nn.Conv1d(768, 256, kernel_size=3, padding=1)
        self.relu = nn.ReLU()
        self.rnn = nn.LSTM(256, 128, num_layers=1, bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(128*2, num_classes)

    def forward(self, input_ids, attention_mask):
        with torch.no_grad():
            bert_out = self.bert(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        x = bert_out.permute(0, 2, 1)
        x = self.relu(self.conv1(x))
        x = x.permute(0, 2, 1)
        x, _ = self.rnn(x)
        x = torch.mean(x, dim=1)
        x = self.dropout(x)
        return self.fc(x)

In [10]:
model = BERT_RCNN(num_classes=len(set(labels))).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss()

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

In [11]:
os.makedirs("checkpoints", exist_ok=True)

In [12]:
def train_epoch(model, dataloader):
    model.train()
    total_loss = 0
    all_preds = []
    all_labels = []
    for batch in tqdm(dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        preds = torch.argmax(outputs, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    return total_loss / len(dataloader), acc

In [13]:
for epoch in range(EPOCHS):
    loss, acc = train_epoch(model, train_loader)
    print(f"Epoch {epoch+1}/{EPOCHS} - Loss: {loss:.4f} - Accuracy: {acc:.4f}")
    if (epoch + 1) % SAVE_EVERY == 0:
        save_path = f"checkpoints/bert_rcnn_epoch{epoch+1}.safetensors"
        save_file(model.state_dict(), save_path)
        print(f"✔️ Saved checkpoint at {save_path}")


100%|██████████| 2166/2166 [05:16<00:00,  6.85it/s]


Epoch 1/20 - Loss: 0.4655 - Accuracy: 0.7805


100%|██████████| 2166/2166 [05:14<00:00,  6.88it/s]


Epoch 2/20 - Loss: 0.4160 - Accuracy: 0.8103


100%|██████████| 2166/2166 [05:14<00:00,  6.88it/s]


Epoch 3/20 - Loss: 0.4027 - Accuracy: 0.8193


100%|██████████| 2166/2166 [05:15<00:00,  6.87it/s]


Epoch 4/20 - Loss: 0.3966 - Accuracy: 0.8201


100%|██████████| 2166/2166 [05:15<00:00,  6.87it/s]


Epoch 5/20 - Loss: 0.3891 - Accuracy: 0.8235
✔️ Saved checkpoint at checkpoints/bert_rcnn_epoch5.safetensors


100%|██████████| 2166/2166 [05:15<00:00,  6.88it/s]


Epoch 6/20 - Loss: 0.3852 - Accuracy: 0.8277


100%|██████████| 2166/2166 [05:14<00:00,  6.88it/s]


Epoch 7/20 - Loss: 0.3787 - Accuracy: 0.8321


100%|██████████| 2166/2166 [05:14<00:00,  6.88it/s]


Epoch 8/20 - Loss: 0.3716 - Accuracy: 0.8357


100%|██████████| 2166/2166 [05:15<00:00,  6.87it/s]


Epoch 9/20 - Loss: 0.3630 - Accuracy: 0.8408


100%|██████████| 2166/2166 [05:15<00:00,  6.87it/s]


Epoch 10/20 - Loss: 0.3580 - Accuracy: 0.8445
✔️ Saved checkpoint at checkpoints/bert_rcnn_epoch10.safetensors


100%|██████████| 2166/2166 [05:15<00:00,  6.86it/s]


Epoch 11/20 - Loss: 0.3528 - Accuracy: 0.8473


100%|██████████| 2166/2166 [05:15<00:00,  6.86it/s]


Epoch 12/20 - Loss: 0.3461 - Accuracy: 0.8498


100%|██████████| 2166/2166 [05:15<00:00,  6.87it/s]


Epoch 13/20 - Loss: 0.3406 - Accuracy: 0.8518


100%|██████████| 2166/2166 [05:15<00:00,  6.87it/s]


Epoch 14/20 - Loss: 0.3345 - Accuracy: 0.8568


100%|██████████| 2166/2166 [05:15<00:00,  6.87it/s]


Epoch 15/20 - Loss: 0.3263 - Accuracy: 0.8608
✔️ Saved checkpoint at checkpoints/bert_rcnn_epoch15.safetensors


100%|██████████| 2166/2166 [05:16<00:00,  6.84it/s]


Epoch 16/20 - Loss: 0.3245 - Accuracy: 0.8631


100%|██████████| 2166/2166 [05:16<00:00,  6.84it/s]


Epoch 17/20 - Loss: 0.3169 - Accuracy: 0.8671


100%|██████████| 2166/2166 [05:16<00:00,  6.84it/s]


Epoch 18/20 - Loss: 0.3152 - Accuracy: 0.8660


100%|██████████| 2166/2166 [05:16<00:00,  6.85it/s]


Epoch 19/20 - Loss: 0.3083 - Accuracy: 0.8712


100%|██████████| 2166/2166 [05:16<00:00,  6.85it/s]


Epoch 20/20 - Loss: 0.3022 - Accuracy: 0.8746
✔️ Saved checkpoint at checkpoints/bert_rcnn_epoch20.safetensors


In [14]:
model.eval()
preds, trues = [], []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask)
        pred = torch.argmax(outputs, dim=1)
        preds.extend(pred.cpu().numpy())
        trues.extend(labels.cpu().numpy())

print("=== Test Result ===")
print(classification_report(trues, preds))

=== Test Result ===
              precision    recall  f1-score   support

           0       0.86      0.89      0.87      5000
           1       0.88      0.86      0.87      5000

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



In [15]:
save_file(model.state_dict(), "bert_rcnn_foody_final.safetensors")