In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForTokenClassification
from datasets import load_dataset
import numpy as np
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from tqdm import tqdm
import os

# --- Reproducibility ---------------------------------------------------------
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# --- Hyperparameters and runtime configuration -------------------------------
BATCH_SIZE = 8
NUM_EPOCHS = 3
LEARNING_RATE = 5e-5
MAX_SEQ_LENGTH = 128
MODEL_NAME = "bert-base-cased"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

# --- Data --------------------------------------------------------------------
few_nerd = load_dataset("DFKI-SLT/few-nerd", name="supervised")
train_dataset = few_nerd["train"]
val_dataset = few_nerd["validation"]
test_dataset = few_nerd["test"]

# --- Label vocabulary --------------------------------------------------------
# Prefer the label names stored in the dataset schema: this avoids a Python
# loop over every training example and gives human-readable class names in the
# model config and in the classification reports.
ner_feature = getattr(train_dataset.features["ner_tags"], "feature", None)
label_names = getattr(ner_feature, "names", None)

if label_names:
    id2label = dict(enumerate(label_names))
    all_labels = set(id2label)
else:
    # Fallback when the column carries no ClassLabel metadata: collect the tag
    # ids that actually occur in the training split and give them placeholder
    # names.
    all_labels = set()
    for example in train_dataset:
        all_labels.update(example["ner_tags"])
    id2label = {i: f"TAG_{i}" for i in sorted(all_labels)}

label2id = {v: k for k, v in id2label.items()}
num_labels = len(id2label)
print(f"Number of labels: {num_labels}")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

class NERDataset(Dataset):
    """Map-style dataset that tokenizes pre-split sentences for token classification.

    Each item of ``dataset`` is expected to provide ``"tokens"`` (a list of
    words) and ``"ner_tags"`` (one integer tag per word). Words are split into
    sub-word pieces by ``tokenizer``; only the first piece of every word keeps
    the word's tag, all other positions (special tokens, padding, continuation
    pieces) are labelled ``-100`` so they can be skipped downstream.

    Args:
        dataset: Indexable collection of examples with ``tokens``/``ner_tags``.
        tokenizer: Callable tokenizer whose result exposes ``word_ids()``
            (presumably a Hugging Face fast tokenizer -- confirm if swapped).
        max_len: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, dataset, tokenizer, max_len):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Encode example ``idx``.

        Returns:
            dict with 1-D tensors ``input_ids``, ``attention_mask`` and
            ``labels`` (dtype ``torch.long``), one entry per encoded position.
        """
        # Fetch the row once; indexing the backing dataset may be costly.
        example = self.dataset[idx]
        tokens = example["tokens"]
        labels = example["ner_tags"]

        encoding = self.tokenizer(
            tokens,
            is_split_into_words=True,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )

        # word_ids maps every encoded position to the index of its source word
        # (None for positions that do not belong to any word).
        word_ids = encoding.word_ids(batch_index=0)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                # Non-word position or continuation piece: no label.
                label_ids.append(-100)
            else:
                # First piece of a new word carries the word's tag.
                label_ids.append(labels[word_idx])
            previous_word_idx = word_idx

        # Drop only the leading batch axis added by return_tensors="pt";
        # a bare squeeze() would also remove the sequence axis when it is 1.
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(label_ids, dtype=torch.long)
        }

# Wrap every raw split in the tokenizing dataset, all with the same max length.
train_data, val_data, test_data = (
    NERDataset(split, tokenizer, MAX_SEQ_LENGTH)
    for split in (train_dataset, val_dataset, test_dataset)
)

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
val_loader, test_loader = (
    DataLoader(eval_split, batch_size=BATCH_SIZE)
    for eval_split in (val_data, test_data)
)

# Pretrained encoder with a token-classification head sized to the label set,
# moved to the selected device right after construction.
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
).to(DEVICE)

optimizer = optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)

def compute_loss(outputs, labels):
    loss_fct = nn.CrossEntropyLoss()
    logits = outputs.logits.view(-1, model.num_labels)
    labels = labels.view(-1)
    return loss_fct(logits, labels)

def train_epoch(model, dataloader, optimizer):
    """Run one optimisation pass over ``dataloader``.

    The model is switched to training mode, and for every batch a forward
    pass, a backward pass and one optimizer step are performed.

    Returns:
        The sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0
    progress = tqdm(dataloader, desc="Training")

    for batch in progress:
        # Move the three tensors the model consumes onto the target device.
        ids, mask, targets = (
            batch[key].to(DEVICE)
            for key in ("input_ids", "attention_mask", "labels")
        )

        # The loss is read straight from the model output.
        loss = model(ids, attention_mask=mask, labels=targets).loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    n_batches = len(dataloader)
    return running_loss / n_batches

def evaluate(model, dataloader):
    """Evaluate ``model`` on ``dataloader`` without updating its weights.

    Args:
        model: Token-classification model returning ``loss`` and ``logits``
            when called with ``labels``.
        dataloader: Iterable of batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.

    Returns:
        Tuple ``(mean_loss, report, predictions, ground_truths)`` where
        ``mean_loss`` is the summed batch loss divided by the number of
        batches, ``report`` is the text from ``classification_report`` and the
        two lists hold the predicted / true class id of every labelled token
        (positions whose label is ``-100`` are excluded).
    """
    model.eval()
    total_loss = 0
    predictions = []
    ground_truths = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            labels = batch["labels"].to(DEVICE)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            total_loss += outputs.loss.item()

            preds = torch.argmax(outputs.logits, dim=2)

            # Select all labelled positions at once instead of calling .item()
            # per token. Boolean-mask indexing yields elements in row-major
            # order, i.e. the same order as a nested (row, column) loop.
            labelled = labels != -100
            predictions.extend(preds[labelled].tolist())
            ground_truths.extend(labels[labelled].tolist())

    # Passing labels explicitly keeps target_names aligned with the class ids
    # even when some class never occurs in this split.
    report = classification_report(
        ground_truths,
        predictions,
        labels=list(id2label.keys()),
        target_names=list(id2label.values()),
        digits=4,
        zero_division=0,
    )
    return total_loss / len(dataloader), report, predictions, ground_truths

# Single source of truth for the checkpoint written during training and read
# back for the final test run.
BEST_MODEL_PATH = "bert_ner_fewnerd_best.pt"

train_losses = []
val_losses = []
best_val_loss = float("inf")

for epoch in range(NUM_EPOCHS):
    print(f"\nEpoch {epoch + 1}/{NUM_EPOCHS}")

    train_loss = train_epoch(model, train_loader, optimizer)
    val_loss, val_report, _, _ = evaluate(model, val_loader)

    train_losses.append(train_loss)
    val_losses.append(val_loss)

    print(f"Train Loss: {train_loss:.4f} | Validation Loss: {val_loss:.4f}")
    print("Validation Classification Report:")
    print(val_report)

    # Keep only the weights of the epoch with the lowest validation loss.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), BEST_MODEL_PATH)
        print("✅ Best model saved!")

# Derive the x-axis from the recorded history so the plot stays valid even if
# fewer than NUM_EPOCHS losses were collected.
epochs_run = range(1, len(train_losses) + 1)
plt.figure(figsize=(8, 5))
plt.plot(epochs_run, train_losses, label="Train Loss")
plt.plot(epochs_run, val_losses, label="Val Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training and Validation Loss")
plt.legend()
plt.grid()
plt.savefig("bert_loss_history.png")
plt.show()

# map_location lets a checkpoint saved from a GPU run load on whatever device
# this session selected (e.g. CPU-only runtime).
model.load_state_dict(torch.load(BEST_MODEL_PATH, map_location=DEVICE))
print("Testing the best model...")
test_loss, test_report, test_preds, test_labels = evaluate(model, test_loader)
print(f"Test Loss: {test_loss:.4f}")
print("Test Classification Report:")
print(test_report)

Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.13k [00:00<?, ?B/s]

few-nerd.py:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/16.9M [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/2.43M [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/4.84M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/131767 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/18824 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/37648 [00:00<?, ? examples/s]

Number of labels: 9


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/3


Training: 100%|██████████| 16471/16471 [54:27<00:00,  5.04it/s]
Evaluating: 100%|██████████| 2353/2353 [03:26<00:00, 11.41it/s]


Train Loss: 0.1775 | Validation Loss: 0.1609
Validation Classification Report:
              precision    recall  f1-score   support

       TAG_0     0.9739    0.9846    0.9792    365403
       TAG_1     0.8716    0.8066    0.8378      6066
       TAG_2     0.8291    0.6808    0.7477      6964
       TAG_3     0.7825    0.7714    0.7769      6003
       TAG_4     0.8198    0.8906    0.8537     20399
       TAG_5     0.8075    0.8102    0.8088     23266
       TAG_6     0.8345    0.6112    0.7056      9481
       TAG_7     0.9229    0.9363    0.9296     19005
       TAG_8     0.8725    0.6139    0.7207      6099

    accuracy                         0.9475    462686
   macro avg     0.8572    0.7895    0.8178    462686
weighted avg     0.9465    0.9475    0.9461    462686

✅ Best model saved!

Epoch 2/3


Training: 100%|██████████| 16471/16471 [54:28<00:00,  5.04it/s]
Evaluating: 100%|██████████| 2353/2353 [03:26<00:00, 11.42it/s]


Train Loss: 0.1426 | Validation Loss: 0.1573
Validation Classification Report:
              precision    recall  f1-score   support

       TAG_0     0.9814    0.9800    0.9807    365403
       TAG_1     0.8331    0.8363    0.8347      6066
       TAG_2     0.7699    0.7631    0.7665      6964
       TAG_3     0.7956    0.7571    0.7759      6003
       TAG_4     0.8414    0.8785    0.8596     20399
       TAG_5     0.8203    0.8022    0.8111     23266
       TAG_6     0.7468    0.7745    0.7604      9481
       TAG_7     0.9166    0.9409    0.9286     19005
       TAG_8     0.7989    0.7437    0.7703      6099

    accuracy                         0.9496    462686
   macro avg     0.8338    0.8307    0.8320    462686
weighted avg     0.9497    0.9496    0.9496    462686

✅ Best model saved!

Epoch 3/3


Training:  84%|████████▎ | 13776/16471 [45:32<08:53,  5.06it/s]

In [None]:
!pip install torch datasets transformers numpy scikit-learn matplotlib tqdm pandas

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting 