In [1]:
# Extended NER dataset with more examples and diverse entities
train_data = [
    # Original examples
    {
        "text": "John works at Google in New York",
        "entities": [(0, 4, "PER"), (14, 20, "ORG"), (24, 32, "LOC")]
    },
    {
        "text": "Microsoft announced new AI features yesterday",
        "entities": [(0, 9, "ORG")]
    },
    {
        "text": "Sarah visited Paris last summer",
        "entities": [(0, 5, "PER"), (13, 18, "LOC")]
    },

    # Tech domain examples
    {
        "text": "Sundar Pichai leads Alphabet and Google in Silicon Valley",
        "entities": [(0, 13, "PER"), (20, 28, "ORG"), (33, 39, "ORG"), (43, 56, "LOC")]
    },
    {
        "text": "Tesla's Elon Musk announced plans to expand operations in Shanghai",
        "entities": [(0, 5, "ORG"), (7, 16, "PER"), (49, 57, "LOC")]
    },
    {
        "text": "Amazon Web Services opened new data centers in Mumbai",
        "entities": [(0, 19, "ORG"), (47, 53, "LOC")]
    },

    # Business/Finance examples
    {
        "text": "JPMorgan Chase CEO Jamie Dimon spoke at the World Economic Forum in Davos",
        "entities": [(0, 14, "ORG"), (19, 30, "PER"), (42, 62, "ORG"), (66, 71, "LOC")]
    },
    {
        "text": "Goldman Sachs opened its new office in London last month",
        "entities": [(0, 13, "ORG"), (37, 43, "LOC")]
    },

    # Sports examples
    {
        "text": "Lionel Messi scored two goals for Inter Miami at DRV PNK Stadium",
        "entities": [(0, 12, "PER"), (31, 42, "ORG"), (46, 61, "LOC")]
    },
    {
        "text": "Manchester United defeated Arsenal at Emirates Stadium",
        "entities": [(0, 16, "ORG"), (25, 32, "ORG"), (36, 52, "LOC")]
    },

    # Entertainment examples
    {
        "text": "Tom Cruise attended the premiere in Hollywood",
        "entities": [(0, 10, "PER"), (35, 44, "LOC")]
    },
    {
        "text": "Netflix released a new series produced by Warner Bros in Los Angeles",
        "entities": [(0, 7, "ORG"), (39, 50, "ORG"), (54, 65, "LOC")]
    },

    # Political examples
    {
        "text": "President Biden met with Chancellor Scholz at the White House",
        "entities": [(10, 15, "PER"), (31, 37, "PER"), (45, 56, "LOC")]
    },
    {
        "text": "The United Nations headquarters in New York hosted the summit",
        "entities": [(4, 18, "ORG"), (34, 42, "LOC")]
    },

    # Academic/Research examples
    {
        "text": "Dr. Smith from Stanford University published groundbreaking research at MIT",
        "entities": [(4, 9, "PER"), (15, 33, "ORG"), (63, 66, "ORG")]
    },
    {
        "text": "Harvard researchers collaborated with Oxford University on AI development",
        "entities": [(0, 7, "ORG"), (36, 52, "ORG")]
    },

    # Mixed complex examples
    {
        "text": "While Mark Zuckerberg was presenting at Facebook, Tim Cook announced Apple's latest iPhone in California",
        "entities": [(6, 21, "PER"), (39, 47, "ORG"), (49, 57, "PER"), (68, 73, "ORG"), (91, 101, "LOC")]
    },
    {
        "text": "The BBC reported that Samsung's CEO visited their facilities in Seoul and Tokyo",
        "entities": [(4, 7, "ORG"), (21, 28, "ORG"), (51, 56, "LOC"), (61, 66, "LOC")]
    },
    {
        "text": "Deutsche Bank's Frankfurt office contacted Morgan Stanley in London regarding the merger",
        "entities": [(0, 13, "ORG"), (15, 24, "LOC"), (35, 48, "ORG"), (52, 58, "LOC")]
    }
]

# Define label to ID mapping
label2id = {
    "O": 0,
    "B-PER": 1,
    "I-PER": 2,
    "B-ORG": 3,
    "I-ORG": 4,
    "B-LOC": 5,
    "I-LOC": 6
}

id2label = {v: k for k, v in label2id.items()}

In [2]:
from transformers import AutoTokenizer, BertForTokenClassification
from torch.utils.data import Dataset, DataLoader
import torch
from torch.optim import AdamW
import numpy as np

In [3]:
class NERDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        text = item["text"]
        entities = item["entities"]

        # Create label sequence
        labels = ["O"] * len(text)
        for start, end, label in entities:
            labels[start] = f"B-{label}"
            for i in range(start + 1, end):
                labels[i] = f"I-{label}"

        # Tokenize text and align labels
        tokenized = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_offsets_mapping=True
        )

        aligned_labels = [-100] * len(tokenized["input_ids"])  # -100 is ignored in loss calculation
        offset_mapping = tokenized["offset_mapping"]

        for idx, (start, end) in enumerate(offset_mapping):
            if start == end:  # Special tokens
                continue
            # Find the label for this token
            token_label = labels[start]
            aligned_labels[idx] = label2id[token_label]

        return {
            "input_ids": torch.tensor(tokenized["input_ids"]),
            "attention_mask": torch.tensor(tokenized["attention_mask"]),
            "labels": torch.tensor(aligned_labels)
        }


In [4]:
def train_model(model, train_dataloader, num_epochs=3, device="cuda"):
    optimizer = AdamW(model.parameters(), lr=2e-5)
    model.train()
    model.to(device)

    for epoch in range(num_epochs):
        total_loss = 0
        for batch in train_dataloader:
            optimizer.zero_grad()

            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            total_loss += loss.item()

            loss.backward()
            optimizer.step()

        avg_loss = total_loss / len(train_dataloader)
        print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

In [5]:
# Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tokenizer.eos_token = tokenizer.pad_token

model = BertForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Create dataset and dataloader
dataset = NERDataset(train_data, tokenizer)
train_dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

In [7]:
# Train the model
device = "cuda" if torch.cuda.is_available() else "cpu"
train_model(model, train_dataloader, num_epochs=25, device=device)

Epoch 1/25, Average Loss: 1.8040
Epoch 2/25, Average Loss: 1.2346
Epoch 3/25, Average Loss: 0.9982
Epoch 4/25, Average Loss: 0.7811
Epoch 5/25, Average Loss: 0.6455
Epoch 6/25, Average Loss: 0.5155
Epoch 7/25, Average Loss: 0.4112
Epoch 8/25, Average Loss: 0.3495
Epoch 9/25, Average Loss: 0.2419
Epoch 10/25, Average Loss: 0.2102
Epoch 11/25, Average Loss: 0.1690
Epoch 12/25, Average Loss: 0.1249
Epoch 13/25, Average Loss: 0.1352
Epoch 14/25, Average Loss: 0.0793
Epoch 15/25, Average Loss: 0.0729
Epoch 16/25, Average Loss: 0.0570
Epoch 17/25, Average Loss: 0.0448
Epoch 18/25, Average Loss: 0.0334
Epoch 19/25, Average Loss: 0.0276
Epoch 20/25, Average Loss: 0.0282
Epoch 21/25, Average Loss: 0.0235
Epoch 22/25, Average Loss: 0.0218
Epoch 23/25, Average Loss: 0.0175
Epoch 24/25, Average Loss: 0.0187
Epoch 25/25, Average Loss: 0.0150


In [8]:
# Save the model
model.save_pretrained("bert-ner-finetuned")
tokenizer.save_pretrained("bert-ner-finetuned")

('bert-ner-finetuned/tokenizer_config.json',
 'bert-ner-finetuned/special_tokens_map.json',
 'bert-ner-finetuned/vocab.txt',
 'bert-ner-finetuned/added_tokens.json',
 'bert-ner-finetuned/tokenizer.json')

In [9]:
def predict(text, model, tokenizer, device="cuda"):
    model.eval()
    # Tokenize the text
    tokenized = tokenizer(
        text,
        padding=True,
        truncation=True,
        return_offsets_mapping=True,
        return_tensors="pt"
    )

    offset_mapping = tokenized.offset_mapping[0].tolist()

    with torch.no_grad():
        input_ids = tokenized["input_ids"].to(device)
        attention_mask = tokenized["attention_mask"].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=2)

    # Get predictions for each token
    predictions = predictions[0].cpu().numpy()
    offset_mapping = tokenized["offset_mapping"][0].numpy()

    # Initialize list to store entities
    entities = []
    current_entity = None

    for idx, (pred, (start_char, end_char)) in enumerate(zip(predictions, offset_mapping)):
        # Skip special tokens ([CLS], [SEP], padding)
        if start_char == 0 and end_char == 0:
            continue

        pred_label = id2label[pred]

        # If we find a B- tag, start a new entity
        if pred_label.startswith("B-"):
            if current_entity:
                entities.append(current_entity)
            entity_text = text[start_char:end_char]
            current_entity = {
                "type": pred_label[2:],
                "start": start_char,
                "end": end_char,
                "text": entity_text
            }
        # If we find an I- tag, append to the current entity
        elif pred_label.startswith("I-") and current_entity:
            current_entity["end"] = end_char
            current_entity["text"] = text[current_entity["start"]:end_char]
        # If we find an O tag, close the current entity
        elif pred_label == "O" and current_entity:
            entities.append(current_entity)
            current_entity = None

    # Add the last entity if it exists
    if current_entity:
        entities.append(current_entity)

    return entities

# Example usage:
def test_prediction(model, tokenizer, text):
    print(f"\nInput text: {text}")
    entities = predict(text, model, tokenizer, device='cpu')
    print("\nDetected entities:")
    for entity in entities:
        print(f"- {entity['type']}: '{entity['text']}' (positions {entity['start']}:{entity['end']})")


In [10]:
# Load model and tokenizer
model = BertForTokenClassification.from_pretrained("bert-ner-finetuned")
tokenizer = AutoTokenizer.from_pretrained("bert-ner-finetuned")

# Test with various examples
test_texts = [
    "John works at Google in New York",
    "Microsoft CEO Satya Nadella visited Seattle",
    "Apple announced new products in California yesterday",
]

for text in test_texts:
    test_prediction(model, tokenizer, text)


Input text: John works at Google in New York

Detected entities:
- PER: 'John' (positions 0:4)
- ORG: 'Google' (positions 14:20)
- LOC: 'New York' (positions 24:32)

Input text: Microsoft CEO Satya Nadella visited Seattle

Detected entities:
- ORG: 'Microsoft' (positions 0:9)

Input text: Apple announced new products in California yesterday

Detected entities:
- ORG: 'Apple' (positions 0:5)
