In [6]:
!pip install transformers datasets torch scikit-learn seqeval





In [11]:
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from seqeval.metrics import classification_report, precision_score, recall_score, f1_score, accuracy_score

# -------------------------------
# 1. Load Dataset (Real-world News Data)
# -------------------------------
# CoNLL-2003 contains news articles
dataset = load_dataset("conll2003")

# Use small test subset for quick evaluation
# Explicitly convert the Dataset subset to a list of dictionaries (rows)
test_data = list(dataset["test"].select(range(200)))

# -------------------------------
# 2. Load Pretrained NER Model
# -------------------------------
model_name = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# -------------------------------
# 3. Convert labels to readable format
# -------------------------------
label_list = dataset["train"].features["ner_tags"].feature.names

def get_true_labels(tokens, ner_tags):
    return [label_list[tag] for tag in ner_tags]

# -------------------------------
# 4. Run Predictions
# -------------------------------
true_labels = []
pred_labels = []

for sample in test_data:
    tokens = sample["tokens"]
    true = get_true_labels(tokens, sample["ner_tags"])

    text = " ".join(tokens)
    preds = ner_pipeline(text)

    # Initialize predictions with "O"
    pred = ["O"] * len(tokens)

    # Calculate character offsets for the original tokens in the joined text
    token_char_offsets = []
    current_offset = 0
    for token in tokens:
        token_char_offsets.append((current_offset, current_offset + len(token)))
        current_offset += len(token) + 1 # +1 for the space

    # Fill predicted entities using IOB2 scheme
    for entity in preds:
        entity_label = entity["entity_group"]
        entity_start_char = entity["start"]
        entity_end_char = entity["end"]

        # Find the token indices that correspond to this entity's character span
        entity_token_indices = []
        for i, (token_start_char, token_end_char) in enumerate(token_char_offsets):
            # A token is considered part of an entity if there is any overlap
            if entity_start_char < token_end_char and entity_end_char > token_start_char:
                entity_token_indices.append(i)

        if entity_token_indices:
            # Assign B- tag to the first token of the entity
            pred[entity_token_indices[0]] = f"B-{entity_label}"
            # Assign I- tag to subsequent tokens of the entity
            for idx in entity_token_indices[1:]:
                pred[idx] = f"I-{entity_label}"

    true_labels.append(true)
    pred_labels.append(pred)

# -------------------------------
# 5. Evaluation Metrics
# -------------------------------
print("\nðŸ“Š NER Evaluation Results:\n")

# The seqeval accuracy_score expects a list of lists of labels for comparison
# It measures token-level accuracy (O vs. B-TAG, I-TAG)
# If a token is O in both true and pred, it's correct. If it's B-LOC in both, it's correct.
flat_true_labels = [label for sublist in true_labels for label in sublist]
flat_pred_labels = [label for sublist in pred_labels for label in sublist]
print("Accuracy :", accuracy_score([flat_true_labels], [flat_pred_labels]))

print("Precision:", precision_score(true_labels, pred_labels))
print("Recall   :\t", recall_score(true_labels, pred_labels))
print("F1-score :\t", f1_score(true_labels, pred_labels))

print("\nDetailed Report:\n")
print(classification_report(true_labels, pred_labels))




  0%|          | 0/3 [00:00<?, ?it/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0



ðŸ“Š NER Evaluation Results:

Accuracy : 0.9790291262135923
Precision: 0.8758620689655172
Recall   :	 0.9361179361179361
F1-score :	 0.9049881235154394

Detailed Report:

              precision    recall  f1-score   support

         LOC       0.98      0.98      0.98       130
        MISC       0.77      0.86      0.81        35
         ORG       0.84      0.88      0.86        49
         PER       0.84      0.93      0.88       193

   micro avg       0.88      0.94      0.90       407
   macro avg       0.86      0.91      0.88       407
weighted avg       0.88      0.94      0.91       407



In [12]:
custom_text = """
Apple CEO Tim Cook visited India and met Prime Minister Narendra Modi in New Delhi.
"""

print("\nCustom NER Output:")
for ent in ner_pipeline(custom_text):
    print(ent["word"], "â†’", ent["entity_group"])



Custom NER Output:
Apple â†’ ORG
Tim Cook â†’ PER
India â†’ LOC
Na â†’ PER
##rendra Modi â†’ PER
New Delhi â†’ LOC
