In [1]:
# %pip install spacy spacy-conll
# %pip install transformers torch seqeval
# %%python -m spacy download en_core_web_sm    

In [2]:
import spacy
from spacy.training import Example
from spacy.scorer import Scorer
from spacy import displacy

from datasets import load_dataset, Sequence, ClassLabel

import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score

In [3]:
nlp = spacy.load("en_core_web_sm")
dataset = load_dataset("lhoestq/conll2003")

labels = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

test_data = dataset['validation']
test_data = test_data.cast_column(
    "ner_tags", 
    Sequence(feature=ClassLabel(names=labels))
)

label_names = test_data.features['ner_tags'].feature.names

In [4]:
y_true = []
y_pred = []
docs_for_display = []

print(f"Evaluando {len(dataset['validation'])} oraciones con seqeval...")

# 2. Procesamiento
for i, entry in enumerate(dataset['validation']):
    words = entry['tokens']
    # Etiquetas reales (mantenemos prefijos B- e I-)
    gold_tags = [label_names[tag] for tag in entry['ner_tags']]
    
    # Predicción de spaCy
    # Creamos un Doc a partir de los tokens existentes para evitar desajustes
    doc = spacy.tokens.Doc(nlp.vocab, words=words)
    for name, proc in nlp.pipeline:
        doc = proc(doc)
    
    # Extraer etiquetas predichas en formato BIO
    pred_tags = []
    for token in doc:
        if token.ent_iob_ == "O":
            pred_tags.append("O")
        else:
            # Reconstruir el formato B-LABEL o I-LABEL
            pred_tags.append(f"{token.ent_iob_}-{token.ent_type_}")

    y_true.append(gold_tags)
    y_pred.append(pred_tags)

    if i < 10:
        docs_for_display.append(doc)

Evaluando 3250 oraciones con seqeval...


In [5]:
print("\n" + "="*60)
print(f"{'RESULTADOS NER (seqeval + BIO)':^60}")
print("="*60)

# Seqeval detecta automáticamente el esquema IOB2
print(classification_report(y_true, y_pred))


               RESULTADOS NER (seqeval + BIO)               


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

    CARDINAL       0.00      0.00      0.00         0
        DATE       0.00      0.00      0.00         0
       EVENT       0.00      0.00      0.00         0
         FAC       0.00      0.00      0.00         0
         GPE       0.00      0.00      0.00         0
    LANGUAGE       0.00      0.00      0.00         0
         LAW       0.00      0.00      0.00         0
         LOC       0.72      0.03      0.05      1837
        MISC       0.00      0.00      0.00       922
       MONEY       0.00      0.00      0.00         0
        NORP       0.00      0.00      0.00         0
     ORDINAL       0.00      0.00      0.00         0
         ORG       0.38      0.29      0.33      1341
         PER       0.00      0.00      0.00      1842
     PERCENT       0.00      0.00      0.00         0
      PERSON       0.00      0.00      0.00         0
     PRODUCT       0.00      0.00      0.00         0
    QUANTITY       0.00    

In [6]:
displacy.render(docs_for_display, style="ent")



In [7]:
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
from datasets import load_dataset, Sequence, ClassLabel
from seqeval.metrics import classification_report
import torch
import logging
from typing import List, Dict

class NERProcessor:
    def __init__(self,
                 model_name: str = "dbmdz/bert-large-cased-finetuned-conll03-english",
                 confidence_threshold: float = 0.0): # Bajamos a 0.0 para evaluación comparativa
        self.confidence_threshold = confidence_threshold
        self.device = 0 if torch.cuda.is_available() else -1
        self.nlp_spacy = spacy.blank("en")
        self.ner_pipeline = pipeline("ner", 
                                     model=model_name,
                                     aggregation_strategy="simple",
                                     device=self.device)

    def process_entry(self, words: List[str]) -> List[str]:
        sentence = " ".join(words)
        entities = self.ner_pipeline(sentence)
        
        # Inicializamos etiquetas como 'O'
        predicted_labels = ["O"] * len(words)
        spacy_ents = []

        doc = spacy.tokens.Doc(self.nlp_spacy.vocab, words=words)
        
        for ent in entities:
            if ent['score'] < self.confidence_threshold:
                continue
                
            label_type = ent['entity_group']
            ent_text = ent['word'].replace("##", "").lower()
            
            # Lógica para asignar prefijos B- e I-
            first_match = True
            match_indices = []
            for i, word in enumerate(words):
                if ent_text in word.lower() or word.lower() in ent_text:
                    prefix = "B-" if first_match else "I-"
                    predicted_labels[i] = f"{prefix}{label_type}"
                    first_match = False

 
        
        doc.ents = [e for e in spacy_ents if e is not None]
        return predicted_labels, doc

# --- EJECUCIÓN CON DATASET ---

# 1. Carga y preparación (Casting necesario para labels)
dataset = load_dataset("lhoestq/conll2003", trust_remote_code=True)
labels_info = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
test_data = dataset['validation'].cast_column("ner_tags", Sequence(feature=ClassLabel(names=labels_info)))
label_names = test_data.features['ner_tags'].feature.names

# 2. Instanciar procesador
processor = NERProcessor()

y_true, y_pred, docs_to_show = [], [], []

print("Evaluando 3250 oraciones con NERProcessor y seqeval...")

# 3. Bucle de evaluación
for i in range(3250):
    entry = test_data[i]
    
    # Etiquetas reales (con prefijos)
    gold_labels = [label_names[tag] for tag in entry['ner_tags']]
    
    # Predicciones usando tu clase (reconstruyendo prefijos)
    pred_labels, doc = processor.process_entry(entry['tokens'])
    
    y_true.append(gold_labels)
    y_pred.append(pred_labels)

    if i < 10: docs_to_show.append(doc)

# 4. Reporte Final con seqeval
print("\n" + "="*60)
print(f"{'REPORT MÉTRICAS SEQEVAL':^60}")
print("="*60)
print(classification_report(y_true, y_pred))

`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'lhoestq/conll2003' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.





Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


Evaluando 3250 oraciones con NERProcessor y seqeval...

                  REPORT MÉTRICAS SEQEVAL                   
              precision    recall  f1-score   support

         LOC       0.65      0.87      0.75      1837
        MISC       0.58      0.83      0.69       922
         ORG       0.65      0.89      0.75      1341
         PER       0.73      0.91      0.81      1842

   micro avg       0.66      0.88      0.76      5942
   macro avg       0.65      0.88      0.75      5942
weighted avg       0.66      0.88      0.76      5942



In [8]:
print(y_pred[:10])

[['O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['B-LOC', 'O'], ['B-MISC', 'I-MISC', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'I-PER', 'O', 'O', 'B-ORG', 'O', 'B-ORG', 'O', 'I-MISC', 'O', 'O', 'O', 'O', 'I-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'B-ORG', 'O', 'B-ORG', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'I-ORG', 'O', 'O', 'O', 'O', 'I-ORG', 'O'], ['O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'O', 'B-PER', 'I-PER', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 