In [1]:
# %pip install spacy spacy-conll
# %pip install transformers torch seqeval
# %%python -m spacy download en_core_web_sm    

In [2]:
import spacy
from spacy.training import Example
from spacy.scorer import Scorer


import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score

In [3]:
def read_conll_file(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read().strip()
        sentences = content.split("\n\n")
        data = []
        
        for sentence in sentences:
            tokens = sentence.split("\n")
            token_data = []
            for token in tokens:
                parts = token.split() # Divide por cualquier espacio en blanco
                if len(parts) >= 4:
                    # parts[0] es la palabra, parts[3] es la etiqueta NER completa
                    word = parts[0]
                    full_label = parts[3] 
                    
                    # Quitamos los prefijos B- o I- para que no ensucien la métrica
                    clean_label = full_label.replace("B-", "").replace("I-", "")
                    
                    token_data.append([word, clean_label])
            if token_data:
                data.append(token_data)
    return data

In [4]:
def get_spacy_format(data):
    """Convierte el formato de columnas CoNLL a (texto, {'entities': [(s, e, label)]})"""
    formatted_data = []
    for sentence in data:
        full_text = ""
        entities = []
        current_pos = 0
        
        for token_parts in sentence:
            # En eng.testa: [0]=Palabra, [-1]=Etiqueta NER
            word = token_parts[0]
            ner_tag = token_parts[-1]
            
            start = current_pos
            end = start + len(word)
            full_text += word + " "
            current_pos = end + 1 # +1 por el espacio
            
            if ner_tag != 'O':
                # Limpiamos el prefijo B- o I- para dejar solo la etiqueta (LOC, ORG, etc.)
                label = ner_tag.split('-')[-1]
                entities.append((start, end, label))
        
        formatted_data.append((full_text.strip(), {"entities": entities}))
    return formatted_data

In [5]:
nlp = spacy.load("en_core_web_sm")
raw_data = read_conll_file(r"C:\Users\arbed\Downloads\archive\conll2003\eng.testa")
processed_data = get_spacy_format(raw_data)

In [6]:
scorer = Scorer()
examples = []

print(f"Procesando {len(processed_data)} oraciones...")

for text, annotations in processed_data:
    # El modelo predice sobre el texto
    doc_pred = nlp(text)
    # Creamos el objeto Example para comparar predicción vs realidad
    try:
        example = Example.from_dict(doc_pred, annotations)
        examples.append(example)
    except Exception as e:
        # Ignorar si hay desajustes menores en tokens
        continue

Procesando 3466 oraciones...


In [7]:
scores = scorer.score(examples)

print("\n" + "="*50)
print(f"{'RESULTADOS NER (en_core_web_sm)':^50}")
print("="*50)

# Métricas por tipo de entidad
print(f"\n{'Entidad':<15} | {'Precisión':<10} | {'Recall':<10} | {'F1-Score':<10}")
print("-" * 50)

for label, m in scores["ents_per_type"].items():
    print(f"{label:<15} | {m['p']:>9.2%} | {m['r']:>9.2%} | {m['f']:>9.2%}")

# Métricas globales
print("-" * 50)
print(f"GLOBAL F1-SCORE:  {scores['ents_f']:>8.2%}")
print(f"GLOBAL PRECISION: {scores['ents_p']:>8.2%}")
print(f"GLOBAL RECALL:    {scores['ents_r']:>8.2%} (Exactitud)")
print("="*50)


         RESULTADOS NER (en_core_web_sm)          

Entidad         | Precisión  | Recall     | F1-Score  
--------------------------------------------------
ORG             |    28.56% |    13.53% |    18.36%
GPE             |     0.00% |     0.00% |     0.00%
DATE            |     0.00% |     0.00% |     0.00%
LOC             |    61.97% |     2.10% |     4.06%
NORP            |     0.00% |     0.00% |     0.00%
PERSON          |     0.00% |     0.00% |     0.00%
CARDINAL        |     0.00% |     0.00% |     0.00%
FAC             |     0.00% |     0.00% |     0.00%
MISC            |     0.00% |     0.00% |     0.00%
PER             |     0.00% |     0.00% |     0.00%
TIME            |     0.00% |     0.00% |     0.00%
ORDINAL         |     0.00% |     0.00% |     0.00%
LANGUAGE        |     0.00% |     0.00% |     0.00%
LAW             |     0.00% |     0.00% |     0.00%
MONEY           |     0.00% |     0.00% |     0.00%
EVENT           |     0.00% |     0.00% |     0.00%
PERCENT  

In [8]:
model_name = "dslim/bert-base-NER" # Modelo optimizado para CoNLL-2003
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
nlp_bert = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

def read_conll_file(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read().strip()
        sentences = content.split("\n\n")
        data = []
        for sentence in sentences:
            tokens = sentence.split("\n")
            words, labels = [], []
            for token in tokens:
                parts = token.split()
                if len(parts) > 0:
                    words.append(parts[0])
                    labels.append(parts[-1])
            if words:
                data.append({"words": words, "labels": labels})
    return data

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


In [9]:
raw_data = read_conll_file(r"C:\Users\arbed\Downloads\archive\conll2003\eng.testa")
y_true = []
y_pred = []

print(f"Evaluando BERT sobre {len(raw_data)} oraciones...")

Evaluando BERT sobre 3466 oraciones...


In [10]:
for entry in raw_data[:200]: # Limitamos a 200 para rapidez, puedes quitar el slice
    sentence = " ".join(entry['words'])
    gold_labels = entry['labels']
    
    # Predicción de BERT
    outputs = nlp_bert(sentence)
    
    # BERT devuelve entidades encontradas, necesitamos mapearlas 
    # de nuevo a la estructura original de palabras para comparar
    predicted_labels = ["O"] * len(entry['words'])
    
    for ent in outputs:
        # Buscamos a qué palabra(s) corresponde la entidad detectada
        # (Aproximación simple por coincidencia de texto)
        for i, word in enumerate(entry['words']):
            if ent['word'] in word or word in ent['word']:
                predicted_labels[i] = ent['entity_group']
    
    # Estandarizar etiquetas (BERT usa 'PER', el dataset 'B-PER')
    y_true.append(gold_labels)
    # Limpiamos prefijos B- e I- de la verdad para comparar con la salida simple de BERT
    # y_true[-1] = [label.split('-')[-1] for label in y_true[-1]]
    y_pred.append(predicted_labels)

In [11]:
target_names = sorted(list(set([lbl for sublist in y_true for lbl in sublist])))

In [12]:
print("\n" + "="*50)
print(f"{'REPORT DE MÉTRICAS BERT NER':^50}")
print("="*50)

print(classification_report(y_true, y_pred))

print("-" * 50)
print(f"Global F1-Score:  {f1_score(y_true, y_pred):.2%}")
print(f"Global Precision: {precision_score(y_true, y_pred):.2%}")
print(f"Global Recall:    {recall_score(y_true, y_pred):.2%}")
print("="*50)


           REPORT DE MÉTRICAS BERT NER            
              precision    recall  f1-score   support

          ER       0.00      0.00      0.00         0
         ISC       0.00      0.00      0.00         0
         LOC       0.00      0.00      0.00       110
        MISC       0.00      0.00      0.00        27
          OC       0.00      0.00      0.00         0
         ORG       0.00      0.00      0.00       119
         PER       0.00      0.00      0.00       108

   micro avg       0.00      0.00      0.00       364
   macro avg       0.00      0.00      0.00       364
weighted avg       0.00      0.00      0.00       364

--------------------------------------------------
Global F1-Score:  0.00%
Global Precision: 0.00%
Global Recall:    0.00%


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
