In [None]:
import spacy
from spacy.pipeline import EntityRecognizer
from spacy.tokens import Doc, Span
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

# Directory holding the saved model and tokenizer files
model_name = "../bert-large-mp-local"

# Restore the tokenizer and the token-classification model from that directory
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Build the Hugging Face NER pipeline around the loaded model/tokenizer pair
ner_pipeline = pipeline(
    task="ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)




In [9]:
# Function to process text with Hugging Face NER model
def hf_ner_pipe(doc):
    """Annotate a spaCy ``Doc`` with entities predicted by ``ner_pipeline``.

    The document text is run through the module-level Hugging Face pipeline.
    Each prediction's ``"start"``/``"end"`` character offsets and
    ``"entity_group"`` label are converted to a span with ``doc.char_span``.
    Predictions for which ``char_span`` returns ``None`` are skipped, and the
    surviving spans are assigned to ``doc.ents``.

    Args:
        doc: The spaCy document to annotate (modified in place).

    Returns:
        The same ``doc`` object, so it can be used as a pipeline component.
    """
    predictions = ner_pipeline(doc.text)

    # char_span works on character offsets, so no manual token-index
    # arithmetic is needed; it yields None when it cannot build a span.
    candidate_spans = (
        doc.char_span(pred["start"], pred["end"], label=pred["entity_group"])
        for pred in predictions
    )

    # Keep only the predictions that produced a usable span.
    doc.ents = [span for span in candidate_spans if span is not None]
    return doc

# Create a blank spaCy pipeline (tokenizer only, no trained components)
nlp = spacy.blank("en")

# Add the custom NER component at the front of the pipeline.
# spaCy 2.x accepts the callable itself; spaCy 3.x+ rejects a callable with a
# ValueError and expects the string name of a registered component instead.
if int(spacy.__version__.split(".")[0]) < 3:
    nlp.add_pipe(hf_ner_pipe, name="hf_ner", first=True)
else:
    spacy.language.Language.component("hf_ner", func=hf_ner_pipe)
    nlp.add_pipe("hf_ner", first=True)

# Test it
text = "Lorem ipsum is a dummy or placeholder text commonly used in graphic design, publishing, and web development to fill empty spaces in a layout that does not yet have content.."
doc = nlp(text)

# Print extracted entities as "<text> <label>"
for ent in doc.ents:
    print(ent.text, ent.label_)

# Visualize with displaCy (rendered inline in the notebook)
from spacy import displacy
displacy.render(doc, style="ent", jupyter=True)


Lorem ipsum is a dummy or placeholder text commonly used in graphic design, publishing, and web development to fill empty spaces in a layout that does not yet have content.. LABEL_1
