In [None]:
from haystack.components.extractors import NamedEntityExtractor
import pandas as pd
from haystack.dataclasses import Document

In [None]:
def extract_named_entities_with_idsl(documents, min_score=0.8):
    """Collect the high-confidence named entities of each document.

    For every document, the annotations stored under
    ``document.meta['named_entities']`` are read. Annotations scoring below
    ``min_score`` and annotations whose type is not LOC, PER or ORG (e.g.
    MISC) are dropped. The text of each remaining annotation is sliced out of
    ``document.content`` and grouped by entity type.

    Args:
        documents (list): Objects exposing ``id``, ``content`` and ``meta``.
            Each annotation in ``meta['named_entities']`` must expose
            ``entity``, ``start``, ``end`` and ``score``.
        min_score (float): Minimum score an annotation needs in order to be
            kept. Defaults to 0.8.

    Returns:
        extracted_data (list): One dictionary per input document with the
            keys ``document_id``, ``content`` and ``meta``. ``meta`` maps each
            of ``LOC``, ``PER`` and ``ORG`` to a comma-separated string of the
            unique entity texts in order of first appearance (an empty string
            when the document has none of that type).
    """
    entity_types = ("LOC", "PER", "ORG")
    extracted_data = []

    for document in documents:
        content = document.content

        # Dicts are used as ordered sets: keys are unique and keep insertion
        # order, so the joined strings are identical on every run (iterating
        # a set of strings is not stable across interpreter sessions).
        entities_by_type = {entity_type: {} for entity_type in entity_types}

        for entity in document.meta.get('named_entities', []):
            if float(entity.score) < min_score:
                continue

            # Types outside LOC/PER/ORG (including MISC) have no bucket.
            bucket = entities_by_type.get(entity.entity)
            if bucket is None:
                continue

            bucket[content[entity.start:entity.end]] = None

        extracted_data.append({
            'document_id': document.id,
            'content': content,
            'meta': {
                entity_type: ",".join(entities_by_type[entity_type])
                for entity_type in entity_types
            },
        })

    return extracted_data

### Initialize the Named Entity Extractor


In [16]:
# Named-entity component backed by a Hugging Face token-classification
# checkpoint (dslim/bert-base-NER).
extractor = NamedEntityExtractor(
    backend="hugging_face",
    model="dslim/bert-base-NER",
)

# Prepare the extractor before it is applied to documents; the checkpoint
# loading message in this cell's output is emitted during this cell.
extractor.warm_up()

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Load the input texts; every row of the "Text" column becomes one document.
df = pd.read_csv("df_file.csv")
sample_docs = df['Text'].to_list()

# The row position (as a string) is used as the document id.
documents = [Document(id=str(i), content=text) for i, text in enumerate(sample_docs)]

# Apply extractor to the documents. The annotated documents are taken from
# the component's return value instead of relying on the inputs being mutated
# in place; assigning the result also keeps the notebook from echoing every
# document as cell output.
documents = extractor.run(documents)["documents"]


In [None]:
# Turn the annotated documents into plain records
# (document_id, content, meta with LOC/PER/ORG strings).
extracted_documents = extract_named_entities_with_idsl(documents)

# One row per document. NOTE: the 'meta' column holds a dict per row, which
# to_csv writes as that dict's string representation.
df = pd.DataFrame(data=extracted_documents)
df.to_csv(path_or_buf="ner_output.csv", index=False)