In [13]:
from haystack import Pipeline
from haystack.components.preprocessors import DocumentCleaner
from haystack.components.websearch import SerperDevWebSearch
from haystack.components.fetchers import LinkContentFetcher
from haystack.components.converters import HTMLToDocument
from haystack.components.writers import DocumentWriter
from haystack import Pipeline
from haystack.components.extractors import NamedEntityExtractor
from haystack import component, Document
from typing import Any, Dict, List, Union

from dotenv import load_dotenv
import os

load_dotenv(".env")
open_ai_key = os.getenv("OPENAI_API_KEY")
serper_api_key = os.getenv("SERPERDEV_API_KEY")


### Define custom component

In [None]:
@component
class NERPopulator():
    """This function extracts named entities from a list of
    documents and returns the result in a structured format.

    Args:
        documents (list): List of Haystack Document objects

    Returns:
        extracted_data (list): A list of dictionaries containing the extracted entities, 
        to make it Haystack-compatible we will return this list as a dictionary with the key 'documents'
    """
    
    @component.output_types(documents=List[Document])
    def run(self, sources: List[Document]) -> None:
        extracted_data = []

        for document in sources:
            content = document.content
            doc_id = document.id
            named_entities = document.meta.get('named_entities', [])
            url = document.meta.get('url', 'N/A')  # Default to 'N/A' if URL is not available

            # Sets to store unique entities by type
            entities_by_type = {
                "LOC": set(),
                "PER": set(),
                "ORG": set(),
                "MISC": set()
            }
            
            # Loop through the entities and filter by score and type
            for entity in named_entities:
                if float(entity.score) < 0.8:
                    continue
                
                word = content[entity.start:entity.end]
                if entity.entity in entities_by_type:
                    entities_by_type[entity.entity].add(word)  # Use set to ensure uniqueness
            
            # Prepare the meta field with comma-separated values
            meta = {
                "LOC": ",".join(entities_by_type["LOC"]),
                "PER": ",".join(entities_by_type["PER"]),
                "ORG": ",".join(entities_by_type["ORG"]),
                "MISC": ",".join(entities_by_type["MISC"]),
                "url": url
            }
            
            # Append the result for this document
            extracted_data.append({
                'document_id': doc_id,
                'content': content,
                'meta': meta
            })
        

        return {"documents": extracted_data}


### Build Haystack pipeline with custom component

In [43]:

# Initialize pipeline
pipeline = Pipeline()
web_search = SerperDevWebSearch(top_k=5,
                                allowed_domains=["https://www.britannica.com/"])
link_content = LinkContentFetcher(retry_attempts=3,
                                  timeout=10)
html_to_doc = HTMLToDocument()
document_cleaner = DocumentCleaner(
                                remove_empty_lines=True,
                                remove_extra_whitespaces=True,
                                remove_repeated_substrings=False,
                                remove_substrings=['\n-']
                            )
extractor = NamedEntityExtractor(backend="hugging_face", model="dslim/bert-base-NER")
extractor.warm_up()

ner_component = NERPopulator()

# Add components
pipeline.add_component(name='search', instance=web_search)
pipeline.add_component(name ='fetcher' , instance= link_content)
pipeline.add_component(name='htmldocument', instance=html_to_doc)
pipeline.add_component(name='cleaner', instance=document_cleaner)
pipeline.add_component(name='extractor', instance=extractor)
pipeline.add_component(name='ner', instance=ner_component)

# Connect components to one another
pipeline.connect("search.links", "fetcher.urls")
pipeline.connect("fetcher", "htmldocument")
pipeline.connect("htmldocument", "cleaner")
pipeline.connect("cleaner", "extractor")
pipeline.connect("extractor", "ner")


Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<haystack.core.pipeline.pipeline.Pipeline object at 0x33a2bb110>
🚅 Components
  - search: SerperDevWebSearch
  - fetcher: LinkContentFetcher
  - htmldocument: HTMLToDocument
  - cleaner: DocumentCleaner
  - extractor: NamedEntityExtractor
  - ner: NERPopulator
🛤️ Connections
  - search.links -> fetcher.urls (List[str])
  - fetcher.streams -> htmldocument.sources (List[ByteStream])
  - htmldocument.documents -> cleaner.documents (List[Document])
  - cleaner.documents -> extractor.documents (List[Document])
  - extractor.documents -> ner.sources (List[Document])

### Use pipeline to search Encyclopedia Britannica for all articles related to Elon Musk and extract entities

In [44]:
query = "Elon Musk"
output = pipeline.run(data={"search":{"query":query}})

In [45]:
extracted_documents = output['ner']['documents']

### Save document as DataFrame object

In [49]:
import pandas as pd
df = pd.DataFrame(extracted_documents)

df

Unnamed: 0,document_id,content,meta
0,98b3d89f61ff370a43e5ba5c19c1de3522851ded6feed6...,Elon Musk cofounded the electronic payment fir...,"{'LOC': '', 'PER': '', 'ORG': 'X,Neuralink,Pay..."
1,1db89fda1c1c46e3c66e85aefd8e4d29c8c10d3e3c9e11...,SpaceX In full: Space Exploration Technologies...,"{'LOC': 'Hawthorne,Earth,U.S.,California', 'PE..."
2,bf226b982f01f238d32d66213fb8628104153b23b832bf...,"SpaceX, in full Space Exploration Technologies...","{'LOC': 'Hawthorne,Earth,U.S.,California', 'PE..."
3,ae773eb9d70c2c5a7a244895bec15745cefdeef14ef3ff...,Falcon Heavy\nLearn about this topic in these ...,"{'LOC': 'U.S,United States', 'PER': '', 'ORG':..."
4,8532677167dc67f0e2c6ba7ed206c89cda5c6b3dde3ea1...,Falcon Related Topics: launch vehicle Falcon H...,"{'LOC': 'Florida,Kwajalein Atoll,S,Cape Canave..."
