In [13]:
from haystack import Pipeline
from haystack.components.preprocessors import DocumentCleaner
from haystack.components.websearch import SerperDevWebSearch
from haystack.components.fetchers import LinkContentFetcher
from haystack.components.converters import HTMLToDocument
from haystack.components.writers import DocumentWriter
from haystack import Pipeline
from haystack.components.extractors import NamedEntityExtractor
from haystack import component, Document
from typing import Any, Dict, List, Union

from dotenv import load_dotenv
import os

load_dotenv(".env")
open_ai_key = os.getenv("OPENAI_API_KEY")
serper_api_key = os.getenv("SERPERDEV_API_KEY")


### Define custom component

In [None]:
@component
class NERPopulator():
    """This function extracts named entities from a list of
    documents and returns the result in a structured format.

    Args:
        documents (list): List of Haystack Document objects

    Returns:
        extracted_data (list): A list of dictionaries containing the extracted entities, 
        to make it Haystack-compatible we will return this list as a dictionary with the key 'documents'
    """
    
    @component.output_types(documents=List[Document])
    def run(self, sources: List[Document]) -> None:
        extracted_data = []

        for document in sources:
            content = document.content
            doc_id = document.id
            named_entities = document.meta.get('named_entities', [])
            url = document.meta.get('url', 'N/A')  # Default to 'N/A' if URL is not available

            # Sets to store unique entities by type
            entities_by_type = {
                "LOC": set(),
                "PER": set(),
                "ORG": set(),
                "MISC": set()
            }
            
            # Loop through the entities and filter by score and type
            for entity in named_entities:
                if float(entity.score) < 0.8:
                    continue
                
                word = content[entity.start:entity.end]
                if entity.entity in entities_by_type:
                    entities_by_type[entity.entity].add(word)  # Use set to ensure uniqueness
            
            # Prepare the meta field with comma-separated values
            meta = {
                "LOC": ",".join(entities_by_type["LOC"]),
                "PER": ",".join(entities_by_type["PER"]),
                "ORG": ",".join(entities_by_type["ORG"]),
                "MISC": ",".join(entities_by_type["MISC"]),
                "url": url
            }
            
            # Append the result for this document
            extracted_data.append({
                'document_id': doc_id,
                'content': content,
                'meta': meta
            })
        

        return {"documents": extracted_data}


### Build Haystack pipeline with custom component

In [75]:

# Initialize pipeline
pipeline = Pipeline()
web_search = SerperDevWebSearch(top_k=10,
                                allowed_domains=["https://ca.finance.yahoo.com/"])
link_content = LinkContentFetcher(retry_attempts=3,
                                  timeout=10)
html_to_doc = HTMLToDocument()
document_cleaner = DocumentCleaner(
                                remove_empty_lines=True,
                                remove_extra_whitespaces=True,
                                remove_repeated_substrings=False,
                                remove_substrings=['\n-']
                            )
extractor = NamedEntityExtractor(backend="hugging_face", model="dslim/bert-base-NER")
extractor.warm_up()

ner_component = NERPopulator()

# Add components
pipeline.add_component(name='search', instance=web_search)
pipeline.add_component(name ='fetcher' , instance= link_content)
pipeline.add_component(name='htmldocument', instance=html_to_doc)
pipeline.add_component(name='cleaner', instance=document_cleaner)
pipeline.add_component(name='extractor', instance=extractor)
pipeline.add_component(name='ner', instance=ner_component)

# Connect components to one another
pipeline.connect("search.links", "fetcher.urls")
pipeline.connect("fetcher", "htmldocument")
pipeline.connect("htmldocument", "cleaner")
pipeline.connect("cleaner", "extractor")
pipeline.connect("extractor", "ner")


Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<haystack.core.pipeline.pipeline.Pipeline object at 0x3057230b0>
🚅 Components
  - search: SerperDevWebSearch
  - fetcher: LinkContentFetcher
  - htmldocument: HTMLToDocument
  - cleaner: DocumentCleaner
  - extractor: NamedEntityExtractor
  - ner: NERPopulator
🛤️ Connections
  - search.links -> fetcher.urls (List[str])
  - fetcher.streams -> htmldocument.sources (List[ByteStream])
  - htmldocument.documents -> cleaner.documents (List[Document])
  - cleaner.documents -> extractor.documents (List[Document])
  - extractor.documents -> ner.sources (List[Document])

### Use pipeline to search for all articles related to Elon Musk and extract entities

In [76]:
query = "Elon Musk"
output = pipeline.run(data={"search":{"query":query}})

In [77]:
extracted_documents = output['ner']['documents']

### Save document as DataFrame object

In [78]:
import pandas as pd
df = pd.DataFrame(extracted_documents)

df

Unnamed: 0,document_id,content,meta
0,5fcf252d1a589288db70c6252d42b040bbbe8b1647211a...,Musk ascends as a political force beyond his w...,"{'LOC': 'California,Washington,America', 'PER'..."
1,f08ab77c1b78b84b150b74d800e6fe8e03e1ef792a3402...,WASHINGTON (Reuters) -Elon Musk and SpaceX fac...,"{'LOC': 'Washington,U.S.,Israel', 'PER': 'Joe ..."
2,830b24f764cb2af6949f3a9ff77c9c310faefcb923174e...,In This Article:\nThe richest person in the wo...,"{'LOC': '', 'PER': 'Trump,V,k Ramaswamy,Jeff,i..."
3,02d1bb7493336996c815d9b5c5feb154385410e5ebaf95...,(Reuters) - Tesla CEO Elon Musk's net worth cr...,"{'LOC': '', 'PER': 'Sophia,Chandra,Jeff Be,on ..."
4,a2f4a4d3ed4d36701eab9c8fbf5da39a274ce1a98608ea...,LONDON (AP) — It’s a photo that sent a tremor ...,"{'LOC': 'Florida,U.K.,Mar-a-Lago,K,U,Britain,L..."
5,21f41abc36296146fb7652a6a558cbe699d20bc092a318...,By Mike Scarcella\n(Reuters) -OpenAI asked a f...,"{'LOC': 'California,Oakland,U.S', 'PER': 'on M..."
6,75a09a359ac2d1c5d1ab1fe884a5e332b27fe33cbc57cb...,Elon Musk has demanded to know if President Jo...,"{'LOC': 'White House', 'PER': 'Spiro,Gensler,T..."
7,6e0cad97242cda6c3c99eea3d22ee378efe8e1efdad597...,A 7-year-old rivalry between tech leaders Elon...,"{'LOC': 'California,Oakland,U.S', 'PER': 'Sam ..."
8,d5de664e379522c362aead76d58cfe8991bf57737c5702...,"In This Article:\nBy Tom Hals\nWILMINGTON, Del...","{'LOC': 'U.S.,Delaware', 'PER': 'Mu,Kathale,en..."
9,1eeb07b5eb740d1717c5b1e86606c1eb1bdca9867701b5...,"MUSCAT, Oman, Dec. 19, 2024 /CNW/ -- Oman Inve...","{'LOC': 'Oman,United States', 'PER': 'hidi,Abd..."


In [79]:
extracted_documents

[{'document_id': '5fcf252d1a589288db70c6252d42b040bbbe8b1647211a465891d3e773257f80',
  'meta': {'LOC': 'California,Washington,America',
   'PER': 'Mike Johnson,Thomas Beaumont,Pack,on Musk,Trump,Chris Pack,Johnson,Robert Garcia,Mu,Donald Trump',
   'ORG': 'X,House,Tesla and Space X,PA,National Republican Congressional Committee,Senate Leadership Fund',
   'MISC': 'Rep,Democrats,Deal,Democratic,GOP,Republicans',
   'url': 'https://ca.finance.yahoo.com/news/musk-ascends-political-force-beyond-204319247.html'}},
 {'document_id': 'f08ab77c1b78b84b150b74d800e6fe8e03e1ef792a3402c233c59a435e1e31a4',
  'content': 'WASHINGTON (Reuters) -Elon Musk and SpaceX face at least three federal reviews over whether they have complied with federal reporting rules aimed at protecting national security, the New York Times reported on Tuesday.\nThis is not the first time the security practices of the SpaceX founder have been questioned by the Pentagon.\nA 2018 incident in which Musk, who is also CEO of Tesla