In [37]:
from haystack import Pipeline
from haystack.components.preprocessors import DocumentCleaner
from haystack.components.websearch import SerperDevWebSearch
from haystack.components.fetchers import LinkContentFetcher
from haystack.components.converters import HTMLToDocument
from haystack.components.routers import TransformersZeroShotTextRouter
from haystack import Pipeline
from haystack.components.extractors import NamedEntityExtractor
from haystack import component, Document
from typing import Any, Dict, List, Union

from dotenv import load_dotenv
import os

# Pull the variables defined in the local .env file into the process environment.
load_dotenv(".env")

# API credentials looked up from the environment; each is None when its
# variable is not set.
open_ai_key = os.environ.get("OPENAI_API_KEY")
serper_api_key = os.environ.get("SERPERDEV_API_KEY")


In [26]:
# Candidate news categories the zero-shot model chooses between.
news_categories = ["Politics", "Sport", "Technology", "Entertainment", "Business"]

# Zero-shot router shared by the NewsClassifier component defined below.
text_router = TransformersZeroShotTextRouter(
    labels=news_categories,
    model="MoritzLaurer/deberta-v3-large-zeroshot-v2.0",
)

# Prepare the router up front so it is ready before its first run() call.
text_router.warm_up()

@component
class NewsClassifier:
    """Haystack component that tags each document with a predicted news category.

    The text of every document is passed to the module-level ``text_router``;
    the first key of the mapping it returns is stored under ``meta['labels']``.
    Documents are annotated in place.
    """

    @component.output_types(documents=List[Document])
    def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
        """Label ``documents`` and hand them on to the next component.

        Args:
            documents: Documents to classify. A document whose ``content`` is
                not a non-blank string is passed through without a
                ``'labels'`` entry instead of aborting the whole batch.

        Returns:
            ``{"documents": documents}`` - the same list object that was
            passed in, with ``meta['labels']`` set on every classified document.
        """
        for document in documents:
            text = document.content

            # The router only accepts str input, and a blank string carries
            # nothing to classify, so leave such documents unlabelled.
            if not isinstance(text, str) or not text.strip():
                continue

            # The router's result is keyed by the chosen label; keep that key.
            routed = text_router.run(text)
            document.meta['labels'] = next(iter(routed))

        return {"documents": documents}

### Build the Haystack pipeline

In [32]:

# Empty pipeline that the components below are registered on.
pipeline = Pipeline()

# Web search limited to Yahoo Finance Canada, keeping the top 10 hits.
# NOTE(review): Haystack's own examples pass bare host names to
# allowed_domains (no scheme or trailing slash) - confirm this URL form is intended.
web_search = SerperDevWebSearch(
    allowed_domains=["https://ca.finance.yahoo.com/"],
    top_k=10,
)

# Downloads each result link: 10 second timeout, up to 3 retry attempts.
link_content = LinkContentFetcher(timeout=10, retry_attempts=3)

# Converts the fetched HTML into Haystack documents.
html_to_doc = HTMLToDocument()

# Tidies the extracted text before it is classified.
document_cleaner = DocumentCleaner(
    remove_substrings=['\n-'],
    remove_repeated_substrings=False,
    remove_extra_whitespaces=True,
    remove_empty_lines=True,
)

# Register every step under the name used in the connections below.
for step_name, step in (
    ("search", web_search),
    ("fetcher", link_content),
    ("htmldocument", html_to_doc),
    ("cleaner", document_cleaner),
    ("classifier", NewsClassifier()),
):
    pipeline.add_component(name=step_name, instance=step)

# Wire the steps into a linear chain: search -> fetch -> convert -> clean -> classify.
for sender, receiver in (
    ("search.links", "fetcher.urls"),
    ("fetcher", "htmldocument"),
    ("htmldocument", "cleaner"),
    ("cleaner", "classifier"),
):
    pipeline.connect(sender, receiver)

# Final expression: show the assembled pipeline as the cell output.
pipeline



<haystack.core.pipeline.pipeline.Pipeline object at 0x494af5cd0>
🚅 Components
  - search: SerperDevWebSearch
  - fetcher: LinkContentFetcher
  - htmldocument: HTMLToDocument
  - cleaner: DocumentCleaner
  - classifier: NewsClassifier
🛤️ Connections
  - search.links -> fetcher.urls (List[str])
  - fetcher.streams -> htmldocument.sources (List[ByteStream])
  - htmldocument.documents -> cleaner.documents (List[Document])
  - cleaner.documents -> classifier.documents (List[Document])

### Use the pipeline to search for articles related to Elon Musk and classify each one by topic

In [33]:
# Only the entry component ("search") is given input here; the remaining
# components are fed through the pipeline connections.
query = "Elon Musk"
search_inputs = {"search": {"query": query}}
output = pipeline.run(data=search_inputs)

In [34]:
# Documents emitted by the "classifier" component of the pipeline run.
classifier_result = output["classifier"]
extracted_documents = classifier_result["documents"]

In [None]:
# Print each document's metadata, one separator line per document.
# The article text itself is available as `document.content`.
for document in extracted_documents:
    print(document.meta)
    print("----")

{'content_type': 'text/html', 'url': 'https://ca.finance.yahoo.com/news/musk-ascends-political-force-beyond-204319247.html', 'labels': 'Politics'}
----
{'content_type': 'text/html', 'url': 'https://ca.finance.yahoo.com/news/elon-musk-spacex-face-federal-184331986.html', 'labels': 'Politics'}
----
{'content_type': 'text/html', 'url': 'https://ca.finance.yahoo.com/news/elon-musk-becomes-first-person-184648913.html', 'labels': 'Business'}
----
{'content_type': 'text/html', 'url': 'https://ca.finance.yahoo.com/news/elon-musks-net-worth-tops-143636708.html', 'labels': 'Business'}
----
{'content_type': 'text/html', 'url': 'https://ca.finance.yahoo.com/news/openai-fires-back-elon-musk-212307411.html', 'labels': 'Technology'}
----
{'content_type': 'text/html', 'url': 'https://ca.finance.yahoo.com/news/whining-elon-musk-squeals-sec-103752120.html', 'labels': 'Politics'}
----
{'content_type': 'text/html', 'url': 'https://ca.finance.yahoo.com/news/elon-musk-considers-funding-nigel-150306115.html'