In [6]:
from haystack import Pipeline
from haystack.components.preprocessors import DocumentCleaner
from haystack.components.websearch import SerperDevWebSearch
from haystack.components.fetchers import LinkContentFetcher
from haystack.components.converters import HTMLToDocument
from haystack.components.writers import DocumentWriter
from haystack import Pipeline
from haystack.components.extractors import NamedEntityExtractor


from dotenv import load_dotenv
import os

# Load credentials from the local .env file into the process environment.
load_dotenv(".env")
open_ai_key = os.getenv("OPENAI_API_KEY")
serper_api_key = os.getenv("SERPERDEV_API_KEY")
# NOTE(review): neither key variable is handed to a component in this cell;
# presumably SerperDevWebSearch reads SERPERDEV_API_KEY from the environment
# on its own -- confirm before removing these lookups.

# Initialize pipeline
pipeline = Pipeline()

# Web search restricted to Britannica, keeping the top 5 hits.
# NOTE(review): the entry below is a full URL rather than a bare domain
# (e.g. "britannica.com"); the captured run still returned britannica.com
# pages, so it is left as-is -- confirm against the component's docs.
web_search = SerperDevWebSearch(
    top_k=5,
    allowed_domains=["https://www.britannica.com/"],
)

# Downloads the pages behind the search links.
link_content = LinkContentFetcher(retry_attempts=3, timeout=10)

# Turns the fetched HTML into Document objects.
html_to_doc = HTMLToDocument()

# Normalises whitespace/empty lines and strips the literal "\n-" sequence.
document_cleaner = DocumentCleaner(
    remove_empty_lines=True,
    remove_extra_whitespaces=True,
    remove_repeated_substrings=False,
    remove_substrings=["\n-"],
)

# NER model is loaded eagerly here; this is what emits the
# "Some weights of the model checkpoint ..." notice in the cell output.
extractor = NamedEntityExtractor(backend="hugging_face", model="dslim/bert-base-NER")
extractor.warm_up()

# Add components (registration order matches the data flow).
for component_name, component in (
    ("search", web_search),
    ("fetcher", link_content),
    ("htmldocument", html_to_doc),
    ("cleaner", document_cleaner),
    ("extractor", extractor),
):
    pipeline.add_component(name=component_name, instance=component)

# Connect components to one another:
# search -> fetcher -> htmldocument -> cleaner -> extractor
for sender, receiver in (
    ("search.links", "fetcher.urls"),
    ("fetcher", "htmldocument"),
    ("htmldocument", "cleaner"),
):
    pipeline.connect(sender, receiver)

# Kept as the cell's final expression so the notebook renders the
# pipeline's component/connection summary.
pipeline.connect("cleaner", "extractor")


Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<haystack.core.pipeline.pipeline.Pipeline object at 0x30d70ef60>
🚅 Components
  - search: SerperDevWebSearch
  - fetcher: LinkContentFetcher
  - htmldocument: HTMLToDocument
  - cleaner: DocumentCleaner
  - extractor: NamedEntityExtractor
🛤️ Connections
  - search.links -> fetcher.urls (List[str])
  - fetcher.streams -> htmldocument.sources (List[ByteStream])
  - htmldocument.documents -> cleaner.documents (List[Document])
  - cleaner.documents -> extractor.documents (List[Document])

In [7]:
# Question sent to the web-search component; everything downstream
# (fetching, HTML conversion, cleaning, NER) is driven by its results.
query = "What can you tell me about the year of the dragon?"

# Only the "search" component needs explicit input; the other components
# receive their inputs through the pipeline connections.
output = pipeline.run(
    data={
        "search": {"query": query},
    }
)

In [8]:
# Documents emitted by the final ("extractor") stage of the pipeline run.
extractor_output = output["extractor"]
extracted_documents = extractor_output["documents"]

In [9]:
# Show the metadata of the first returned document. In the captured run it
# holds the content type, the source URL and a "named_entities" list of
# NamedEntityAnnotation items (entity label, start/end offsets, score).
extracted_documents[0].meta

{'content_type': 'text/html',
 'url': 'https://www.britannica.com/topic/Chinese-New-Year',
 'named_entities': [NamedEntityAnnotation(entity='MISC', start=0, end=16, score=np.float32(0.8706437)),
  NamedEntityAnnotation(entity='MISC', start=30, end=44, score=np.float32(0.91554195)),
  NamedEntityAnnotation(entity='LOC', start=61, end=66, score=np.float32(0.9186233)),
  NamedEntityAnnotation(entity='MISC', start=67, end=74, score=np.float32(0.66412055)),
  NamedEntityAnnotation(entity='MISC', start=82, end=96, score=np.float32(0.8688859)),
  NamedEntityAnnotation(entity='MISC', start=121, end=137, score=np.float32(0.9623936)),
  NamedEntityAnnotation(entity='LOC', start=165, end=170, score=np.float32(0.99973375)),
  NamedEntityAnnotation(entity='MISC', start=175, end=182, score=np.float32(0.99974746)),
  NamedEntityAnnotation(entity='MISC', start=311, end=318, score=np.float32(0.9997546)),
  NamedEntityAnnotation(entity='MISC', start=414, end=428, score=np.float32(0.9671516)),
  NamedEnt

In [10]:
# Show the cleaned text of the first returned document. The start/end offsets
# in its "named_entities" metadata appear to index into this string (e.g.
# 0-16 covers "Chinese New Year" in the captured run) -- confirm before
# relying on that for slicing.
extracted_documents[0].content

'Chinese New Year Also called: Lunar New Year Related Topics: China Chinese zodiac Lunar New Year January February\nNews •\nChinese New Year, annual 15-day festival in China and Chinese communities around the world that begins with the new moon that occurs sometime between January 21 and February 20 according to Western calendars. Festivities last until the following full moon.\nThe holiday is sometimes called the Lunar New Year because the dates of celebration follow the phases of the moon. Since the mid-1990s people in China have been given seven consecutive days off work during the Chinese New Year. This week of relaxation has been designated Spring Festival, a term that is sometimes used to refer to the Chinese New Year in general.\nThe origins of the Chinese New Year are steeped in legend. One legend is that thousands of years ago a monster named Nian (“Year”) would attack villagers at the beginning of each new year. The monster was afraid of loud noises, bright lights, and the co