In [6]:
from haystack import Pipeline
from haystack.components.preprocessors import DocumentCleaner
from haystack.components.websearch import SerperDevWebSearch
from haystack.components.fetchers import LinkContentFetcher
from haystack.components.converters import HTMLToDocument
from haystack.components.writers import DocumentWriter
from haystack import Pipeline
from haystack.components.extractors import NamedEntityExtractor


from dotenv import load_dotenv
import os

# Load the local .env file before reading any keys from the environment.
load_dotenv(".env")

# Both lookups yield None when the variable is not set.
open_ai_key = os.environ.get("OPENAI_API_KEY")
serper_api_key = os.environ.get("SERPERDEV_API_KEY")

# Build the pipeline object and every component it will host.
pipeline = Pipeline()

web_search = SerperDevWebSearch(
    top_k=5,
    allowed_domains=["https://www.britannica.com/"],
)
link_content = LinkContentFetcher(retry_attempts=3, timeout=10)
html_to_doc = HTMLToDocument()
document_cleaner = DocumentCleaner(
    remove_empty_lines=True,
    remove_extra_whitespaces=True,
    remove_repeated_substrings=False,
    remove_substrings=['\n-'],
)

# The extractor is warmed up explicitly before it is registered.
extractor = NamedEntityExtractor(backend="hugging_face", model="dslim/bert-base-NER")
extractor.warm_up()

# Register each component under the name the connections below refer to.
for component_name, component in (
    ('search', web_search),
    ('fetcher', link_content),
    ('htmldocument', html_to_doc),
    ('cleaner', document_cleaner),
    ('extractor', extractor),
):
    pipeline.add_component(name=component_name, instance=component)

# Wire the stages: search -> fetcher -> htmldocument -> cleaner -> extractor.
pipeline.connect("search.links", "fetcher.urls")
pipeline.connect("fetcher", "htmldocument")
pipeline.connect("htmldocument", "cleaner")
# Kept as the final bare expression so the cell still displays its return value.
pipeline.connect("cleaner", "extractor")


Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<haystack.core.pipeline.pipeline.Pipeline object at 0x30d70ef60>
🚅 Components
  - search: SerperDevWebSearch
  - fetcher: LinkContentFetcher
  - htmldocument: HTMLToDocument
  - cleaner: DocumentCleaner
  - extractor: NamedEntityExtractor
🛤️ Connections
  - search.links -> fetcher.urls (List[str])
  - fetcher.streams -> htmldocument.sources (List[ByteStream])
  - htmldocument.documents -> cleaner.documents (List[Document])
  - cleaner.documents -> extractor.documents (List[Document])

In [7]:
# The question is handed to the 'search' component as its `query` input.
query = "What can you tell me about the year of the dragon?"
search_inputs = {"search": {"query": query}}
output = pipeline.run(data=search_inputs)

In [8]:
# Take the documents produced by the 'extractor' component of the run.
extractor_output = output['extractor']
extracted_documents = extractor_output['documents']

In [None]:
import pandas as pd

# Function to extract uniquely identified named entities into a DataFrame with URL
def extract_named_entities_with_ids_and_url(documents):
    """Flatten the named-entity annotations of ``documents`` into a DataFrame.

    Each document must expose ``id``, ``content`` and a ``meta`` mapping.
    Annotations are read from ``meta['named_entities']`` (missing key means
    no annotations) and each one must provide ``start``, ``end``, ``entity``
    and ``score``. The source URL is read from ``meta['url']`` and falls back
    to ``'N/A'`` when absent.

    Args:
        documents: Iterable of document objects as described above.

    Returns:
        pandas.DataFrame with one row per annotation and the columns
        ``document_id``, ``word``, ``entity_type``, ``score`` and ``url``.
        The columns are present even when no annotation is found, so callers
        can filter or de-duplicate on them without hitting a ``KeyError``.
    """
    columns = ['document_id', 'word', 'entity_type', 'score', 'url']
    extracted_data = []
    for document in documents:
        content = document.content
        doc_id = document.id
        url = document.meta.get('url', 'N/A')  # Default to 'N/A' if URL is not available
        for entity in document.meta.get('named_entities', []):
            extracted_data.append({
                'document_id': doc_id,
                # Annotations carry character offsets; slice the text back out.
                'word': content[entity.start:entity.end],
                'entity_type': entity.entity,
                'score': float(entity.score),
                'url': url,
            })

    # Passing the column list keeps the schema stable when there are no rows.
    return pd.DataFrame(extracted_data, columns=columns)

# Build the entity table and keep the first row of every
# (word, entity_type, score) combination.
df_entities = extract_named_entities_with_ids_and_url(extracted_documents).drop_duplicates(
    subset=['word', 'entity_type', 'score']
)

In [45]:
# Preview the first five rows of the entity table.
df_entities.head(n=5)

Unnamed: 0,document_id,word,entity_type,score,url
0,eed2cf1f3e92c540b2842f908e54ae572448a45ff5b200...,Chinese New Year,MISC,0.870644,https://www.britannica.com/topic/Chinese-New-Year
1,eed2cf1f3e92c540b2842f908e54ae572448a45ff5b200...,Lunar New Year,MISC,0.915542,https://www.britannica.com/topic/Chinese-New-Year
2,eed2cf1f3e92c540b2842f908e54ae572448a45ff5b200...,China,LOC,0.918623,https://www.britannica.com/topic/Chinese-New-Year
3,eed2cf1f3e92c540b2842f908e54ae572448a45ff5b200...,Chinese,MISC,0.664121,https://www.britannica.com/topic/Chinese-New-Year
4,eed2cf1f3e92c540b2842f908e54ae572448a45ff5b200...,Lunar New Year,MISC,0.868886,https://www.britannica.com/topic/Chinese-New-Year


In [46]:
# Summary statistics of the entity table; `score` is the only column reported in the output.
df_entities.describe()

Unnamed: 0,score
count,98.0
mean,0.898904
std,0.140579
min,0.374084
25%,0.870194
50%,0.961658
75%,0.999651
max,0.999793


In [44]:
# Per entity type, count the rows whose score is strictly above 0.9.
high_confidence = df_entities.loc[df_entities['score'] > 0.9]
high_confidence.groupby("entity_type").count()

Unnamed: 0_level_0,document_id,word,score,url
entity_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LOC,5,5,5,5
MISC,55,55,55,55
ORG,6,6,6,6
PER,4,4,4,4
