In [3]:
import json
from typing import Any, Dict, List, Optional

from openai import OpenAI
from openai.types.chat import ChatCompletion, ChatCompletionMessage
from pydantic import BaseModel, Field, ValidationError
from haystack.dataclasses import ChatMessage, StreamingChunk

from haystack import  Document, Pipeline, component
from haystack.utils import Secret
from prompts import ANTHROPIC_DEFAULT_PROMPT
import logging
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [12]:
from openai import BadRequestError

def _convert_message_to_openai_format(message: ChatMessage) -> Dict[str, Any]:
    """Converts a ChatMessage to the format expected by OpenAI's API:
    
    Given a ChatMessage object, we extract its role (e.g., 'system', 'user', 'assistant')
    and content, and return a dictionary that matches the OpenAI API format.
    """
    return {"role": message.role, "content": message.content}


@component
class BaseOpenAIGenerator(object):
    """
    A base component that uses OpenAI's models to generate text completions.

    This component is model-agnostic and does not assume a chat-based model. If you need
    the conversational structure, it’s better to use the `OpenAIChatGenerator`.
    """

    def __init__(
        self,
        api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"),
        model: str = "gpt-4o",
        generation_kwargs: Optional[Dict[str, Any]] = None,
        system_prompt: Optional[str] = None,
    ):
        """
        :param api_key: The OpenAI API key to authenticate requests.
        :param model: Name of the OpenAI model to use (e.g., 'gpt-4o').
        :param generation_kwargs: Additional parameters passed to the OpenAI completion
            endpoint. For example, you can set `max_tokens` or `temperature` here.
        :param system_prompt: If using a chat model, this is the system-level prompt that
            guides the overall behavior and style of the responses before any user input.
        """
        self.api_key = api_key
        self.model = model
        self.generation_kwargs = generation_kwargs or {}
        self.system_prompt = system_prompt
        self.client = OpenAI(api_key=self.api_key.resolve_value())

    def _check_finish_reason(self, response: ChatMessage):
        """Check the finish reason returned by the OpenAI API.
        
        If the response finished because it hit the max_tokens limit ('length'), raise an error 
        to inform the user to adjust `max_tokens` or shorten the input.
        """
        if response.meta.get("finish_reason") == "length":
            raise ValueError(
                f"The completion ended due to the 'length' of the response. Consider increasing max_tokens."
            )

    @component.output_types(replies=List[str], meta=List[Dict[str, Any]])
    def run(
        self,
        prompt: str,
        generation_kwargs: Optional[Dict[str, Any]] = None,
        messages: Optional[List[ChatMessage]] = None,
    ):
        """Run the OpenAI generation call.
        
        This method:
        1. Merges provided generation_kwargs with the ones from initialization.
        2. Prepares messages in the OpenAI-specific format.
        3. Calls the OpenAI chat/completions endpoint.
        4. Returns structured responses.
        """
        generation_kwargs = {**self.generation_kwargs, **(generation_kwargs or {})}
        logging.debug(f"OpenAIGenerator - Running with kwargs: {generation_kwargs}, messages: {messages}")

        # If messages are provided, convert them to the OpenAI format directly.
        if messages:
           openai_formatted_messages = [_convert_message_to_openai_format(msg) for msg in messages]
        else:
            # If no messages, create them from the prompt and optionally prepend a system prompt.
            message = ChatMessage.from_user(prompt)
            if self.system_prompt:
                messages = [ChatMessage.from_system(self.system_prompt), message]
            else:
                messages = [message]
            openai_formatted_messages = [_convert_message_to_openai_format(msg) for msg in messages]

        # Make the OpenAI API call
        completion: ChatCompletion = self.client.chat.completions.create(
            model=self.model,
            messages=openai_formatted_messages,
            **generation_kwargs
        )
        logging.debug(f"OpenAIGenerator - OpenAI API response: {completion}")

        # Build the structured message(s) from the OpenAI response
        completions = [self._build_structured_message(completion, choice) for choice in completion.choices]
        
        # Check if any response hit the 'length' finish_reason
        for response in completions:
            self._check_finish_reason(response)

        # Return the content and metadata of the responses
        return {
            "replies": [message.content for message in completions],
            "meta": [message.meta for message in completions],
        }

    def _build_structured_message(self, completion: Any, choice: Any) -> ChatMessage:
        """Build a structured ChatMessage from a single OpenAI API completion choice.
        
        Extracts the text from the assistant message and attaches metadata such as:
        - model used
        - index of the choice
        - finish reason
        - usage (tokens, etc.)
        """
        chat_message = ChatMessage.from_assistant(choice.message.content or "")
        chat_message.meta.update(
            {
                "model": completion.model,
                "index": choice.index,
                "finish_reason": choice.finish_reason,
                "usage": dict(completion.usage),
            }
        )
        return chat_message


class Metadata(BaseModel):
    # A Pydantic model describing the expected structured metadata.
    # This model is used to validate the JSON that the OpenAI model returns.
    PrimaryQuestion: str = Field(description="De belangrijkste vraag die dit item beantwoordt.")
    PrimaryTheme: str = Field(description="Het hoofdthema waar dit item bij hoort.")
    SecondaryThemes: List[str] = Field(description="Specifieke subthema's die het hoofdthema verfijnen.")
    Entities: List[str] = Field(description="Belangrijke personen, plaatsen of objecten die aan het item zijn gekoppeld.")
    EntityRelationships: List[str] = Field(description="Beschrijvingen van relaties tussen entiteiten.")
    TimePeriod: str = Field(description="De historische periode of datum die met het item wordt geassocieerd.")
    Location: str = Field(description="De geografische context die bij het item hoort.")
    AssetType: str = Field(description="Het type item (bijv. Brief, Foto, Kaart).")
    StorylineDimension: str = Field(description="Dominante narratieve structuur (Chronologisch, Entiteitgericht, Emotiegedreven).")
    NarrativeFocus: str = Field(description="Hoe het item bijdraagt aan het verhaal of narratief.")
    Keywords: List[str] = Field(description="Extra zoekwoorden of tags voor betere zoekresultaten.")
    ExplorationTags: List[str] = Field(description="Gerelateerde thema's of onderwerpen voor verdere verkenning.")
    FollowUpQuestionTags: List[str] = Field(description="Tags voor het genereren van vervolgvragen.")
    Summary: Optional[str] = Field(description="Een korte samenvatting van de volledige tekst.", default=None)
    Sender: Optional[str] = Field(description="Naam van de afzender.", default=None)
    Recipient: Optional[str] = Field(description="Naam van de ontvanger.", default=None)
    DateSent: Optional[str] = Field(description="Datum waarop de brief is verstuurd.", default=None)
    LetterType: Optional[str] = Field(description="Type brief (bijv. Persoonlijk, Officieel).", default=None)
    ContentSummary: Optional[str] = Field(description="Korte samenvatting van de inhoud van de brief.", default=None)
    Scale: Optional[str] = Field(description="Schaal van de kaart (bijv. 1:5000).", default=None)
    MapFeatures: Optional[str] = Field(description="Opvallende kenmerken, zoals gebouwen, grenzen, rivieren.", default=None)
    DateCreated: Optional[str] = Field(description="Datum waarop de kaart is gemaakt.", default=None)
    LocationCovered: Optional[str] = Field(description="Gebieden of plaatsen die op de kaart worden weergegeven.", default=None)
    Photographer: Optional[str] = Field(description="Naam van de fotograaf.", default=None)
    DateTaken: Optional[str] = Field(description="Datum waarop de foto is genomen.", default=None)
    Event: Optional[str] = Field(description="Gebeurtenis die op de foto is vastgelegd.", default=None)
    PeopleInPhoto: Optional[str] = Field(description="Namen van personen op de foto.", default=None)
    ArticleTopic: Optional[str] = Field(description="Onderwerp of thema van het artikel.", default=None)
    Author: Optional[str] = Field(description="Naam van de auteur.", default=None)
    PublicationDate: Optional[str] = Field(description="Datum van publicatie.", default=None)
    Source: Optional[str] = Field(description="Naam van het tijdschrift of de bron.", default=None)


@component
class OpenAIGenerator(BaseOpenAIGenerator):
    @component.output_types(replies=List[str], meta=List[Dict[str, Any]], structured_reply=BaseModel)
    def run(
        self,
        prompt: str,
        generation_kwargs: Optional[Dict[str, Any]] = None,
        messages: Optional[List[ChatMessage]] = None,
    ):
        """An extension of BaseOpenAIGenerator that optionally returns a structured reply.
        
        If a 'response_format' is specified in generation_kwargs, it indicates we expect a JSON 
        structured output. Otherwise, it falls back to BaseOpenAIGenerator's behavior.
        """
        generation_kwargs = {**self.generation_kwargs, **(generation_kwargs or {})}
        
        # If user tries to pass an image parameter, it's not supported here.
        if "image" in generation_kwargs.keys():
            raise ValueError("The 'image' parameter is not supported by the OpenAIGenerator component")

        # If a response_format is given, we assume we want structured JSON output.
        if "response_format" in generation_kwargs.keys():
            system_prompt = ChatMessage.from_system(ANTHROPIC_DEFAULT_PROMPT)
            message = ChatMessage.from_user(prompt)
            messages = [system_prompt, message]
            openai_formatted_messages = [_convert_message_to_openai_format(msg) for msg in messages]
            try:
                completion: ChatCompletion = self.client.chat.completions.create(
                    model=self.model,
                    messages=openai_formatted_messages,
                    **generation_kwargs
                )
            except BadRequestError as e:
                logging.error(f"OpenAIGenerator - BadRequestError: {e} \n\n for prompt {prompt}")
                return {"replies": [], "meta": [], "structured_reply": {}}
           
            completions = [self._build_structured_message(completion, choice) for choice in completion.choices]
            for response in completions:
                self._check_finish_reason(response)
            
            # The first structured reply is returned here as a single object.
            return {
                "replies": [message.content for message in completions],
                "meta": [message.meta for message in completions],
                "structured_reply": completions[0].content
            }
        else:
            # Otherwise, just run the base logic.
            return super().run(prompt, generation_kwargs, messages)

    def _build_structured_message(self, completion: Any, choice: Any) -> ChatMessage:
        """Build a structured message from the completion, as above, attaching metadata."""
        chat_message = ChatMessage.from_assistant(choice.message.content or "")
        chat_message.meta.update(
            {
                "model": completion.model,
                "index": choice.index,
                "finish_reason": choice.finish_reason,
                "usage": dict(completion.usage),
            }
        )
        return chat_message


@component
class MetadataEnricher:
    """A component that processes a list of Documents and enriches their metadata.

    This component:
    1. Classifies each document as an image or text based on 'VorT' field in metadata.
    2. Prepares a suitable prompt and messages for OpenAI.
    3. Calls OpenAI (using OpenAIGenerator or directly) to get a structured JSON response.
    4. Validates and merges the returned JSON into the document's metadata.
    """

    def __init__(self, metadata_model: BaseModel, prompt: str = ANTHROPIC_DEFAULT_PROMPT):
        """
        :param metadata_model: A Pydantic model (e.g., Metadata) that defines expected fields in the response.
        :param prompt: The system prompt to guide the OpenAI model on how to structure or transform metadata.
        """
        self.metadata_model = metadata_model
        self.metadata_prompt = prompt
        self.pipeline = Pipeline()
        

    def _create_openai_message(self, document: Document, master_prompt: str):
        """Construct OpenAI messages depending on the document type (image or text).
        
        - If the document is visual (e.g., photo), prepare a message including an image URL.
        - If the document is textual, prepare a message including the textual content.
        
        Returns:
            messages (List[ChatMessage]): The constructed messages for the OpenAI call.
            doc_type (str): "image" or "text" to indicate how we process the doc.
        """
        meta = document.meta
        url = meta.get("link", "")
        beschrijving = meta.get("beschrijving", "")
        title = meta.get("Title", "")
        extra_info = meta.get("Context en beschrijving (aan elkaar geplakt)", "")
        extra_info += meta.get("Extra info (informatie\n uit velden)", "")
        metadata_for_prompt = f"URL: {url}\nTitel: {title}\nBeschrijving: {beschrijving}, andere data {extra_info}\n"
        logging.debug(f"MetadataEnricher - Metadata for prompt: {metadata_for_prompt}")

        # Check if the doc is visual or textual
        if meta.get("VorT") == "visueel":
            image_url = meta.get("representatieve\nafbeelding")
            if not image_url:
                logging.warning(f"MetadataEnricher - No image URL found for document: {document.meta.get('ID')}")
                return None, None
            
            # Prepare messages that ask about the image content
            messages = [
                ChatMessage.from_system(master_prompt),
                ChatMessage.from_user(metadata_for_prompt),
                ChatMessage.from_user(
                 [
                    {"type": "text", "text": "What's in this image?"},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": image_url,
                        }
                    },
                 ]
                )
            ]
            logging.debug(f"MetadataEnricher - Created image messages for document: {document.meta.get('ID')}")
            return messages, "image"

        else:
            # If textual, we send the text content along with the metadata
            content = document.content
            messages = [
                ChatMessage.from_system(master_prompt),
                ChatMessage.from_user(metadata_for_prompt),
                ChatMessage.from_user(f"Inhoud van het item: {content}")
            ]
            logging.debug(f"MetadataEnricher - Created text messages for document: {document.meta.get('ID')}")
            return messages, "text"


    def _process_document(self, document: Document):
        """Processes a single document and enriches it using the OpenAI API.
        
        Steps:
        1. Create the message prompt set for the document.
        2. If the document is an image, call OpenAI directly with the image URL.
        3. If the document is text, use the OpenAIGenerator pipeline.
        4. Parse the returned JSON and update the document's metadata.
        """
        logging.debug(f"MetadataEnricher - Processing document: {document.meta.get('ID')}")
        messages, doc_type = self._create_openai_message(document, self.metadata_prompt)
        if messages is None:
            logging.warning(f"MetadataEnricher - No messages created for document: {document.meta.get('ID')}")
            return document

        llm_kwargs = {"response_format": {"type": "json_object"}}

        # If image: directly call the OpenAI API client
        if doc_type == "image":
            llm_kwargs["max_tokens"] = 1500
            logging.debug(f"MetadataEnricher - Processing as image, llm_kwargs: {llm_kwargs}")
            
            openai_formatted_messages = [_convert_message_to_openai_format(msg) for msg in messages]
            try:
                completion: ChatCompletion = OpenAI(api_key=Secret.from_env_var("OPENAI_API_KEY").resolve_value()).chat.completions.create(
                    model="gpt-4o",
                    messages=openai_formatted_messages,
                    **llm_kwargs
                )
            except BadRequestError as e:
                logging.error(f"MetadataEnricher - BadRequestError: {e} for document {document.meta.get('ID')}")
                return document

            # Parse JSON output from completion
            if completion.choices:
                reply = completion.choices[0].message.content
                logging.debug(f"MetadataEnricher - Raw LLM reply for image document: {reply}")
                try:
                    metadata = json.loads(reply)
                    logger.info("-------------------------------------------------------")
                    logger.info(metadata)
                    document.meta.update(metadata)
                    logging.debug(f"MetadataEnricher - Successfully updated metadata for image document: {document.meta.get('ID')}")
                except json.JSONDecodeError as e:
                    logging.error(f"MetadataEnricher - JSONDecodeError: {e} for document {document.meta.get('ID')}")
                except ValidationError as e:
                    logging.error(f"MetadataEnricher - Pydantic validation error: {e} for document {document.meta.get('ID')}")
                except Exception as e:
                    logging.error(f"MetadataEnricher - Unexpected error: {e} for document {document.meta.get('ID')}")
                return document
            logging.warning(f"MetadataEnricher - No completion choices for image document: {document.meta.get('ID')}")
            return document
        
        else:
            # If text: use the pipeline with the OpenAIGenerator component
            logging.debug(f"MetadataEnricher - Processing as text, llm_kwargs: {llm_kwargs}")
            llm = OpenAIGenerator(generation_kwargs=llm_kwargs)
            self.pipeline.add_component(name="llm", instance=llm)
            # We provide the last user message as the prompt and all preceding messages as context.
            result = self.pipeline.run(data={"llm": {"prompt": messages[-1].content, "messages": messages[:-1]}})
            metadata_str = result['llm']['replies'][0]
            logging.debug(f"MetadataEnricher - Raw LLM reply for text document: {metadata_str}")

            # Parse JSON and update metadata
            try:
                metadata = json.loads(metadata_str)
                document.meta.update(metadata)
                logging.debug(f"MetadataEnricher - Successfully updated metadata for text document: {document.meta.get('ID')}")
            except json.JSONDecodeError as e:
                logging.error(f"MetadataEnricher - JSONDecodeError: {e} for document {document.meta.get('ID')}")
            except ValidationError as e:
                logging.error(f"MetadataEnricher - Pydantic validation error: {e} for document {document.meta.get('ID')}")
            except Exception as e:
                logging.error(f"MetadataEnricher - Unexpected error: {e} for document {document.meta.get('ID')}")
            self.pipeline.remove_component("llm")
            return document


    @component.output_types(documents=List[Document])
    def run(self, documents: List[Document]):
        """Main entry point of the MetadataEnricher component.
        
        Given a list of documents, concurrently process each one:
        1. Determine if it's image or text.
        2. Create and send prompts to OpenAI.
        3. Parse and apply the returned JSON metadata.
        4. Return the enriched documents.
        """
        logging.debug(f"MetadataEnricher - Running with {len(documents)} documents")
        documents_with_meta = []
        import concurrent.futures
        with concurrent.futures.ThreadPoolExecutor() as executor:
            # Process documents in parallel for efficiency
            futures = [executor.submit(self._process_document, document) for document in documents]
            for future in concurrent.futures.as_completed(futures):
                documents_with_meta.append(future.result())

        return {"documents": documents_with_meta}


DEBUG:haystack.core.component.component:Registering <class '__main__.BaseOpenAIGenerator'> as a component
DEBUG:haystack.core.component.component:Component __main__.BaseOpenAIGenerator is already registered. Previous imported from '<class '__main__.BaseOpenAIGenerator'>',                 new imported from '<class '__main__.BaseOpenAIGenerator'>'


DEBUG:haystack.core.component.component:Registered Component <class '__main__.BaseOpenAIGenerator'>
DEBUG:haystack.core.component.component:Registering <class '__main__.OpenAIGenerator'> as a component
DEBUG:haystack.core.component.component:Component __main__.OpenAIGenerator is already registered. Previous imported from '<class '__main__.OpenAIGenerator'>',                 new imported from '<class '__main__.OpenAIGenerator'>'
DEBUG:haystack.core.component.component:Registered Component <class '__main__.OpenAIGenerator'>
DEBUG:haystack.core.component.component:Registering <class '__main__.MetadataEnricher'> as a component
DEBUG:haystack.core.component.component:Component __main__.MetadataEnricher is already registered. Previous imported from '<class '__main__.MetadataEnricher'>',                 new imported from '<class '__main__.MetadataEnricher'>'
DEBUG:haystack.core.component.component:Registered Component <class '__main__.MetadataEnricher'>


In [5]:
with open('../data/prototyping/new.json', 'r') as file:
    data = json.load(file)
    

In [6]:
docs = []   
for item in data:
    if item.get('VorT') == 'visueel':
        item['Full text'] = item.get('Beschrijving', '')
    item_meta = {k: v for k, v in item.items() if k != 'Full text'}
    docs.append(Document(content=item['Full text'], meta=item_meta))

In [7]:
valid_docs = [doc for doc in docs if doc.content or doc.meta.get('VorT') == 'visueel']
exclusive_visual = [doc for doc in valid_docs if doc.meta.get('VorT') == 'visueel']

In [8]:
import concurrent

from tqdm import tqdm


def get_openai_client() -> OpenAI:
    return OpenAI()

def translate(text: str) -> str:
    client = get_openai_client()
    response = client.beta.chat.completions.parse(
        model="gpt-4o-mini-2024-07-18",
        messages=[
            {"role": "system", "content": "Je bent een expert in het vertalen van teksten van oud-nederlands naar Nederland. Je krijgt een collectie aan HTR teksten die je meot vertalen naar het hedendaags Nederlands. Het kan zijn dat er stukken tekst bij zitten die niet volledig te lezen zijn, of niet volledig te bevatten zijn. In dat geval kan je het laten staan zonder bewerking. Geef ENKEL de vertaling terug, geen 'alsjeblieft', 'hier', etc. "},
            {"role": "system", "content": "Als er veel typfouten in zitten zeg je hier niks van. Laat de tekst onbewerkt en probeer de tekst die wel leesbaar is zo goed mogelijk te vertalen. Vervang onleesbaar met ___(onleesbaar)___. Vervang verder NOOIT een stuk tekst met een placeholder."},
            {"role": "system", "content": "Als je een input krijgt, vertaal je deze en zeg je verder niks anders. Behoud de tekst zoals het is, vervang geen stukken, geen interpretaties, geen opvulling, geen [...], (hetzelfde) of andere opvullende tekens behalve bij onleesbare tekst. De tekst blijft zo origineel mogelijk"},
            {"role": "user", "content": f"{text}"},
        ],
    )
    return response.choices[0].message.content

@component
class DocumentTranslator:
    @component.output_types(docs=List[Document])
    def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
        translated_documents = []
        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
            futures = []
            for document in documents:
                current_text = document.content
                future = executor.submit(translate, current_text)
                futures.append((document, future))

            for document, future in tqdm(futures):
                translated = future.result()
                document.content = translated
                translated_documents.append(document)
        return {"docs": translated_documents}



DEBUG:haystack.core.component.component:Registering <class '__main__.DocumentTranslator'> as a component
DEBUG:haystack.core.component.component:Registered Component <class '__main__.DocumentTranslator'>


In [9]:
from haystack_integrations.document_stores.pinecone import PineconeDocumentStore
from haystack.components.embedders import OpenAIDocumentEmbedder
from haystack.utils import Secret
from haystack.document_stores.types.policy import DuplicatePolicy
from haystack.components.writers import DocumentWriter
import os
from haystack import Pipeline
from haystack.components.converters import PyPDFToDocument
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter


def create_docstore() -> PineconeDocumentStore:
    return PineconeDocumentStore(
        api_key=Secret.from_env_var("PINECONE_API_KEY"),
        index="archiefutrecht-vertaald", # is nu statisch, raad aan gewoon in .env te zetten
        dimension=1536, # text-embedding-3-small
    )

def create_document_embedder() -> OpenAIDocumentEmbedder:
    return OpenAIDocumentEmbedder(
        model="text-embedding-3-small",
        api_key=Secret.from_env_var("OPENAI_API_KEY"),
        meta_fields_to_embed=[
    "Title", "Description", "PrimaryQuestion", "PrimaryTheme", 
    "SecondaryThemes", "Entities", "EntityRelationships", "TimePeriod", 
    "Location", "AssetType", "StorylineDimension", "NarrativeFocus", 
    "Keywords", "ExplorationTags", "FollowUpQuestionTags", "FullText", 
    "Summary", "Sender", "Recipient", "DateSent", "LetterType", 
    "ContentSummary", "Scale", "MapFeatures", "DateCreated", 
    "LocationCovered", "Photographer", "DateTaken", "Event", 
    "PeopleInPhoto", "ArticleTopic", "Author", "PublicationDate", "Source"
] # Zorgt ervoor dat niet alleen tekst in embedding wordt meegenomen maar ook gespecificeerde metadata. Vet handig voor als je belangrijke metadata genereert.
    )
    
def create_document_writer(docstore) -> DocumentWriter:
    return DocumentWriter(document_store=docstore, policy=DuplicatePolicy.OVERWRITE) 


DEBUG:haystack.core.component.component:Registering <class 'haystack.components.embedders.azure_document_embedder.AzureOpenAIDocumentEmbedder'> as a component
DEBUG:haystack.core.component.component:Registered Component <class 'haystack.components.embedders.azure_document_embedder.AzureOpenAIDocumentEmbedder'>
DEBUG:haystack.core.component.component:Registering <class 'haystack.components.embedders.azure_text_embedder.AzureOpenAITextEmbedder'> as a component
DEBUG:haystack.core.component.component:Registered Component <class 'haystack.components.embedders.azure_text_embedder.AzureOpenAITextEmbedder'>
DEBUG:haystack.core.component.component:Registering <class 'haystack.components.embedders.hugging_face_api_document_embedder.HuggingFaceAPIDocumentEmbedder'> as a component
DEBUG:haystack.core.component.component:Registered Component <class 'haystack.components.embedders.hugging_face_api_document_embedder.HuggingFaceAPIDocumentEmbedder'>
DEBUG:haystack.core.component.component:Registering 

In [11]:
test = Pipeline()


test.add_component("splitter", DocumentSplitter(split_by="sentence", split_length=15, split_overlap=3))
test.add_component("metadata", MetadataEnricher(metadata_model=Metadata))
test.add_component("embedder", create_document_embedder())
test.add_component("writer", create_document_writer(create_docstore()))
# test.add_component("translator", DocumentTranslator())
# test.connect("splitter", "translator")
# test.connect("translator", "metadata")
test.connect("splitter", "metadata")
test.connect("metadata", "embedder")
test.connect("embedder", "writer")

docs = test.run(data={"splitter": {"documents": valid_docs}}, include_outputs_from=["metadata"])

DEBUG:haystack.core.pipeline.base:Adding component 'splitter' (<haystack.components.preprocessors.document_splitter.DocumentSplitter object at 0x000001ED3F069190>

Inputs:
  - documents: List[Document]
Outputs:
  - documents: List[Document])
DEBUG:haystack.core.pipeline.base:Adding component 'metadata' (<__main__.MetadataEnricher object at 0x000001ED40D6D510>

Inputs:
  - documents: List[Document]
Outputs:
  - documents: List[Document])
DEBUG:haystack.core.pipeline.base:Adding component 'embedder' (<haystack.components.embedders.openai_document_embedder.OpenAIDocumentEmbedder object at 0x000001ED40F27690>

Inputs:
  - documents: List[Document]
Outputs:
  - documents: List[Document]
  - meta: Dict[str, Any])
DEBUG:haystack.core.pipeline.base:Adding component 'writer' (<haystack.components.writers.document_writer.DocumentWriter object at 0x000001ED411D1550>

Inputs:
  - documents: List[Document]
  - policy: Optional[DuplicatePolicy]
Outputs:
  - documents_written: int)
DEBUG:haystack.cor

ValueError: A component named 'llm' already exists in this pipeline: choose another name.

In [None]:
docs

{'embedder': {'meta': {'model': 'text-embedding-3-small',
   'usage': {'prompt_tokens': 659491, 'total_tokens': 659491}}},
 'writer': {'documents_written': 360},
 'metadata': {'documents': [Document(id=7a8c11d14008b4d553fb50426115ede769f682112439b2074ea6331c6d639129, content: 'Extracten.
   Veneris den 12e. Junij 1665.
   Ontfangen een missiue van Hr. van Ameronge, Exs. minister va...', meta: {'Keuze': 'JA', 'ID': 722144, 'AET_ID': 4, 'num_scans': 186, 'invnr': 1001.2853, 'GUID': '609C5B9956E34642E0534701000A17FD', 'beschrijving': 'Nouvelles", ingekomen berichten van waarnemers over plaatselijke en militaire ontwikkelingen, 1659-1691 2853.  1659, 1663, 1665 april-aug', 'link': 'https://hetutrechtsarchief.nl/collectie/609C5B9956E34642E0534701000A17FD', 'representatieve\nafbeelding': 'https://proxy.archieven.nl/large/39/609C5C2FF2A64642E0534701000A17FD', 'Thumb': '', 'Soort / brontype': 'Nouvelles', 'Bron subtype': '', 'Personen': 'Waarnemers (onbekende personen genoemd in "Nouvelles").'