# Basic setup

In [1]:
from dotenv import load_dotenv
import logging
import os
import sys

In [2]:
# Configure logging once for the whole notebook: INFO and above, with
# timestamp, logger name and level in front of every message.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Logger used by the cells below, named after the current module.
logger = logging.getLogger(__name__)

In [3]:
# Load environment variables from a .env file and log an error when
# load_dotenv() reports that nothing was loaded.
env_file_loaded = load_dotenv()
if not env_file_loaded:
    logger.error("No .env file found")

## Set up Haystack pipeline for querying

### Basic imports for pipeline

In [4]:
from haystack_integrations.document_stores.pinecone import PineconeDocumentStore
from haystack.components.embedders import OpenAIDocumentEmbedder
from haystack.utils import Secret
from haystack.document_stores.types.policy import DuplicatePolicy
from haystack.components.writers import DocumentWriter
import os

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [5]:

def create_docstore(index: str = "archiefutrecht", dimension: int = 1536) -> PineconeDocumentStore:
    """Build the Pinecone document store.

    Args:
        index: Name of the Pinecone index. Defaults to "archiefutrecht", the
            value that used to be hard-coded here, so calls without arguments
            behave exactly as before.
        dimension: Dimension passed to the store. The default of 1536 is the
            value previously hard-coded for text-embedding-3-small.

    Returns:
        A PineconeDocumentStore whose API key is taken from the
        PINECONE_API_KEY environment variable via Secret.from_env_var.
    """
    return PineconeDocumentStore(
        api_key=Secret.from_env_var("PINECONE_API_KEY"),
        index=index,
        dimension=dimension,
    )

def create_document_embedder() -> OpenAIDocumentEmbedder:
    """Create the OpenAI document embedder (model text-embedding-3-small).

    The API key is taken from the OPENAI_API_KEY environment variable via
    Secret.from_env_var.

    Returns:
        An OpenAIDocumentEmbedder with an empty meta_fields_to_embed list.
    """
    openai_key = Secret.from_env_var("OPENAI_API_KEY")
    # meta_fields_to_embed makes the embedding cover the listed metadata
    # fields in addition to the text; handy once important metadata is
    # generated. The list is empty for now.
    return OpenAIDocumentEmbedder(
        api_key=openai_key,
        model="text-embedding-3-small",
        meta_fields_to_embed=[],
    )
    
def create_document_writer(docstore) -> DocumentWriter:
    """Create a DocumentWriter for the given document store.

    Args:
        docstore: The document store the writer is bound to.

    Returns:
        A DocumentWriter configured with DuplicatePolicy.OVERWRITE.
    """
    writer = DocumentWriter(
        policy=DuplicatePolicy.OVERWRITE,
        document_store=docstore,
    )
    return writer


## Pipeline setup

In [6]:
from haystack.components.builders import PromptBuilder
from haystack.components.generators import OpenAIGenerator
from haystack.components.embedders import OpenAITextEmbedder
from prompts import QUERY_REPHRASE_TEMPLATE, QUERY_ANSWER_TEMPLATE
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.converters import PyPDFToDocument
from haystack import Pipeline
from haystack.components.converters import OutputAdapter
from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [7]:
def create_openai_generator() -> OpenAIGenerator:
    """Create an OpenAIGenerator with its default settings.

    Returns:
        A new OpenAIGenerator instance.
    """
    # The instance has to be returned: previously it was constructed and
    # discarded, so every caller received None.
    return OpenAIGenerator()
    
# This embedder has no meta fields because only the question is embedded for
# the similarity search.
# NOTE(review): a function with this name is also defined in an earlier cell;
# this definition replaces it once the cell runs. The comment above talks
# about embedding the question, while create_qa_pipeline embeds the query with
# create_text_embedder -- confirm whether this redefinition is still needed.
def create_document_embedder() -> OpenAIDocumentEmbedder:
    """Create an OpenAI document embedder (model text-embedding-3-small).

    Returns:
        An OpenAIDocumentEmbedder whose API key is taken from the
        OPENAI_API_KEY environment variable via Secret.from_env_var.
    """
    openai_key = Secret.from_env_var("OPENAI_API_KEY")
    return OpenAIDocumentEmbedder(api_key=openai_key, model="text-embedding-3-small")
    
def create_text_embedder() -> OpenAITextEmbedder:
    """Create the OpenAI text embedder (model text-embedding-3-small).

    Returns:
        An OpenAITextEmbedder whose API key is taken from the OPENAI_API_KEY
        environment variable via Secret.from_env_var.
    """
    openai_key = Secret.from_env_var("OPENAI_API_KEY")
    embedder = OpenAITextEmbedder(api_key=openai_key, model="text-embedding-3-small")
    return embedder
    
def create_docstore(index: str = "archiefutrecht", dimension: int = 1536) -> PineconeDocumentStore:
    """Build the Pinecone document store.

    NOTE(review): a function with this name is also defined in an earlier
    cell; this definition replaces it once the cell runs. Consider keeping a
    single definition.

    Args:
        index: Name of the Pinecone index. Defaults to "archiefutrecht", the
            value that used to be hard-coded here, so calls without arguments
            behave exactly as before.
        dimension: Dimension passed to the store. The default of 1536 is the
            value previously hard-coded for text-embedding-3-small.

    Returns:
        A PineconeDocumentStore whose API key is taken from the
        PINECONE_API_KEY environment variable via Secret.from_env_var.
    """
    return PineconeDocumentStore(
        api_key=Secret.from_env_var("PINECONE_API_KEY"),
        index=index,
        dimension=dimension,
    )
    
def create_pinecone_retriever() -> PineconeEmbeddingRetriever:
    """Create an embedding retriever on top of the Pinecone document store.

    Returns:
        A PineconeEmbeddingRetriever bound to the store returned by
        create_docstore().
    """
    store = create_docstore()
    return PineconeEmbeddingRetriever(document_store=store)
    
def create_llm_output_adapter() -> OutputAdapter:
    """Create the adapter that turns an LLM's reply list into a single string.

    Returns:
        An OutputAdapter whose template selects element 0 of ``replies`` and
        whose declared output type is ``str``.
    """
    first_reply_template = "{{ replies [0] }}"
    return OutputAdapter(output_type=str, template=first_reply_template)

In [8]:
def create_qa_pipeline() -> Pipeline:
    """Assemble the question-answering pipeline.

    Wiring, in connection order:
    query_rephrase_builder -> rephrase_llm -> rephrase_output_adapter ->
    question_embedder -> pinecone_retriever -> answer_builder -> answer_llm.

    NOTE(review): nothing in this wiring feeds the user's original question
    into answer_builder; if QUERY_ANSWER_TEMPLATE expects it, it has to be
    supplied at run time -- confirm against the template.

    Returns:
        The Pipeline with all seven components registered and connected.
    """
    qa_pipeline = Pipeline()

    # Instantiate every component up front.
    rephrase_prompt = PromptBuilder(template=QUERY_REPHRASE_TEMPLATE)
    answer_prompt = PromptBuilder(template=QUERY_ANSWER_TEMPLATE)
    rephrase_generator = OpenAIGenerator()
    answer_generator = OpenAIGenerator()
    reply_adapter = create_llm_output_adapter()
    query_embedder = create_text_embedder()
    retriever = create_pinecone_retriever()

    # Register the components under the names used in the wiring below.
    registrations = (
        ("query_rephrase_builder", rephrase_prompt),
        ("rephrase_output_adapter", reply_adapter),
        ("answer_builder", answer_prompt),
        ("rephrase_llm", rephrase_generator),
        ("answer_llm", answer_generator),
        ("question_embedder", query_embedder),
        ("pinecone_retriever", retriever),
    )
    for component_name, component in registrations:
        qa_pipeline.add_component(component_name, component)

    # (sender, receiver) pairs; only the embedder -> retriever link names
    # explicit sockets, the others leave socket matching to the pipeline.
    links = (
        ("query_rephrase_builder", "rephrase_llm"),
        ("rephrase_llm", "rephrase_output_adapter"),
        ("rephrase_output_adapter", "question_embedder"),
        ("question_embedder.embedding", "pinecone_retriever.query_embedding"),
        ("pinecone_retriever", "answer_builder"),
        ("answer_builder", "answer_llm"),
    )
    for sender, receiver in links:
        qa_pipeline.connect(sender, receiver)

    return qa_pipeline
    

In [None]:
query = "Wat is amerongen?"

# Keep the pipeline object and the run result under separate names: the
# chained assignment used before bound `pipeline` to the output of run()
# instead of the Pipeline itself.
pipeline = create_qa_pipeline()
response = pipeline.run(data={
    "query_rephrase_builder": {"query": query}
})

2024-12-18 12:24:01,526 - haystack.core.pipeline.pipeline - INFO - Running component query_rephrase_builder
2024-12-18 12:24:01,527 - haystack.core.pipeline.pipeline - INFO - Running component rephrase_llm
2024-12-18 12:24:02,349 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-12-18 12:24:02,353 - haystack.core.pipeline.pipeline - INFO - Running component rephrase_output_adapter
2024-12-18 12:24:02,355 - haystack.core.pipeline.pipeline - INFO - Running component question_embedder
2024-12-18 12:24:02,795 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-12-18 12:24:02,797 - haystack.core.pipeline.pipeline - INFO - Running component pinecone_retriever
2024-12-18 12:24:02,799 - pinecone_plugin_interface.logging - INFO - Discovering subpackages in _NamespacePath(['e:\\programming\\HUA-rag\\.venv\\Lib\\site-packages\\pinecone_plugins'])
2024-12-18 12:24:02,799 - pinecone_plugin_interface.logg