# Basic setup

In [1]:
from dotenv import load_dotenv
import logging
import os
import sys

In [2]:
# Configure logging at INFO level with a timestamp, logger name and
# severity in front of every message, then grab a logger for this module.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

In [3]:
# Load environment variables from a .env file. A falsy result from
# load_dotenv() is reported as an error, but execution continues.
dotenv_found = load_dotenv()
if not dotenv_found:
    logger.error("No .env file found")

## Set up Haystack pipeline for indexing

### Basic imports for pipeline

In [4]:
from haystack_integrations.document_stores.pinecone import PineconeDocumentStore
from haystack.components.embedders import OpenAIDocumentEmbedder
from haystack.utils import Secret
from haystack.document_stores.types.policy import DuplicatePolicy
from haystack.components.writers import DocumentWriter
import os

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [5]:

def create_docstore(
    index: str = "archiefutrecht",
    dimension: int = 1536,
) -> PineconeDocumentStore:
    """Create the Pinecone document store used by the pipeline.

    The API key is supplied through ``Secret.from_env_var("PINECONE_API_KEY")``.

    Args:
        index: Name of the Pinecone index. Defaults to the previously
            hard-coded ``"archiefutrecht"``; pass another name (for example
            one read from ``.env``) to target a different index.
        dimension: Embedding dimension of the index. The default of 1536
            corresponds to the ``text-embedding-3-small`` model.

    Returns:
        A configured ``PineconeDocumentStore``.
    """
    return PineconeDocumentStore(
        api_key=Secret.from_env_var("PINECONE_API_KEY"),
        index=index,
        dimension=dimension,
    )

def create_document_embedder() -> OpenAIDocumentEmbedder:
    """Create the OpenAI document embedder (``text-embedding-3-small``).

    The API key is supplied through ``Secret.from_env_var("OPENAI_API_KEY")``.

    Returns:
        A configured ``OpenAIDocumentEmbedder``.
    """
    # Metadata fields listed here are embedded together with the document
    # text, which is handy when important metadata is generated. The list is
    # currently empty.
    embedded_meta_fields = []
    openai_key = Secret.from_env_var("OPENAI_API_KEY")
    return OpenAIDocumentEmbedder(
        api_key=openai_key,
        model="text-embedding-3-small",
        meta_fields_to_embed=embedded_meta_fields,
    )
    
def create_document_writer(docstore) -> DocumentWriter:
    """Create a ``DocumentWriter`` that writes into *docstore*.

    Args:
        docstore: The document store the writer should write to.

    Returns:
        A ``DocumentWriter`` configured with ``DuplicatePolicy.OVERWRITE``
        as its duplicate-handling policy.
    """
    duplicate_policy = DuplicatePolicy.OVERWRITE
    writer = DocumentWriter(document_store=docstore, policy=duplicate_policy)
    return writer


## Pipeline setup

In [6]:
from haystack import Pipeline
from haystack.components.converters import PyPDFToDocument
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from prompts import QUERY_REPHRASE_TEMPLATE, QUERY_ANSWER_TEMPLATE

In [None]:
def create_indexing_pipeline() -> Pipeline:
    """Build the pipeline that indexes PDF documents into Pinecone.

    The pipeline chains five components, each with its default settings
    unless a factory in this file configures it:

        converter (PyPDFToDocument)
          -> cleaner (DocumentCleaner)
          -> splitter (DocumentSplitter)
          -> embedder (create_document_embedder())
          -> writer (create_document_writer(create_docstore()))

    Only indexing components are built here. Query-side components (prompt
    builders, generators, a text embedder, a retriever) are not part of an
    indexing pipeline and are intentionally not created.

    Returns:
        A connected ``Pipeline``. Feed it the PDF sources through the
        ``"converter"`` component when calling ``Pipeline.run``.
    """
    pipeline = Pipeline()

    # Register the components in processing order.
    pipeline.add_component("converter", PyPDFToDocument())
    pipeline.add_component("cleaner", DocumentCleaner())
    pipeline.add_component("splitter", DocumentSplitter())
    pipeline.add_component("embedder", create_document_embedder())
    pipeline.add_component("writer", create_document_writer(create_docstore()))

    # Wire the "documents" output of each stage into the next stage. The
    # sockets are named explicitly because the embedder also emits "meta".
    pipeline.connect("converter.documents", "cleaner.documents")
    pipeline.connect("cleaner.documents", "splitter.documents")
    pipeline.connect("splitter.documents", "embedder.documents")
    pipeline.connect("embedder.documents", "writer.documents")

    return pipeline
    