## Generic imports

In [14]:
from dotenv import load_dotenv
import logging
import os
import sys

In [15]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

In [16]:
if not load_dotenv():
    logger.error("No .env file found")

## Setup Haystack pipeline for indexing

### Basic imports for pipeline

In [17]:
from haystack_integrations.document_stores.pinecone import PineconeDocumentStore
from haystack.components.embedders import OpenAIDocumentEmbedder
from haystack.utils import Secret
from haystack.document_stores.types.policy import DuplicatePolicy
from haystack.components.writers import DocumentWriter
import os

### Creating some classes
Haystack does not allow repeated use of the same instance in its pipelines. To combat, I readily create methods here to instantiate all (or some) classes required in my pipeline

In [18]:

def create_docstore() -> PineconeDocumentStore:
    return PineconeDocumentStore(
        api_key=Secret.from_env_var("PINECONE_API_KEY"),
        index="archiefutrecht", # is nu statisch, raad aan gewoon in .env te zetten
        dimension=1536, # text-embedding-3-small
    )

def create_document_embedder() -> OpenAIDocumentEmbedder:
    return OpenAIDocumentEmbedder(
        model="text-embedding-3-small",
        api_key=Secret.from_env_var("OPENAI_API_KEY"),
        meta_fields_to_embed=[] # Zorgt ervoor dat niet alleen tekst in embedding wordt meegenomen maar ook gespecificeerde metadata. Vet handig voor als je belangrijke metadata genereert.
    )
    
def create_document_writer(docstore) -> DocumentWriter:
    return DocumentWriter(document_store=docstore, policy=DuplicatePolicy.OVERWRITE) 


### Pipeline

In [19]:
from haystack import Pipeline
from haystack.components.converters import PyPDFToDocument
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter

In [20]:
def create_indexing_pipeline() -> Pipeline:
    pipeline = Pipeline()
    
    converter = PyPDFToDocument()
    cleaner = DocumentCleaner()
    splitter = DocumentSplitter(split_by="sentence", split_length=3)
    # enricher = DocumentEnricher() Is for later when we have a defined set of metadata I can generate
    embedder = create_document_embedder()
    writer = create_document_writer(create_docstore())
    
    pipeline.add_component("converter", converter)
    pipeline.add_component("cleaner", cleaner)
    pipeline.add_component("splitter", splitter)
    # pipeline.add_component("enricher", enricher)
    pipeline.add_component("embedder", embedder)
    pipeline.add_component("writer", writer)
    
    pipeline.connect("converter", "cleaner")
    pipeline.connect("cleaner", "splitter")
    # pipeline.connect("splitter", "enricher")
    pipeline.connect("splitter", "embedder")
    pipeline.connect("embedder", "writer")
    
    return pipeline
    

### Processing PDFs

In [21]:
def get_doc_paths() -> list:
    paths = []
    for root, dirs, files in os.walk("../data/prototyping"):
        for file in files:
            if file.endswith(".pdf"):
                paths.append(os.path.join(root, file))
    return paths

def process_files_in_prototyping_folder() -> None: # Long name but it's descriptive :)
    pipeline = create_indexing_pipeline()
    paths = get_doc_paths()
    results = pipeline.run(
        data={"converter" : {"sources": paths}},
        include_outputs_from=["converter", "cleaner", "splitter"]
    )
    print(results)
    

In [25]:
process_files_in_prototyping_folder()

2024-12-18 12:05:16,377 - haystack.core.pipeline.pipeline - INFO - Running component converter
2024-12-18 12:05:16,991 - haystack.core.pipeline.pipeline - INFO - Running component cleaner
2024-12-18 12:05:16,991 - haystack.core.pipeline.pipeline - INFO - Running component splitter
2024-12-18 12:05:16,992 - haystack.core.pipeline.pipeline - INFO - Running component embedder
Calculating embeddings: 0it [00:00, ?it/s]2024-12-18 12:05:17,234 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 401 Unauthorized"
2024-12-18 12:05:17,235 - haystack.components.embedders.openai_document_embedder - ERROR - Failed embedding of documents 52eede293bf4a2f962897130ddf6998716d1e73b3ec833b8a79a92425dc31cba caused by Error code: 401 - {'error': {'message': 'You do not have access to the organization tied to the API key.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_organization'}}
Calculating embeddings: 1it [00:00,  4.13it/s]
2024-12-18 12:05:17,236 - 

{'embedder': {'meta': {}}, 'writer': {'documents_written': 1}, 'converter': {'documents': [Document(id=1c1a4014474742c65f5c84c76935b81f1c6e6c3066e5d1cc3e687c64e7b36e86, content: 'Inleiding
Het huisAmerongenenzijnbewoners
Deeerstevermeldingvanhet huisteAmerongenisuit 1286. Degraa...', meta: {'file_path': '../data/prototyping\\amerongen_intro.pdf'})]}, 'cleaner': {'documents': [Document(id=6da3250e6f903f7100f31d9135f683584a760c6094cd5ebd978bbe10db7fd6e6, content: 'Inleiding
Het huisAmerongenenzijnbewoners
Deeerstevermeldingvanhet huisteAmerongenisuit 1286. Degraa...', meta: {'file_path': '../data/prototyping\\amerongen_intro.pdf'})]}, 'splitter': {'documents': [Document(id=52eede293bf4a2f962897130ddf6998716d1e73b3ec833b8a79a92425dc31cba, content: 'Inleiding
Het huisAmerongenenzijnbewoners
Deeerstevermeldingvanhet huisteAmerongenisuit 1286. Degraa...', meta: {'file_path': '../data/prototyping\\amerongen_intro.pdf', 'source_id': '6da3250e6f903f7100f31d9135f683584a760c6094cd5ebd978bbe10db7f




## Removing data from docstore

In [24]:
docstore = create_docstore()

to_delete = []

filepath_filter = {
    "field": "meta.file_path",
    "operator": "==",
    "value": "../data/prototyping\\amerongen_intro.pdf"
}

docs = docstore.filter_documents(filepath_filter)

doc_ids = [doc.id for doc in docs]

docstore.delete_documents(doc_ids)

2024-12-18 12:05:10,903 - pinecone_plugin_interface.logging - INFO - Discovering subpackages in _NamespacePath(['e:\\programming\\HUA-rag\\.venv\\Lib\\site-packages\\pinecone_plugins'])
2024-12-18 12:05:10,904 - pinecone_plugin_interface.logging - INFO - Looking for plugins in pinecone_plugins.inference
2024-12-18 12:05:10,904 - pinecone_plugin_interface.logging - INFO - Installing plugin inference into Pinecone
2024-12-18 12:05:11,253 - haystack_integrations.document_stores.pinecone.document_store - INFO - Connecting to existing index archiefutrecht. `dimension`, `spec`, and `metric` will be ignored.
