In [20]:
import os
import urllib.request

from haystack import Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.retrievers import InMemoryEmbeddingRetriever
from haystack.components.converters import TextFileToDocument
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack_integrations.components.embedders.ollama import OllamaTextEmbedder, OllamaDocumentEmbedder
from haystack_integrations.components.generators.ollama import OllamaChatGenerator
from haystack.components.generators.chat import AzureOpenAIChatGenerator
from haystack.components.writers import DocumentWriter
from haystack.components.builders import ChatPromptBuilder
from haystack.dataclasses import ChatMessage

In [30]:
# Read LAAMA_API_KEY and LAAMA_API_URL from .env
from dotenv import load_dotenv
load_dotenv()
LAAMA_API_KEY = os.getenv("LAAMA_API_TOKEN")
LAAMA_API_URL = os.getenv("LAAMA_API_URL")
EMBEDDER_MODEL = "snowflake-arctic-embed2"
CHAT_MODEL = "deepseek-r1:7b"

AZURE_ENDPOINT = os.getenv("AZURE_ENDPOINT")
AZURE_DEPLOYMENT = os.getenv("AZURE_OPENAI_DEPLOYMENT")

print(AZURE_DEPLOYMENT)
print(AZURE_ENDPOINT)
print(os.getenv("AZURE_OPENAI_API_KEY"))


curredev4omini
https://currechattest.openai.azure.com/
04307ef9d20746b48c2c7a29970314a4


In [32]:
llm = AzureOpenAIChatGenerator(
    azure_endpoint=AZURE_ENDPOINT,
    azure_deployment=AZURE_DEPLOYMENT,
)

document_store = InMemoryDocumentStore()

text_file_converter = TextFileToDocument()
cleaner = DocumentCleaner()
splitter = DocumentSplitter()
embedder = OllamaDocumentEmbedder(
    # api_key=LAAMA_API_KEY,
    url=LAAMA_API_URL,
    model=EMBEDDER_MODEL,
)
embedder._client._client.headers["token"] = LAAMA_API_KEY

writer = DocumentWriter(document_store)

indexing_pipeline = Pipeline()
indexing_pipeline.add_component("converter", text_file_converter)
indexing_pipeline.add_component("cleaner", cleaner)
indexing_pipeline.add_component("splitter", splitter)
indexing_pipeline.add_component("embedder", embedder)
indexing_pipeline.add_component("writer", writer)

indexing_pipeline.connect("converter.documents", "cleaner.documents")
indexing_pipeline.connect("cleaner.documents", "splitter.documents")
indexing_pipeline.connect("splitter.documents", "embedder.documents")
indexing_pipeline.connect("embedder.documents", "writer.documents")
indexing_pipeline.run(data={"sources": ["../data/ohtu/osa0.md", "../data/ohtu/osa1.md", "../data/ohtu/osa2.md" ]})

text_embedder = OllamaTextEmbedder(
    url=LAAMA_API_URL,
    model=EMBEDDER_MODEL,
)
text_embedder._client._client.headers["token"] = LAAMA_API_KEY

retriever = InMemoryEmbeddingRetriever(document_store)
prompt_template = [
    ChatMessage.from_user(
      """
      Given these documents, answer the question.
      Documents:
      {% for doc in documents %}
          {{ doc.content }}
      {% endfor %}
      Question: {{query}}
      Answer:
      """
    )
]
prompt_builder = ChatPromptBuilder(template=prompt_template)

rag_pipeline = Pipeline()
rag_pipeline.add_component("text_embedder", text_embedder)
rag_pipeline.add_component("retriever", retriever)
rag_pipeline.add_component("prompt_builder", prompt_builder)
rag_pipeline.add_component("llm", llm)

rag_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
rag_pipeline.connect("retriever.documents", "prompt_builder.documents")
rag_pipeline.connect("prompt_builder", "llm")

# --- Print chunked documents after splitting ---
splitter_output = splitter.run({"documents": cleaner.run({"documents": text_file_converter.run({"sources": ["../data/ohtu/osa0.md", "../data/ohtu/osa1.md", "../data/ohtu/osa2.md"]})["documents"]})["documents"]})
print("Chunked documents:")
for doc in splitter_output["documents"]:
    print(doc.content)
    print("---")

# --- Print prompt fed to LLM ---
# Run up to prompt_builder
query = "Miten hyväksiluen miniprojektin?"
text_embedding = text_embedder.run({"text": query})["embedding"]
retrieved_docs = retriever.run({"query_embedding": text_embedding})["documents"]
prompt = prompt_builder.run({"documents": retrieved_docs, "query": query})["prompt"]
print("Prompt fed to LLM:")
print(prompt)

# --- Run the full pipeline as before ---
result = rag_pipeline.run(data={"prompt_builder": {"query":query}, "text_embedder": {"text": query}})
print(result["llm"]["replies"][0].text)


Calculating embeddings: 100%|██████████| 3/3 [00:05<00:00,  1.75s/it]
ChatPromptBuilder has 2 prompt variables, but `required_variables` is not set. By default, all prompt variables are treated as optional, which may lead to unintended behavior in multi-branch pipelines. To avoid unexpected execution, ensure that variables intended to be required are explicitly set in `required_variables`.
Could not read sources. Skipping it. Error: [Errno 2] No such file or directory: 'sources'


TypeError: DocumentCleaner expects a List of Documents as input.