In [1]:
from dotenv import load_dotenv
import os

load_dotenv(".env")

open_ai_key = os.getenv("OPENAI_API_KEY")

In [2]:
from haystack.dataclasses import ChatMessage
from haystack.components.generators.chat import OpenAIChatGenerator
from haystack.components.generators.utils import print_streaming_chunk

  from .autonotebook import tqdm as notebook_tqdm


### Build indexing pipeline

In [8]:
from haystack import Pipeline, Document
from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
from haystack.components.writers import DocumentWriter
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
import pandas as pd
from haystack.components.preprocessors import DocumentCleaner
from haystack.document_stores.types import DuplicatePolicy


In [4]:
df = pd.read_csv("df_file.csv")

df.head()

Unnamed: 0,Text,Label
0,Budget to set scene for election\n \n Gordon B...,0
1,Army chiefs in regiments decision\n \n Militar...,0
2,Howard denies split over ID cards\n \n Michael...,0
3,Observers to monitor UK election\n \n Minister...,0
4,Kilroy names election seat target\n \n Ex-chat...,0


In [5]:
list_of_news = df['Text'].to_list()

documents = [Document(id=str(i), content=list_of_news[i]) for i in range(len(list_of_news))]


In [9]:

document_store = ElasticsearchDocumentStore(hosts = "http://localhost:9200",
                                            embedding_similarity_function='cosine')

embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
document_cleaner = DocumentCleaner(
                                remove_empty_lines=True,
                                remove_extra_whitespaces=True,
                                remove_repeated_substrings=True,
                                remove_substrings=['\n']
                            )
document_writer = DocumentWriter(document_store=document_store,
                                 policy=DuplicatePolicy.OVERWRITE )

indexing_pipeline = Pipeline()
indexing_pipeline.add_component(instance=embedder, name="doc_embedder")
indexing_pipeline.add_component(instance=document_cleaner, name='doc_cleaner')
indexing_pipeline.add_component(instance=document_writer, name="doc_writer")

indexing_pipeline.connect("doc_cleaner.documents", "doc_embedder.documents")
indexing_pipeline.connect("doc_embedder.documents", "doc_writer.documents")


<haystack.core.pipeline.pipeline.Pipeline object at 0x1530973b0>
🚅 Components
  - doc_embedder: SentenceTransformersDocumentEmbedder
  - doc_cleaner: DocumentCleaner
  - doc_writer: DocumentWriter
🛤️ Connections
  - doc_embedder.documents -> doc_writer.documents (List[Document])
  - doc_cleaner.documents -> doc_embedder.documents (List[Document])

In [10]:
# Run pipeline
indexing_pipeline.run({"doc_cleaner": {"documents": documents}})


Batches: 100%|██████████| 70/70 [01:08<00:00,  1.03it/s]


{'doc_writer': {'documents_written': 2225}}

### Build RAG pipeline

In [11]:
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack_integrations.components.retrievers.elasticsearch import ElasticsearchEmbeddingRetriever
from haystack.components.builders import ChatPromptBuilder
from haystack.dataclasses import ChatMessage
from haystack.components.generators.chat import OpenAIChatGenerator

In [12]:
template = [ChatMessage.from_system("""
Answer the questions based on the given context.

Context:
{% for document in documents %}
    {{ document.content }}
{% endfor %}
Question: {{ question }}
Answer:
""")]
query_embedder = SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
query_retriever = ElasticsearchEmbeddingRetriever(document_store=document_store)
prompt_builder = ChatPromptBuilder(template=template)
llm = OpenAIChatGenerator(model="gpt-4o-mini")
rag_pipe = Pipeline()
rag_pipe.add_component(instance= query_embedder, name = "embedder" )
rag_pipe.add_component(instance=query_retriever, name="retriever")
rag_pipe.add_component(instance=prompt_builder, name="prompt_builder")
rag_pipe.add_component(instance=llm, name="llm" )

rag_pipe.connect("embedder.embedding", "retriever.query_embedding")
rag_pipe.connect("retriever", "prompt_builder.documents")
rag_pipe.connect("prompt_builder.prompt", "llm.messages")

<haystack.core.pipeline.pipeline.Pipeline object at 0x1532b3050>
🚅 Components
  - embedder: SentenceTransformersTextEmbedder
  - retriever: ElasticsearchEmbeddingRetriever
  - prompt_builder: ChatPromptBuilder
  - llm: OpenAIChatGenerator
🛤️ Connections
  - embedder.embedding -> retriever.query_embedding (List[float])
  - retriever.documents -> prompt_builder.documents (List[Document])
  - prompt_builder.prompt -> llm.messages (List[ChatMessage])

In [13]:
def rag_pipeline_func(query: str):
    result = rag_pipe.run({"embedder": {"text": query}, "prompt_builder": {"question": query}})

    return {"reply": result["llm"]["replies"][0].content}


In [14]:
tools = [
    {
        "type": "function",
        "function": {
            "name": "rag_pipeline_func",
            "description": "Get information about where people live",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "The query to use in the search. Infer this from the user's message. It should be a question or a statement",
                    }
                },
                "required": ["query"],
            },
        },
    },
]

In [15]:
from haystack.dataclasses import ChatMessage
from haystack.components.generators.chat import OpenAIChatGenerator
from haystack.components.generators.utils import print_streaming_chunk

messages = [
    ChatMessage.from_system(
        "Don't make assumptions about what values to plug into functions. Ask for clarification if a user request is ambiguous."
    ),
    ChatMessage.from_user("Summarize news about Scottland"),
]

chat_generator = OpenAIChatGenerator(model="gpt-4o-mini", streaming_callback=print_streaming_chunk)
response = chat_generator.run(messages=messages, generation_kwargs={"tools": tools})

In [None]:
response

In [16]:
import json

## Parse function calling information
function_call = json.loads(response["replies"][0].text)[0]
function_name = function_call["function"]["name"]
function_args = json.loads(function_call["function"]["arguments"])
print("Function Name:", function_name)
print("Function Arguments:", function_args)

## Find the correspoding function and call it with the given arguments
available_functions = {"rag_pipeline_func": rag_pipeline_func}
function_to_call = available_functions[function_name]
function_response = function_to_call(**function_args)
print("Function Response:", function_response)


Function Name: rag_pipeline_func
Function Arguments: {'query': 'latest news about Scotland'}


Batches: 100%|██████████| 1/1 [00:00<00:00, 34.73it/s]


Function Response: {'reply': "The latest news about Scotland includes the introduction of a comprehensive smoking ban in public places, which will be enforced by Spring 2006. This ban aims to improve Scotland's health rates and reduce preventable deaths caused by smoking. Additionally, the Scottish Environment Protection Agency has warned that climate change could be uncontrollable within decades, leading to discussions about nuclear energy and wind farms as potential solutions. The Scottish Parliament is also considering options regarding super-casinos, with a decision expected soon on whether to allow Westminster to legislate on the matter. Furthermore, there is an ongoing campaign against plans to merge the Scottish regiments, with a group targeting several key marginal Labour seats in the upcoming general election. Lastly, Euan Murray has returned to the Scotland training squad for the Six Nations after completing an eight-week ban."}
