In [2]:
# !pip install weaviate-client
# !pip install llama_index
# !pip install openai
# !pip install --upgrade pydantic==1.10.0 typing-extensions==4.5.0
# !pip install fastcore

In [2]:
from llama_index.vector_stores import WeaviateVectorStore
from llama_index.schema import TextNode, NodeRelationship, RelatedNodeInfo
from llama_index import VectorStoreIndex, StorageContext
from llama_index import get_response_synthesizer
from llama_index.indices.postprocessor import SimilarityPostprocessor, PrevNextNodePostprocessor
from llama_index.retrievers import VectorIndexRetriever
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.embeddings import HuggingFaceEmbedding
import weaviate, os
from weaviate import EmbeddedOptions
from openai import OpenAI
import pandas as pd
import numpy as np
from llama_index import ServiceContext
from llama_index.llms import OpenAI as OpenAILLama
from llama_index.embeddings import OpenAIEmbedding
from llama_index import download_loader
from configparser import ConfigParser
import openai
import os
from pathlib import Path

## OpenAI config

In [3]:
# Load the OpenAI API key from a local INI file and export it through the
# environment variable that the OpenAI / LlamaIndex clients read.
CONFIG_PATH = Path('conf/conf.ini')

config = ConfigParser()
# ConfigParser.read() skips unreadable files without raising and returns the
# list of files it actually parsed, so an empty result means the file is missing.
if not config.read(CONFIG_PATH):
    raise FileNotFoundError(f"Could not read config file: {CONFIG_PATH.resolve()}")
if not config.has_option('openai', 'apikey'):
    raise KeyError(f"{CONFIG_PATH} must define 'apikey' in an [openai] section")
os.environ["OPENAI_API_KEY"] = config['openai']['apikey']

## Load docs

In [3]:
PandasExcelReader = download_loader("PandasExcelReader")

  for item in lines:


In [4]:
loader = PandasExcelReader(pandas_config={"header": 0})

In [7]:
documents = loader.load_data(file=Path('data/riigikogu/ems_subset.xlsx'))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  numpy.float,


In [9]:
len(documents)

1

## Vector store index

In [14]:
# Local HuggingFace embedding model wrapped in a ServiceContext.
# NOTE: the context only takes effect where it is passed explicitly
# (e.g. as a `service_context=` argument when building the index).
embed_model = HuggingFaceEmbedding(model_name="intfloat/multilingual-e5-base")
service_context = ServiceContext.from_defaults(embed_model=embed_model)

[nltk_data] Downloading package punkt to /tmp/llama_index...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [15]:
# NB: no service_context is passed here, so the HuggingFace embedding model
# configured earlier is NOT used; the index falls back to the default
# (OpenAI) embedding model.
index = VectorStoreIndex.from_documents(documents)

In [17]:
query_engine = index.as_query_engine()

In [19]:
resp=query_engine.query('Kas makse tuleb tõsta?')

In [20]:
resp.response

'Makse tuleb tõsta.'

In [21]:
len(resp.source_nodes)

2

In [23]:
resp=query_engine.query('Miks tuleb makse tõsta? Vasta eesti keeles')
resp.response

'Makse tuleb tõsta selleks, et tagada riigi rahaline stabiilsus ja võimaldada riigil täita oma ülesandeid ja kohustusi. Maksumäärade tõstmine võib olla vajalik, et katta riigi kulutused erinevatele valdkondadele, nagu haridus, tervishoid, infrastruktuur jne. Samuti võib makse tõsta selleks, et vähendada eelarvepuudujääki või katta riigi võlga. Maksete tõstmisel tuleb aga arvestada ka sellega, et see ei tohiks koormata ülemäära ettevõtteid ega kodanikke ning peaks olema õiglane ja tasakaalustatud.'

In [25]:
len(resp.source_nodes)

2

In [24]:
resp=query_engine.query('Miks tuleb makse tõsta?')
resp.response

'Makse tuleb tõsta selleks, et tagada õiglane ühiskond ja rahalise kapitali õiglane jaotus. Samuti võib makse tõsta eelarve kosumiseks ja sotsiaalmaksu vähendamiseks.'

In [26]:
resp=query_engine.query('Miks tuleb makse tõsta?')
resp.response

'Makse tuleb tõsta selleks, et tagada õiglasem ühiskond ja suurendada eelarvetulude mahtu. Samuti võib makse tõsta selleks, et kompenseerida riigi kulutusi ja toetada erinevaid avalikke teenuseid ning investeeringuid.'

In [28]:
resp.get_formatted_sources()

'> Source (Doc id: f5720e3c-2e63-4994-9cfc-05ccf5327424): saa elu kaua jätkata üldse ma arvan et tuleks varade maksustamisel aluseks võtta mingi alampiir s...\n\n> Source (Doc id: e7dc542a-cf33-4578-812d-aea022c09d21): see peab olema selge\nja arusaadav ettevõtted on nõus seda maksma aga see süsteem ei tohiks olla k...'

In [32]:
len(index.docstore.docs)

212

In [36]:
# index.docstore.docs

In [37]:
resp=query_engine.query('Mis saab keskkonnast?')
resp.response

'Keskkonnast saab suur oht loodusele, eriti seoses põhjavee saastamisega. Põllumajanduse kasutatavad väetised ja kemikaalid on suurem probleem kui metsade raiumine. Üks võimalik lahendus on soodustada tervislikumat toitumist ja vähendada põhjavee mürgitamist, näiteks käibemaksuvabastusega ökoloogiliselt puhastele toiduainetele. Lisaks on arutlusel ka varade maksustamine, et vähendada palgalt võetavaid makse ja suurendada varadelt võetavaid makse. Mõned soovitavad maksustada suuremat rikkust, näiteks vara väärtusest lähtuvalt. Lisaks on tulevikus oodata uusi makse, nagu digimaks, et koguda maksutulu.'

## Save index to disk

In [39]:
# Persist index to disk
index.storage_context.persist("riigikogu_index")

## Reload index

In [4]:
from llama_index import StorageContext, load_index_from_storage

# Directory the index was persisted to in the "Save index to disk" section.
PERSIST_DIR = "riigikogu_index"

# Recreate the storage context from the persisted files ...
storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)

# ... and restore the index from it.
index = load_index_from_storage(storage_context)

In [5]:
query_engine = index.as_query_engine()

## Tools

In [38]:
#https://github.com/run-llama/llama_index/blob/main/docs/examples/query_transformations/query_transform_cookbook.ipynb

In [20]:
#https://docs.google.com/presentation/d/1IJ1bpoLmHfFzKM3Ef6OoWGwvrwDwLV7EcoOHxLZzizE/edit#slide=id.g23d546514bd_0_290
#https://docs.llamaindex.ai/en/stable/module_guides/querying/router/root.html

In [6]:
from llama_index.selectors.llm_selectors import (
    LLMSingleSelector,
    LLMMultiSelector,
)
from llama_index.selectors.pydantic_selectors import (
    PydanticMultiSelector,
    PydanticSingleSelector,
)

In [7]:
selector = LLMMultiSelector.from_defaults()

In [8]:
from llama_index.tools.types import ToolMetadata

# (name, description) pairs for the candidate tools handed to the selector.
_TOOL_SPECS = [
    ("count_of_events", "This tool counts events which match query string"),
    (
        "changes_in_events",
        "This tool finds most significant changes in events which match query string",
    ),
    ("show_sample_events", "This tool returns sample events which match query string"),
]

tool_choices = [
    ToolMetadata(name=tool_name, description=tool_description)
    for tool_name, tool_description in _TOOL_SPECS
]

In [16]:
from IPython.display import Markdown, display


# define prompt viewing function
def display_prompt_dict(prompts_dict):
    """Show every prompt in ``prompts_dict``.

    For each entry a Markdown header with the prompt key is rendered, the
    prompt's template text is printed as plain text, and a Markdown spacer
    follows.

    Args:
        prompts_dict: Mapping of prompt key to a prompt object that exposes
            ``get_template()``.
    """
    for prompt_key, prompt in prompts_dict.items():
        header_md = f"**Prompt Key**: {prompt_key}<br>**Text:** <br>"
        display(Markdown(header_md))
        # The template itself goes through print(), not Markdown rendering.
        print(prompt.get_template())
        display(Markdown("<br><br>"))

In [11]:
display_prompt_dict(selector.get_prompts())

**Prompt Key**: prompt<br>**Text:** <br>

Some choices are given below. It is provided in a numbered list (1 to {num_choices}), where each item in the list corresponds to a summary.
---------------------
{context_list}
---------------------
Using only the choices above and not prior knowledge, return the top choices (no more than {max_outputs}, but only select what is needed) that are most relevant to the question: '{query_str}'


The output should be ONLY JSON formatted as a JSON instance.

Here is an example:
[
    {{
        choice: 1,
        reason: "<insert reason for choice>"
    }},
    ...
]



<br><br>

In [12]:
selector_result = selector.select(
    tool_choices, query="How many events were there between May-August 2023?"
)

In [13]:
selector_result.selections

[SingleSelection(index=0, reason='This tool counts events which match query string. It can be used to count the number of events that occurred between May-August 2023 by providing the appropriate query string.'),
 SingleSelection(index=2, reason='This tool returns sample events which match query string. Although it may not provide the exact count of events, it can give a sample of events that occurred between May-August 2023.')]

## Query Transformation with ReAct Prompt

In [21]:
from llama_index.agent.react.formatter import ReActChatFormatter
from llama_index.agent.react.output_parser import ReActOutputParser
from llama_index.tools import FunctionTool
from llama_index.llms.types import ChatMessage

In [23]:
def get_count(text: str) -> int:
    """Given a text string finds number of events in database and returns the answer."""
    # The docstring above is shown to the LLM as the tool description, so its
    # wording is kept stable.
    # Mock implementation: `text` is ignored and a fixed count is returned.
    mock_event_count = 100
    return mock_event_count


def get_changes(text: str) -> str:
    """Given a text string finds events in database and returns most significant changes in period"""
    # The docstring above is shown to the LLM as the tool description, so its
    # wording is kept stable.
    # Mock implementation: `text` is ignored and a canned answer is returned.
    mock_summary = "channel x events growth was 100%"
    return mock_summary


# Wrap the mock functions as tools, in the same order as before
# (get_count first, then get_changes).
tools = [FunctionTool.from_defaults(fn=mock_fn) for mock_fn in (get_count, get_changes)]
tool1, tool2 = tools

In [30]:
chat_formatter = ReActChatFormatter()
output_parser = ReActOutputParser()

# Single-turn chat history holding only the user's question.
user_question = ChatMessage(
    role="user",
    content="Can you tell me what is the number of events which talk about changing taxes?",
)

# Build the message list for the LLM from the tools and the chat history.
input_msgs = chat_formatter.format(tools, [user_question])
input_msgs

[ChatMessage(role=<MessageRole.SYSTEM: 'system'>, content='\nYou are designed to help with a variety of tasks, from answering questions     to providing summaries to other types of analyses.\n\n## Tools\nYou have access to a wide variety of tools. You are responsible for using\nthe tools in any sequence you deem appropriate to complete the task at hand.\nThis may require breaking the task into subtasks and using different tools\nto complete each subtask.\n\nYou have access to the following tools:\n> Tool Name: get_count\nTool Description: get_count(text: str) -> int\nGiven a text string finds number of events in database and returns the answer.\nTool Args: {\'title\': \'get_count\', \'type\': \'object\', \'properties\': {\'text\': {\'title\': \'Text\', \'type\': \'string\'}}, \'required\': [\'text\']}\n\n> Tool Name: get_changes\nTool Description: get_changes(text: str) -> str\nGiven a text string finds events in database and returns most significant changes in period\nTool Args: {\'ti

In [31]:
# Send the formatted ReAct messages to the chat model (temperature 0.1).
# The assistant message content is what `output_parser.parse` consumes in the
# following cell.
llm = OpenAILLama(model="gpt-3.5-turbo", temperature=0.1)
response = llm.chat(input_msgs)

In [32]:
response

ChatResponse(message=ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, content='Thought: I need to use a tool to help me answer the question.\nAction: get_count\nAction Input: {"text": "changing taxes"}', additional_kwargs={}), raw={'id': 'chatcmpl-8VwSya02IJ4eQzhTxCdOJZMwOqg69', 'choices': [Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='Thought: I need to use a tool to help me answer the question.\nAction: get_count\nAction Input: {"text": "changing taxes"}', role='assistant', function_call=None, tool_calls=None))], 'created': 1702623604, 'model': 'gpt-3.5-turbo-0613', 'object': 'chat.completion', 'system_fingerprint': None, 'usage': CompletionUsage(completion_tokens=30, prompt_tokens=523, total_tokens=553)}, delta=None, additional_kwargs={})

In [33]:
reasoning_step = output_parser.parse(response.message.content)
reasoning_step.action_input

{'text': 'changing taxes'}

## Query Rewriting (using QueryTransform)

In [5]:
from llama_index.indices.query.query_transform import HyDEQueryTransform
from llama_index.llms import OpenAI

In [6]:
# NOTE(review): `llm` is created here but is not passed to the HyDE transform
# below, which is constructed with `include_original=True` only.
llm = OpenAI(model="gpt-3.5-turbo")
hyde = HyDEQueryTransform(include_original=True)

# Run the transform on the query; the resulting bundle is inspected in the next cell.
query_bundle = hyde.run("What is Bel?")

In [8]:
query_bundle 

QueryBundle(query_str='What is Bel?', image_path=None, custom_embedding_strs=['Bel is a term that has multiple meanings and can be interpreted in various ways depending on the context. In ancient Mesopotamian mythology, Bel was a prominent deity and the god of the heavens and earth. He was considered one of the most powerful gods in the pantheon and was often associated with the city of Babylon. Bel was believed to have control over natural phenomena such as storms, rain, and fertility, and was worshipped by the Babylonians through elaborate rituals and offerings.\n\nIn addition to its mythological significance, Bel is also a title that has been used throughout history to refer to different individuals or positions of authority. For instance, in ancient Babylon, the title of Bel was given to the ruler or king, signifying their divine status and authority over the land. This title was also used in other ancient civilizations, such as the Hittites and the Assyrians, to denote a high-rank

## Sub questions

In [6]:
# from llama_index.question_gen import (
#     LLMQuestionGenerator,
#     OpenAIQuestionGenerator,
# )
from llama_index.question_gen.llm_generators import LLMQuestionGenerator
from llama_index.question_gen.openai_generator import OpenAIQuestionGenerator
from llama_index.llms import OpenAI

In [14]:
llm = OpenAI()
question_gen = OpenAIQuestionGenerator.from_defaults(llm=llm)

In [17]:
display_prompt_dict(question_gen.get_prompts())

**Prompt Key**: question_gen_prompt<br>**Text:** <br>

You are a world class state of the art agent.

You have access to multiple tools, each representing a different data source or API.
Each of the tools has a name and a description, formatted as a JSON dictionary.
The keys of the dictionary are the names of the tools and the values are the descriptions.
Your purpose is to help answer a complex user question by generating a list of sub questions that can be answered by the tools.

These are the guidelines you consider when completing your task:
* Be as specific as possible
* The sub questions should be relevant to the user question
* The sub questions should be answerable by the tools provided
* You can generate multiple sub questions for each tool
* Tools must be specified by their name, not their description
* You don't need to use a tool if you don't think it's relevant

Output the list of sub questions by calling the SubQuestionList function.

## Tools
```json
{tools_str}
```

## User Question
{query_str}



<br><br>

In [30]:
from llama_index.tools.types import ToolMetadata


# Tool name -> description for the tools offered to the question generator.
tool_descriptions = {
    "count_of_events": "This tool counts events which match query string",
    "changes_in_events": "This tool finds most significant changes in events which match query string",
    "show_sample_events": "This tool returns sample events which match query string",
}

# Dict insertion order keeps the tools in their original order.
tool_choices = [
    ToolMetadata(name=tool_name, description=tool_description)
    for tool_name, tool_description in tool_descriptions.items()
]



In [31]:
from llama_index.schema import QueryBundle

query_str = "What were the most significant changes in calls in 2023?"
choices = question_gen.generate(tool_choices, QueryBundle(query_str=query_str))

In [32]:
choices

[SubQuestion(sub_question='How many calls were made in 2023?', tool_name='count_of_events'),
 SubQuestion(sub_question='What were the changes in calls in 2023?', tool_name='changes_in_events'),
 SubQuestion(sub_question='Can you show me a sample of calls made in 2023?', tool_name='show_sample_events')]

## Version 2 for sub questions

In [32]:
from llama_index.tools import FunctionTool

def count_of_events(text: str) -> str:
    """Given a text string finds number of events in database and returns the answer."""
    # Mock implementation: no database is queried; the count is hard-coded and
    # only `text` is echoed back in the answer.
    mock_count = 101
    answer = f"Number of events containing '{text}' is: '{mock_count}'"
    return answer


def changes_in_events(text: str) -> str:
    """Given a text string finds events in database and returns most significant changes in period"""
    # Mock implementation: no database is queried; a canned answer that echoes
    # `text` is returned. The leftover debug `print('here')` was removed so that
    # tool calls no longer write stray text to stdout.
    return f"Most significant changes in events containing '{text}' are: 'number of calls rose 100%'"

def show_sample_events(text: str, n:int=5) -> str:
    """Given a text string finds events and returns n of them"""
    # Mock implementation: `n` is accepted but not used; the same two
    # placeholder events are always reported.
    sample_listing = "'event1', 'event2'"
    return f"Events which contain '{text}' are: {sample_listing}"


# Wrap each mock function as a FunctionTool (created in the same order as
# before) and collect them for the agent.
count_of_events_tool, changes_in_events_tool, show_sample_events_tool = (
    FunctionTool.from_defaults(fn=mock_fn)
    for mock_fn in (count_of_events, changes_in_events, show_sample_events)
)

tools = [count_of_events_tool, changes_in_events_tool, show_sample_events_tool]

In [26]:
count_of_events_tool

<llama_index.tools.function_tool.FunctionTool at 0x7f876e375850>

In [17]:
# Load index from the storage context
# index = load_index_from_storage(storage_context)

In [18]:
# query_engine = index.as_query_engine()

In [19]:
import nest_asyncio
nest_asyncio.apply()

In [29]:
from llama_index.callbacks import CallbackManager, LlamaDebugHandler
from llama_index.query_engine import SubQuestionQueryEngine
from llama_index.tools import QueryEngineTool, ToolMetadata

# Expose the same base query engine under three tool names so the sub-question
# engine can route generated sub questions by tool name/description.
# NOTE: `fn_schema` is intentionally left at its default. It expects a pydantic
# model class describing the tool's arguments; the FunctionTool instance that
# was previously passed for "count_of_events" is not a valid schema.
query_engine_tools = [
    QueryEngineTool(
        query_engine=query_engine,
        metadata=ToolMetadata(
            name="count_of_events",
            description=("This tool counts events which match query string"),
        ),
    ),
    QueryEngineTool(
        query_engine=query_engine,
        metadata=ToolMetadata(
            name="changes_in_events_tool",
            description=("This tool finds most significant changes in events which match query string"),
        ),
    ),
    QueryEngineTool(
        query_engine=query_engine,
        metadata=ToolMetadata(
            name="show_sample_events_tool",
            description=("This tool returns sample events which match query string"),
        ),
    ),
]

# Debug handler that prints the trace once a query finishes.
llama_debug = LlamaDebugHandler(print_trace_on_end=True)
callback_manager = CallbackManager([llama_debug])

# NOTE: this rebinds `service_context`; the new context carries only the
# callback manager (no custom embed_model).
service_context = ServiceContext.from_defaults(
    callback_manager=callback_manager
)

# Sub-question engine over the tools above.
query_engine_sub = SubQuestionQueryEngine.from_defaults(
    query_engine_tools=query_engine_tools,
    service_context=service_context,
    use_async=True,
)

In [30]:
response = query_engine_sub.query(
    "What changes have happened with in calls which talk about 'bill'?"
)

Generated 3 sub questions.
[1;3;38;2;237;90;200m[count_of_events] Q: How many events match the query string 'bill'?
[0m[1;3;38;2;90;149;237m[changes_in_events_tool] Q: What are the most significant changes in events that match the query string 'bill'?
[0m[1;3;38;2;11;159;203m[show_sample_events_tool] Q: Can you show me some sample events that match the query string 'bill'?
[0m[1;3;38;2;90;149;237m[changes_in_events_tool] A: The context information does not provide any information about significant changes in events related to a bill.
[0m[1;3;38;2;11;159;203m[show_sample_events_tool] A: I'm sorry, but based on the given context information, I cannot provide any sample events that match the query string 'bill'.
[0m[1;3;38;2;237;90;200m[count_of_events] A: Based on the given context information, there is no mention of any events related to a 'bill'. Therefore, the number of events that match the query string 'bill' is zero.
[0m**********
Trace: query
    |_query ->  8.655426 s

## Agent

In [34]:
from llama_index.agent import OpenAIAgent

In [35]:
agent = OpenAIAgent.from_tools(tools, verbose=True)

In [36]:
agent.chat(  "What changes have happened with in calls which talk about 'bill'?")

STARTING TURN 1
---------------

=== Calling Function ===
Calling function: changes_in_events with args: {
  "text": "calls about bill"
}
here
Got output: Most significant changes in events containing 'calls about bill' are: 'number of calls rose 100%'

STARTING TURN 2
---------------



AgentChatResponse(response="The most significant change in events related to calls about 'bill' is that the number of calls has increased by 100%.", sources=[ToolOutput(content="Most significant changes in events containing 'calls about bill' are: 'number of calls rose 100%'", tool_name='changes_in_events', raw_input={'args': (), 'kwargs': {'text': 'calls about bill'}}, raw_output="Most significant changes in events containing 'calls about bill' are: 'number of calls rose 100%'")], source_nodes=[])