In [83]:
%%capture --no-stderr
!pip install llama-index
!pip install llama-index-embeddings-huggingface

In [2]:
from llama_index.core import VectorStoreIndex, SummaryIndex, SimpleKeywordTableIndex, SimpleDirectoryReader
from llama_index.core.schema import IndexNode
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.llms.openai import OpenAI

In [64]:
import os
from google.colab import userdata
# Read the OpenAI key from Colab's secret store and export it through the
# environment so the key never appears in the notebook itself.
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')

In [3]:
# Wikipedia page titles to index; each title is also used as the data file name
# and as the key for that city's documents and agent.
wiki_titles = ["Delhi", "Mumbai", "Bengaluru", "Hyderabad", "Chennai"]

In [4]:
from pathlib import Path

import requests

# Download the plain-text extract of every city's Wikipedia page into
# data/<title>.txt.
data_path = Path("data")
# Create the output directory once; exist_ok keeps the cell safe to re-run.
data_path.mkdir(parents=True, exist_ok=True)

for title in wiki_titles:
    http_response = requests.get(
        "https://en.wikipedia.org/w/api.php",
        params={
            "action": "query",
            "format": "json",
            "titles": title,
            "prop": "extracts",
            # 'exintro': True,  # enable to fetch only the intro section
            "explaintext": True,
        },
        # Without a timeout a stalled connection would hang the cell forever.
        timeout=30,
    )
    # Fail on HTTP errors instead of trying to parse an error page as JSON.
    http_response.raise_for_status()
    response = http_response.json()

    # The API keys pages by page id; one title was requested, so take the
    # single entry.
    page = next(iter(response["query"]["pages"].values()))
    if "extract" not in page:
        # Unknown titles come back as a page entry without any text.
        raise KeyError(f"Wikipedia returned no extract for title {title!r}")
    wiki_text = page["extract"]

    # Write UTF-8 explicitly: the articles contain non-ASCII characters and
    # the platform default encoding is not UTF-8 everywhere.
    with open(data_path / f"{title}.txt", "w", encoding="utf-8") as fp:
        fp.write(wiki_text)

In [5]:
# Read every downloaded article back in, keyed by its wiki title.
city_docs = {}
for wiki_title in wiki_titles:
    reader = SimpleDirectoryReader(input_files=[f"data/{wiki_title}.txt"])
    city_docs[wiki_title] = reader.load_data()

In [11]:

from llama_index.embeddings.huggingface import (
    HuggingFaceEmbedding,
)
from llama_index.core import Settings

# Register the BGE small English model as the embedding model on the global
# Settings object.
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [72]:
from llama_index.llms.openai import OpenAI
# Chat model used throughout the notebook; temperature=0 keeps sampling
# randomness to a minimum. It is also registered on the global Settings.
llm = OpenAI(model="gpt-3.5-turbo", temperature=0)
Settings.llm = llm

In [73]:
from llama_index.agent.openai import OpenAIAgent

In [74]:
# Build one agent per city, keyed by wiki title. Each agent gets two tools over
# the same article: a vector index for targeted lookups and a summary index for
# whole-article summarization.
agents = {}

for wiki_title in wiki_titles:
    # build vector index
    vector_index = VectorStoreIndex.from_documents(
        city_docs[wiki_title], llm=llm
    )
    # build summary index
    summary_index = SummaryIndex.from_documents(
        city_docs[wiki_title], llm=llm
    )
    # define query engines
    vector_query_engine = vector_index.as_query_engine()
    list_query_engine = summary_index.as_query_engine()

    # define tools
    # The agent chooses a tool from its description, so each description must
    # match the engine behind it: vector -> specific context, summary ->
    # summarization. (They were previously swapped.)
    query_engine_tools = [
        QueryEngineTool(
            query_engine=vector_query_engine,
            metadata=ToolMetadata(
                name="vector_tool",
                description=(
                    f"Useful for retrieving specific context from {wiki_title}"
                ),
            ),
        ),
        QueryEngineTool(
            query_engine=list_query_engine,
            metadata=ToolMetadata(
                name="summary_tool",
                description=(
                    "Useful for summarization questions related to"
                    f" {wiki_title}"
                ),
            ),
        ),
    ]

    # build agent
    agent = OpenAIAgent.from_tools(
        query_engine_tools,
        llm=llm,
        verbose=True,
    )

    agents[wiki_title] = agent

In [75]:
# Top-level index nodes: one short routing blurb per city.
nodes = []
for wiki_title in wiki_titles:
    # index_id is the wiki title, i.e. the same key under which the city's
    # agent is stored in `agents`.
    wiki_summary = (
        f"This content contains Wikipedia articles about {wiki_title}. "
        f"Use this index if you need to lookup specific facts about {wiki_title}.\n"
        "Do not use this index if you want to analyze multiple cities."
    )
    node = IndexNode(index_id=wiki_title, text=wiki_summary)
    nodes.append(node)

In [76]:
# define top-level retriever
# NOTE: this rebinds `vector_index`, which the agent-building loop used for the
# per-city indexes; from here on it refers to the index over the routing nodes.
vector_index = VectorStoreIndex(nodes)
# similarity_top_k=1: only the single best-matching city node is returned.
vector_retriever = vector_index.as_retriever(similarity_top_k=1)

In [77]:
# define recursive retriever
from llama_index.core.retrievers import RecursiveRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.response_synthesizers import get_response_synthesizer

In [78]:
# Start retrieval at the "vector" retriever; when it returns an IndexNode, its
# index_id is looked up in `query_engine_dict`. The `agents` dict can be passed
# there directly because every agent can be used as a query engine.
recursive_retriever = RecursiveRetriever(
    root_id="vector",
    retriever_dict={"vector": vector_retriever},
    query_engine_dict=agents,
    verbose=True,
)

In [79]:
# Final query engine: recursive retrieval followed by "compact" response
# synthesis with the shared LLM.
response_synthesizer = get_response_synthesizer(response_mode="compact")
query_engine = RetrieverQueryEngine.from_args(
    retriever=recursive_retriever,
    llm=llm,
    response_synthesizer=response_synthesizer,
)

In [80]:
# Summarization-style question about a single city; the verbose log below shows
# which city node was entered and which tool the agent called.
response = query_engine.query("Summarize history of Delhi")

[1;3;34mRetrieving with query id None: Summarize history of Delhi
[0m[1;3;38;5;200mRetrieved node with id, entering: Delhi
[0m[1;3;34mRetrieving with query id Delhi: Summarize history of Delhi
[0mAdded user message to memory: Summarize history of Delhi
=== Calling Function ===
Calling function: summary_tool with args: {"input":"history of Delhi"}
Got output: Delhi has a rich historical background, having been the capital of major empires like the Delhi Sultanate and the Mughal Empire. The city has been associated with various rulers and dynasties over the centuries. The topography of the medieval fort Purana Qila is said to match the citadel Indraprastha from the Sanskrit epic Mahabharata. Delhi has been a significant center for Sufism, Qawwali music, and the development of languages like Urdu and Modern Standard Hindi. It was a notable center during the Indian Rebellion of 1857 and transformed from a Mughal city to a Punjabi one during the Partition of India in 1947. New Delhi b

In [81]:

# Display the final synthesized answer text.
response.response

'Delhi has a diverse historical past, serving as the capital for prominent empires such as the Delhi Sultanate and the Mughal Empire. The city has been under the rule of various dynasties and rulers throughout its history. It is believed that the medieval fort Purana Qila corresponds to the ancient citadel Indraprastha from the Mahabharata. Delhi has been a hub for Sufism, Qawwali music, and the evolution of languages like Urdu and Modern Standard Hindi. It played a significant role during the Indian Rebellion of 1857 and underwent a transformation from a Mughal city to a Punjabi one during the Partition of India in 1947. Following independence in 1947, New Delhi was established as the capital of the Dominion of India.'

In [82]:
# Specific-fact question about a single city. The answer comes from the
# downloaded article, so "current" means as of that Wikipedia snapshot.
response = query_engine.query("Who is current Mayor of Hyderabad city?")

[1;3;34mRetrieving with query id None: Who is current Mayor of Hyderabad city?
[0m[1;3;38;5;200mRetrieved node with id, entering: Hyderabad
[0m[1;3;34mRetrieving with query id Hyderabad: Who is current Mayor of Hyderabad city?
[0mAdded user message to memory: Who is current Mayor of Hyderabad city?
=== Calling Function ===
Calling function: vector_tool with args: {"input":"current Mayor of Hyderabad city"}
Got output: Gadwal Vijayalakshmi of Telangana Rashtra Samithi (TRS)

[1;3;32mGot response: The current Mayor of Hyderabad city is Gadwal Vijayalakshmi of Telangana Rashtra Samithi (TRS).
[0m