In [None]:
import os
import nest_asyncio
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
import chromadb
from raptor_pack.llama_index.packs.raptor.base import RaptorRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
import json
from datasets import load_dataset
from llama_index.core.schema import Document

# Keep an API key that is already exported in the environment; only fall back
# to the empty placeholder when none is set. Assigning "" unconditionally would
# blank out a valid key supplied by the shell or a secrets manager.
os.environ.setdefault("OPENAI_API_KEY", "")
# Applied once at startup, before any of the retrieval/query calls below.
# NOTE(review): presumably needed because the notebook kernel already runs an
# event loop — confirm against the nest_asyncio documentation.
nest_asyncio.apply()
# Load the corpus. The loop below treats it as a mapping from a key to an
# iterable of sentence strings.
with open('hotpotqa_corpus.json', encoding='utf-8') as corpus_file:
    lines = json.load(corpus_file)

output_directory = 'hotpot_temp_data'
os.makedirs(output_directory, exist_ok=True)

# Write one text file per corpus entry: the key on the first line, followed by
# the sentences concatenated with no separator. UTF-8 is set explicitly so the
# files do not depend on the platform's default encoding.
all_file = []
for count, (key, sentences) in enumerate(lines.items()):
    file_path = os.path.join(output_directory, f"{count}.txt")
    with open(file_path, 'w', encoding='utf-8') as out_file:
        out_file.write(key + '\n' + ''.join(sentences))
    all_file.append(file_path)
# Preserve the original post-loop meaning of `count`: number of files written.
count = len(all_file)

# Load the fact extraction file inside a context manager so the handle is
# closed (the bare json.load(open(...)) form left it open).
with open("hotpotqa_kg.json", encoding="utf-8") as kg_file:
    corpus = json.load(kg_file)

# NOTE(review): `documents` is rebound from the exported text files right after
# this block, so this per-fact list never reaches the retriever — confirm that
# is intended.
documents = []
entities_facts = {}  # lower-cased entity -> facts that mention it, in encounter order
fact_counts = {}     # fact text -> number of entity mentions attached to that fact
for doc in corpus:
    for rel in doc["facts"]:
        fact = rel["fact"]
        documents.append(Document(text=fact))
        for ent in rel["entities"]:
            entities_facts.setdefault(ent.lower(), []).append(fact)
            fact_counts[fact] = fact_counts.get(fact, 0) + 1

# Merge each entity's facts into one newline-joined text and de-duplicate the
# merged texts. `new_docs` is only used for membership; the Document list is
# built in first-seen order so its ordering is the same on every run (iterating
# a set of strings is not stable across interpreter runs).
new_docs = set()
higher_level_facts = []
for facts in entities_facts.values():
    # Skip an entity whose single fact was also counted for another entity mention.
    if len(facts) == 1 and fact_counts[facts[0]] > 1:
        continue
    merged = "\n".join(facts)
    if merged in new_docs:
        continue
    new_docs.add(merged)
    higher_level_facts.append(Document(text=merged))

# Read the exported text files back in as the documents handed to the retriever
# (this rebinds `documents`).
documents = SimpleDirectoryReader(input_files=all_file).load_data()

# Retriever over the documents plus the entity-level fact documents.
raptor_embedder = OpenAIEmbedding(model="text-embedding-3-small")
raptor_llm = OpenAI(model="gpt-4o", temperature=0)
retriever = RaptorRetriever(
    documents,
    higher_level_facts=higher_level_facts,
    embed_model=raptor_embedder,
    llm=raptor_llm,
    similarity_top_k=20,
    mode="collapsed",
    verbose=True,
)

# A separate LLM instance (same model and temperature) is passed to the query engine.
answer_llm = OpenAI(model="gpt-4o", temperature=0)
query_engine = RetrieverQueryEngine.from_args(retriever, llm=answer_llm)

In [None]:
import time

# Load the evaluation set; each record is read for its 'question' and 'answer'.
with open('hotpotqa.json', encoding='utf-8') as qa_file:
    lines2 = json.load(qa_file)

execution_time = 0  # cumulative seconds spent inside query_engine.query
total_eval = []
for value in lines2:
    # The prompt suffix is kept verbatim: changing its wording would change the
    # model input and therefore the recorded predictions.
    final_question = value['question'] + " Answer this question in as fewer number of words as possible."
    # perf_counter is monotonic, so the measured duration is not distorted by
    # system clock adjustments the way time.time() differences can be.
    start_time = time.perf_counter()
    response = query_engine.query(final_question)
    end_time = time.perf_counter()
    execution_time += end_time - start_time
    element = {
        "q": value['question'],
        "a": value['answer'],
        "predict": str(response).strip(),
    }
    total_eval.append(element)
    print("Finished a file!")

In [3]:
# Persist the collected predictions as a single JSON array.
output_path = 'output/output_hotpot_SiReRAG_gpt4o_temp0.json'
# Serialise before opening the file, so a serialisation error cannot leave a
# truncated results file behind.
serialized_eval = json.dumps(total_eval)
# Create the target folder if needed; otherwise open() raises FileNotFoundError
# after the whole evaluation run has already completed.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as out_file:
    out_file.write(serialized_eval)