In [1]:
# Standard library: `os` is used by later cells to build filesystem paths;
# `sys` is needed here only for the module-search-path tweak below.
import os
import sys

# Put ./compiled_protobufs first on the module search path. The entry is a
# relative path, so it is resolved against the current working directory.
# NOTE(review): presumably the index_builder modules import generated protobuf
# code from this directory — confirm.
sys.path.insert(0, './compiled_protobufs')

# Project-local index builders used by the cells below.
from index_builder.pyserini_index_builder import PyseriniIndexBuilder
# NOTE(review): AbstractIndexBuilder is not referenced in the cells visible
# here — confirm whether this import is still needed.
from index_builder.abstract_index_builder import AbstractIndexBuilder
from index_builder.marqo_index_builder import MarqoIndexBuilder


In [2]:
# Datasets to index. Each one's taskmaps are read from
# <cwd>/bin/<dataset>/taskmap.
dataset_names = ["seriouseats", "wikihow"]
dataset_paths = [
    os.path.join(os.getcwd(), "bin", name, "taskmap")
    for name in dataset_names
]

# Directories for the intermediate JSON documents (standard and dense variants).
output_temp_dir = os.path.join(os.getcwd(), "temp", "system_index")
output_temp_dir_dense = os.path.join(os.getcwd(), "temp", "system_index_dense")

# Directories the finished indexes are written to (standard and dense variants).
output_index_dir = os.path.join(os.getcwd(), "indexes", "system_index")
output_index_dir_dense = os.path.join(os.getcwd(), "indexes", "system_index_dense")

In [3]:
# A single builder instance, reused by the index-building cell further down.
PyseriniBuilder = PyseriniIndexBuilder()

# For every dataset, write its JSON documents twice: once into the standard
# temp directory and once into the dense temp directory.
for dataset_name, taskmap_dir in zip(dataset_names, dataset_paths):
    shared_kwargs = dict(input_dir=taskmap_dir, dataset_name=dataset_name)

    PyseriniBuilder.build_json_docs(output_dir=output_temp_dir, **shared_kwargs)
    PyseriniBuilder.build_json_docs_dense(output_dir=output_temp_dir_dense, **shared_kwargs)

In [4]:
# Build the standard index from the JSON documents in output_temp_dir.
PyseriniBuilder.build_index(
    input_dir=output_temp_dir,
    output_dir=output_index_dir,
)

# Build the dense index from the JSON documents in output_temp_dir_dense.
PyseriniBuilder.build_index_dense(
    input_dir=output_temp_dir_dense,
    output_dir=output_index_dir_dense,
)

pyserini.index is deprecated, please use pyserini.index.lucene.
2022-11-16 21:42:10,270 INFO  [main] index.IndexCollection (IndexCollection.java:643) - Setting log level to INFO
2022-11-16 21:42:10,277 INFO  [main] index.IndexCollection (IndexCollection.java:646) - Starting indexer...
2022-11-16 21:42:10,282 INFO  [main] index.IndexCollection (IndexCollection.java:648) - DocumentCollection path: /home/philip/task-search-quality/research/temp/system_index
2022-11-16 21:42:10,282 INFO  [main] index.IndexCollection (IndexCollection.java:649) - CollectionClass: JsonCollection
2022-11-16 21:42:10,283 INFO  [main] index.IndexCollection (IndexCollection.java:650) - Generator: DefaultLuceneDocumentGenerator
2022-11-16 21:42:10,284 INFO  [main] index.IndexCollection (IndexCollection.java:651) - Threads: 8
2022-11-16 21:42:10,284 INFO  [main] index.IndexCollection (IndexCollection.java:652) - Language: en
2022-11-16 21:42:10,285 INFO  [main] index.IndexCollection (IndexCollection.java:653) - Ste

65it [00:00, 128781.18it/s]
134it [00:00, 152810.42it/s]
100%|██████████| 13/13 [00:07<00:00,  1.70it/s]


In [4]:
# Build the Marqo index: generate JSON documents per dataset, then index them.
# NOTE(review): this reads/writes the same output_temp_dir and output_index_dir
# that the Pyserini cells use — confirm the two builders do not clobber each
# other's files.
MarqoBuilder = MarqoIndexBuilder()

for dataset_name, taskmap_dir in zip(dataset_names, dataset_paths):
    MarqoBuilder.build_json_docs(
        input_dir=taskmap_dir,
        output_dir=output_temp_dir,
        dataset_name=dataset_name,
    )

MarqoBuilder.build_index(input_dir=output_temp_dir, output_dir=output_index_dir)

In [None]:
# Try a sample utterance against the Marqo index. The call is the cell's last
# expression, so the notebook displays whatever query_index returns.
marqo_query = "I want pasta."
MarqoBuilder.query_index(marqo_query)

In [5]:
from pyserini.search.lucene import LuceneSearcher    
import json

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Open the Lucene index stored in output_index_dir and run a sample query.
searcher = LuceneSearcher(index_dir=output_index_dir)

last_utterance = "I want pasta."
top_k = 5

hits = searcher.search(q=last_utterance, k=top_k)

# Collect the raw stored document string for every hit, in the order returned.
docs = []
for hit in hits:
    doc = searcher.doc(docid=hit.docid)
    docs.append(doc.raw())

# Parse every raw document and keep each embedded taskmap. Previously the loop
# overwrote `taskmap_json` on each iteration, so only the last one survived.
# NOTE(review): assumes every stored document has a 'recipe_document_json'
# key; a missing key still raises KeyError, as before.
taskmap_jsons = []
for doc_string in docs:
    doc_json = json.loads(doc_string)
    taskmap_json = doc_json['recipe_document_json']
    taskmap_jsons.append(taskmap_json)

# Show the top-ranked raw document. An empty result list is reported instead
# of crashing with IndexError on docs[0].
if docs:
    print(docs[0])
else:
    print(f"No documents found for query: {last_utterance!r}")

{
  "id" : "cooking+seriouseats+398b23d824dea22cdfa4dea584788ac2",
  "contents" : "The Best Pesto alla Genovese (Classic Basil Pesto Sauce) Recipe. 2 medium cloves garlic 2 tablespoons (30g) pine nuts 3 ounces basil leaves (from about a 4-ounce bunch), washed with water still clinging to the leaves Coarse sea salt, as needed 3/4 ounce (about 2 tablespoons) grated Parmigiano Reggiano 3/4 ounce (about 2 tablespoons) Pecorino Fiore Sardo (see note) 3/4 cup (175ml) mildly flavored extra-virgin olive oil Marble mortar and wooden pestle [Italian, Pesto, Vegetarian Mains, Basil, Nuts & Seeds] After many rounds of testing, we found this method and ratio of ingredients produces the absolute best classic pesto sauce.Why It Works  Using a marble mortar with a wooden pestle creates a luxurious sauce with a rich, deep flavor and a beautiful, silky texture that's superior to what a food processor can do.Pecorino Fiore Sardo is a slightly milder sheep's-milk cheese than Pecorino Romano, and it create