In [1]:
import os
import sys

sys.path.insert(0, './compiled_protobufs')

from index_builder.pyserini_index_builder import PyseriniIndexBuilder
from index_builder.abstract_index_builder import AbstractIndexBuilder
from index_builder.marqo_index_builder import MarqoIndexBuilder


In [2]:
dataset_names = ["seriouseats", "wikihow"]
dataset_paths = [os.path.join(os.getcwd(), "bin", dataset,"taskmap") for dataset in dataset_names]


output_temp_dir = os.path.join(os.getcwd(), "temp", "system_index")
output_temp_dir_dense = os.path.join(os.getcwd(), "temp", "system_index_dense")
output_index_dir = os.path.join(os.getcwd(), "indexes", "system_index")
output_index_dir_dense = os.path.join(os.getcwd(), "indexes", "system_index_dense")

## Marqo Index Builder

In [3]:
MarqoBuilder = MarqoIndexBuilder()
for taskmap_dir, dataset_name in zip(dataset_paths, dataset_names):
    MarqoBuilder.build_json_docs(input_dir=taskmap_dir,
                                    output_dir=output_temp_dir,
                                    dataset_name=dataset_name)

MarqoBuilder.build_index(input_dir=output_temp_dir,
                                    output_dir=output_index_dir)

{'_id': 'diy+wikihow-offline+5448d4e195f69b73fc30412f611480a2', 'Title': 'How to Stick a Pencil to the Ceiling', 'Description': 'How to Stick a Pencil to the Ceiling. Pencil School glue stick Arts and Entertainment Fun Activities Tricks Tricks with Objects Pencil Tricks Ever seen a pencil hanging from the ceiling of a classroom? Ever wondered how it got there? Maybe you want to impress your friends with this simple, but fun trick.Stab the flat end of the pencil (or end with eraser) into a school glue stick, about 1–2 centimeter (0.4–0.8 in) in (up to three quarters of an inch) Take it out and see if there is a "glob" of glue on the end. Move the glue onto the end of the pencil, make sure it is on tight and won\'t come off too easily. Holding the sharpened end with the glue downwards, (preferably just below the height of a desk) check for a teacher, the best time to do this is when a teacher has left the room. Fling the pencil up. Just a quick flick will do. Look for it moving on the ce

### Relevance judgements

In [4]:
import pandas as pd
import jsonlines

In [5]:
queries = [
    "I want pizza pepperoni.",
    "I would like to make spaghetti bolognese.",
    "I want to prepare smoked salmon."
]     

In [6]:
## Query index example

results = MarqoBuilder.query_index("I want pasta")
import pprint
pprint.pprint(results)

{'hits': [{'Description': 'How to Make Stacked Cheese Enchiladas. Food and '
                          'Entertaining Recipes World Cuisines Central and '
                          'South American Cuisine Enchiladas A sure fire '
                          'Mexican themed main dish that is both easy to make '
                          'and budget friendly. A real tried and true '
                          'recipe.Wash and dry the green onions. Chop them '
                          'including some of the green tops. Shred the '
                          'cheese.  Fry tortillas in oil and dip in heated La '
                          'Victoria Enchilada sauce.  Spray a small baking '
                          'dish with nonstick cooking spray.  Preheat oven to '
                          '350 °F (177 °C).  Add a corn tortilla to the dish.  '
                          'Top with about 2 tablespoons (30 ml) shredded '
                          'Cheddar cheese.  Top with 2 tablespoons (29.6 ml)

In [9]:
## Get rank and score from the marqo index and save these in a run file

run = []
for queryid, query in enumerate(queries):
    results = MarqoBuilder.query_index(query)
    for rank, doc in enumerate(results["hits"]):
        d = {}
        d["query_id"] = f'query-{queryid}'
        d["doc_id"] = doc["_id"]
        d["score"] = doc["_score"]
        d["rank"] = rank + 1
        run.append(d)

# jsonlines.Writer(open('qrels/run.jsonl', 'w')).write_all(run)

with open("qrels/run.run", "w") as f:
    lines = []
    for line in run:
        lines.append(f'{line["query_id"]} Q0 {line["doc_id"]} {line["rank"]} {line["score"]} t5-maxp\n')
    lines[-1] = lines[-1].replace("\n","")
    f.writelines(lines)

In [25]:
# qrel_reader = jsonlines.Reader(open("qrels/qrels.jsonl", "r"))
# qrels = pd.DataFrame([line for line in qrel_reader])
# qrels.head()

#### IR MEASURES
- nDCG - normalized Discounted Cumulative Gain (nDCG) - highly relevant documents appearing lower in a search result list should be penalized as the graded relevance value is reduced logarithmically
- precision - fraction of the documents retrieved that are relevant to the user's information need
- recall - fraction of the documents that are relevant to the query that are successfully retrieved


In [21]:
import ir_measures
from ir_measures import *

qrels = ir_measures.read_trec_qrels('qrels/qrels.qrles')
run = ir_measures.read_trec_run('qrels/run.run')

ir_measures.calc_aggregate([nDCG@3, Precision@3, Recall@3], qrels, run)

AssertionError: unsupported params found: ['cutoff']

## Pyserini Index Builder

In [10]:
PyseriniBuilder = PyseriniIndexBuilder()
for taskmap_dir, dataset_name in zip(dataset_paths, dataset_names):
    PyseriniBuilder.build_json_docs(input_dir=taskmap_dir,
                                    output_dir=output_temp_dir,
                                    dataset_name=dataset_name)

    PyseriniBuilder.build_json_docs_dense(input_dir=taskmap_dir,
                                        output_dir=output_temp_dir_dense,
                                        dataset_name=dataset_name) 

In [11]:
# Generate index.
PyseriniBuilder.build_index(input_dir=output_temp_dir,
                                    output_dir=output_index_dir)
# Generate Dense index.
PyseriniBuilder.build_index_dense(input_dir=output_temp_dir_dense,
                                output_dir=output_index_dir_dense)

pyserini.index is deprecated, please use pyserini.index.lucene.
2022-11-19 11:39:14,767 INFO  [main] index.IndexCollection (IndexCollection.java:643) - Setting log level to INFO
2022-11-19 11:39:14,770 INFO  [main] index.IndexCollection (IndexCollection.java:646) - Starting indexer...
2022-11-19 11:39:14,772 INFO  [main] index.IndexCollection (IndexCollection.java:648) - DocumentCollection path: /home/philip/task-search-quality/temp/system_index
2022-11-19 11:39:14,772 INFO  [main] index.IndexCollection (IndexCollection.java:649) - CollectionClass: JsonCollection
2022-11-19 11:39:14,773 INFO  [main] index.IndexCollection (IndexCollection.java:650) - Generator: DefaultLuceneDocumentGenerator
2022-11-19 11:39:14,773 INFO  [main] index.IndexCollection (IndexCollection.java:651) - Threads: 8
2022-11-19 11:39:14,774 INFO  [main] index.IndexCollection (IndexCollection.java:652) - Language: en
2022-11-19 11:39:14,774 INFO  [main] index.IndexCollection (IndexCollection.java:653) - Stemmer: por

65it [00:00, 128175.72it/s]
134it [00:00, 181068.54it/s]
100%|██████████| 13/13 [00:07<00:00,  1.73it/s]


In [12]:
from pyserini.search.lucene import LuceneSearcher    
import json

In [13]:
searcher = LuceneSearcher(index_dir=output_index_dir)

last_utterance = "I want pasta."
top_k = 5

hits = searcher.search(q=last_utterance, k=top_k)

docs = []
for hit in hits:
    doc = searcher.doc(docid=hit.docid)
    docs.append(doc.raw())

for doc_string in docs:
    doc_json = json.loads(doc_string)
    taskmap_json = doc_json['recipe_document_json']

print(docs[0])

{
  "id" : "cooking+seriouseats+398b23d824dea22cdfa4dea584788ac2",
  "contents" : "The Best Pesto alla Genovese (Classic Basil Pesto Sauce) Recipe. 2 medium cloves garlic 2 tablespoons (30g) pine nuts 3 ounces basil leaves (from about a 4-ounce bunch), washed with water still clinging to the leaves Coarse sea salt, as needed 3/4 ounce (about 2 tablespoons) grated Parmigiano Reggiano 3/4 ounce (about 2 tablespoons) Pecorino Fiore Sardo (see note) 3/4 cup (175ml) mildly flavored extra-virgin olive oil Marble mortar and wooden pestle [Italian, Pesto, Vegetarian Mains, Basil, Nuts & Seeds] After many rounds of testing, we found this method and ratio of ingredients produces the absolute best classic pesto sauce.Why It Works  Using a marble mortar with a wooden pestle creates a luxurious sauce with a rich, deep flavor and a beautiful, silky texture that's superior to what a food processor can do.Pecorino Fiore Sardo is a slightly milder sheep's-milk cheese than Pecorino Romano, and it create