In [1]:
import os
import sys

sys.path.insert(0, './compiled_protobufs')

from index_builder.pyserini_index_builder import PyseriniIndexBuilder
from index_builder.abstract_index_builder import AbstractIndexBuilder
from index_builder.marqo_index_builder import MarqoIndexBuilder

ModuleNotFoundError: No module named 'marqo'

In [None]:
# All locations are resolved relative to the directory the notebook runs from.
working_dir = os.getcwd()

# One taskmap input directory per dataset: <cwd>/bin/<dataset>/taskmap.
dataset_names = ["seriouseats", "wikihow"]
dataset_paths = [
    os.path.join(working_dir, "bin", name, "taskmap") for name in dataset_names
]

# Temp directories receive the output of build_json_docs / build_json_docs_dense;
# index directories receive the output of build_index / build_index_dense.
output_temp_dir = os.path.join(working_dir, "temp", "system_index")
output_temp_dir_dense = os.path.join(working_dir, "temp", "system_index_dense")
output_index_dir = os.path.join(working_dir, "indexes", "system_index")
output_index_dir_dense = os.path.join(working_dir, "indexes", "system_index_dense")

## Marqo Index Builder

In [3]:
# Run build_json_docs once per dataset (every dataset writes into the same
# temp directory), then build the Marqo index from that directory.
MarqoBuilder = MarqoIndexBuilder()

for dataset_name, taskmap_dir in zip(dataset_names, dataset_paths):
    MarqoBuilder.build_json_docs(
        input_dir=taskmap_dir,
        output_dir=output_temp_dir,
        dataset_name=dataset_name,
    )

MarqoBuilder.build_index(input_dir=output_temp_dir, output_dir=output_index_dir)

taskmap_id: "cooking+seriouseats+c33dacc39c50efcf14c696f33d374888"
title: "Simple Vinaigrette Recipe"
source_url: "https://www.seriouseats.com/simple-vinaigrette"
description: "Note: Why bother emulsifying your vinaigrette? Read about the science of emulsions.    This recipes make a cup of vinaigrette. Extra vinaigrette can be stored in a sealed container in the refrigerator indefinitely.    How to Make Vinaigrette (and Dress Your Salad Right)"
thumbnail_url: "https://www.seriouseats.com/thmb/fuROSgGh_pAfw9KKjHU-z6m0C3c=/880x0/filters:no_upscale():max_bytes(150000):strip_icc()/__opt__aboutcom__coeus__resources__content_migration__serious_eats__seriouseats.com__images__2014__10__20140930-how-to-dress-a-salad-vinaigrette-17-d1bde50a64a34b4681921de59226822b.jpg"
tags: "[Salad Dressing]"
requirement_list {
  unique_id: "197d7db0-d30a-46a1-8fda-fb102a6fe340"
  type: HARDWARE
  name: "3 tablespoons white wine vinegar or lemon juice"
}
requirement_list {
  unique_id: "69726cb8-7261-479b-a77d-

##### Query index example

In [4]:
import pprint

# Send a free-text query to the Marqo index and pretty-print the response.
results = MarqoBuilder.query_index("I want pasta")
pprint.pprint(results)

{'hits': [{'Date': 'March 29, 2019',
           'Description': 'A sure fire Mexican themed main dish that is both '
                          'easy to make and budget friendly. A real tried and '
                          'true recipe.',
           'Difficulty': '',
           'Domain': 'wikihow',
           'Requirements': '',
           'Steps': 'Wash and dry the green onions. Chop them including some '
                    'of the green tops. Shred the cheese.  Fry tortillas in '
                    'oil and dip in heated La Victoria Enchilada sauce.  Spray '
                    'a small baking dish with nonstick cooking spray.  Preheat '
                    'oven to 350 °F (177 °C).  Add a corn tortilla to the '
                    'dish.  Top with about 2 tablespoons (30 ml) shredded '
                    'Cheddar cheese.  Top with 2 tablespoons (29.6 ml) chopped '
                    'green onion.  Spoon a bit of tortilla sauce over the '
                    'cheese and green onio

##### Filter usage example

In [10]:
import pprint

# Same kind of query, with a filter string passed as the second argument
# (here restricting the 'Domain' field to wikihow).
results = MarqoBuilder.query_index_filter('I want pasta.', 'Domain:(wikihow)')
pprint.pprint(results)

{'hits': [{'Date': 'February 11, 2017',
           'Description': 'Have you ever tried poppy seed muffins before? How '
                          'about poppy seed pancakes? How about lemon poppy '
                          'seed pancakes? Poppy seeds have a beautiful earthy '
                          'taste, and give recipes a nice crunch. Lemon is '
                          'sour and sweet, therefore the perfect combination '
                          'for pancakes. This article will teach you exactly '
                          'how to make these special kind of pancakes.',
           'Difficulty': '',
           'Domain': 'wikihow',
           'Requirements': '',
           'Steps': 'In a bowl, mix the sugar and lemon zest between your '
                    'fingers. Pour in all the dry ingredients in another bowl. '
                    'Whisk until all the ingredients are combined well. Whisk '
                    'up all the wet ingredients in a separate bowl. Pour the '
      

### Relevance judgements

In [4]:
import pandas as pd
import jsonlines

In [5]:
# Evaluation queries. The position in this list becomes the numeric part of
# the "query-<n>" id used when the run file is written.
queries = [
    "I want pizza pepperoni.",
    "I would like to make spaghetti bolognese.",
    "I want to prepare smoked salmon.",
]

In [13]:
## Get rank and score from the marqo index and save these in a run file

# One entry per (query, hit). Ranks are 1-based and follow the order in which
# the index returned the hits; query ids are "query-<position in queries>".
run = []
for queryid, query in enumerate(queries):
    results = MarqoBuilder.query_index(query)
    for rank, doc in enumerate(results["hits"], start=1):
        run.append({
            "query_id": f'query-{queryid}',
            "doc_id": doc["_id"],
            "score": doc["_score"],
            "rank": rank,
        })

# TREC run format: "<query_id> Q0 <doc_id> <rank> <score> <run_tag>", one entry per line.
lines = [
    f'{entry["query_id"]} Q0 {entry["doc_id"]} {entry["rank"]} {entry["score"]} t5-maxp'
    for entry in run
]

# Joining with "\n" leaves no trailing newline after the last entry and also
# works when no query returned any hit (an empty file is written instead of
# failing on lines[-1]).
with open("qrels/run.run", "w") as f:
    f.write("\n".join(lines))

In [25]:
# qrel_reader = jsonlines.Reader(open("qrels/qrels.jsonl", "r"))
# qrels = pd.DataFrame([line for line in qrel_reader])
# qrels.head()

#### IR MEASURES
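- nDCG - normalized Discounted Cumulative Gain - rewards rankings that place highly relevant documents near the top: each document's graded relevance is discounted logarithmically by its rank position, so relevant documents appearing lower in the result list contribute less, and the total is normalized by the score of the ideal ordering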
- precision - fraction of the documents retrieved that are relevant to the user's information need
- recall - fraction of the documents that are relevant to the query that are successfully retrieved


In [14]:
import ir_measures
# Import only the measures used below instead of `from ir_measures import *`,
# so it is clear where each name comes from and nothing else is shadowed.
from ir_measures import nDCG, Precision, Recall

# NOTE(review): the '.qrles' extension looks like a typo of '.qrels'; the path
# is kept as-is because it must match the file that exists on disk — confirm.
qrels = ir_measures.read_trec_qrels('qrels/qrels.qrles')
# Rebinds `run` from the list built earlier in the notebook to the run read
# back from disk.
run = ir_measures.read_trec_run('qrels/run.run')

# Aggregate nDCG, precision and recall at cutoff 3 over all queries.
ir_measures.calc_aggregate([nDCG@3, Precision@3, Recall@3], qrels, run)

{R@3: 0.3333333333333333, P@3: 1.0, nDCG@3: 0.9596957971028499}

## Pyserini Index Builder

In [5]:
# For every dataset, write both document variants: build_json_docs into the
# regular temp directory and build_json_docs_dense into the dense one.
PyseriniBuilder = PyseriniIndexBuilder()

for dataset_name, taskmap_dir in zip(dataset_names, dataset_paths):
    PyseriniBuilder.build_json_docs(
        input_dir=taskmap_dir,
        output_dir=output_temp_dir,
        dataset_name=dataset_name,
    )
    PyseriniBuilder.build_json_docs_dense(
        input_dir=taskmap_dir,
        output_dir=output_temp_dir_dense,
        dataset_name=dataset_name,
    )

In [6]:
# Generate index.
PyseriniBuilder.build_index(input_dir=output_temp_dir,
                                    output_dir=output_index_dir)
# Generate Dense index.
PyseriniBuilder.build_index_dense(input_dir=output_temp_dir_dense,
                                output_dir=output_index_dir_dense)

pyserini.index is deprecated, please use pyserini.index.lucene.
2022-11-27 13:23:46,124 INFO  [main] index.IndexCollection (IndexCollection.java:643) - Setting log level to INFO
2022-11-27 13:23:46,127 INFO  [main] index.IndexCollection (IndexCollection.java:646) - Starting indexer...
2022-11-27 13:23:46,128 INFO  [main] index.IndexCollection (IndexCollection.java:648) - DocumentCollection path: /home/philip/task-search-quality/temp/system_index
2022-11-27 13:23:46,128 INFO  [main] index.IndexCollection (IndexCollection.java:649) - CollectionClass: JsonCollection
2022-11-27 13:23:46,129 INFO  [main] index.IndexCollection (IndexCollection.java:650) - Generator: DefaultLuceneDocumentGenerator
2022-11-27 13:23:46,129 INFO  [main] index.IndexCollection (IndexCollection.java:651) - Threads: 8
2022-11-27 13:23:46,130 INFO  [main] index.IndexCollection (IndexCollection.java:652) - Language: en
2022-11-27 13:23:46,130 INFO  [main] index.IndexCollection (IndexCollection.java:653) - Stemmer: por

65it [00:00, 159806.42it/s]
134it [00:00, 196998.51it/s]
100%|██████████| 13/13 [00:11<00:00,  1.14it/s]


In [7]:
from pyserini.search.lucene import LuceneSearcher    
import json

In [17]:
# Query the Lucene index built in output_index_dir for the top_k best matches.
last_utterance = "I want pasta."
top_k = 5

searcher = LuceneSearcher(index_dir=output_index_dir)
hits = searcher.search(q=last_utterance, k=top_k)

# Raw stored string of every retrieved document, in the order the hits were returned.
docs = [searcher.doc(docid=hit.docid).raw() for hit in hits]

# Each raw string is JSON; the taskmap is stored under 'recipe_document_json'.
# Only the values from the last document remain bound after the loop.
for doc_string in docs:
    doc_json = json.loads(doc_string)
    taskmap_json = doc_json['recipe_document_json']


In [19]:
from pyserini.search.faiss import AnceQueryEncoder, FaissSearcher

# Searcher over the dense index; queries are encoded with the
# "castorini/ance-msmarco-passage" encoder. This rebinds `searcher`, which
# previously held the LuceneSearcher.
encoder = AnceQueryEncoder("castorini/ance-msmarco-passage")
searcher = FaissSearcher(index_dir=output_index_dir_dense, query_encoder=encoder)

In [16]:
hits = searcher.search("I want pasta.")

# Print rank, document id and score for at most the first ten hits.
# Iterating over the slice instead of indexing hits[0..9] avoids an
# IndexError when the search returns fewer than ten results.
for rank, hit in enumerate(hits[:10], start=1):
    print(f'{rank:2} {hit.docid} {hit.score}')

 1 cooking+seriouseats+398b23d824dea22cdfa4dea584788ac2 4.120800018310547
 2 cooking+seriouseats+e6aba43fd7655fb4f5ef44b7aab7adab 3.06469988822937
 3 cooking+seriouseats+09e3025eb791d48aa4eff0bd7bc283ed 2.4458999633789062
 4 cooking+seriouseats+a4c6b3c4cfc3106d7289795f76aa65fe 2.3598999977111816
 5 cooking+seriouseats+be38735f0bc3e16eaf61620d50928267 2.2829999923706055
 6 cooking+seriouseats+18ea3af098ac7f360159d38caa0f528c 2.2451000213623047
 7 cooking+seriouseats+6dafcfb63b8a0b72388d6d56f06db6f4 2.185499906539917
 8 cooking+seriouseats+546c3b01b1713fbb47d70c64dba6846e 2.174799919128418
 9 cooking+seriouseats+64eacb0d6b4f7491df14ccdf58068b2b 2.1695001125335693
10 cooking+seriouseats+28eb3f92ac77917dd5f3b8ccbdc966c5 2.153599977493286
