In [3]:
import os
import sys

# Put ./compiled_protobufs (relative to the current working directory) at the
# front of the module search path so that modules stored there can be imported
# by later cells. Presumably these are the generated protobuf bindings used by
# the index builders -- confirm.
sys.path.insert(0, './compiled_protobufs')

In [1]:
# Names of the datasets to index; each one is expected to have its taskmaps
# under <cwd>/bin/<dataset name>/taskmap.
dataset_names = ["recipe1mln"]
dataset_paths = [
    os.path.join(os.getcwd(), "bin", name, "taskmap")
    for name in dataset_names
]

## Loading queries

In [None]:
## Load queries
import pandas as pd 
from sklearn.model_selection import train_test_split
    
def _split_train_test_validation(queries_df, seed=2022):
    """Split a query table into train / test / validation parts (80/10/10).

    The table is first split 80/20, then the 20% remainder is halved into the
    test and validation parts. The same seed is used for both steps.

    Args:
        queries_df: table of queries (anything `train_test_split` accepts).
        seed: value passed as `random_state` to both splits.

    Returns:
        A `(train, test, validation)` tuple.
    """
    train, remainder = train_test_split(queries_df, test_size=0.2, random_state=seed)
    test, validation = train_test_split(remainder, test_size=0.5, random_state=seed)
    return train, test, validation


# import queries
cooking_queries = pd.read_csv('dataset/queries/cooking_queries.csv')
diy_queries = pd.read_csv('dataset/queries/diy_queries.csv')
print("Cooking and DIY queries loaded.")

# split train/test/validation - 80/10/10
q_cooking_train, q_cooking_test, q_cooking_validation = _split_train_test_validation(cooking_queries)
print(f"Cooking queries: training set size {len(q_cooking_train)}, test set size {len(q_cooking_test)}, validation set size {len(q_cooking_validation)}")

q_diy_train, q_diy_test, q_diy_validation = _split_train_test_validation(diy_queries)
print(f"DIY queries: training set size {len(q_diy_train)}, test set size {len(q_diy_test)}, validation set size {len(q_diy_validation)}")


## Marqo Index Builder

In [None]:
from index_builder.marqo_index_builder import MarqoIndexBuilder

In [None]:
# Directories for the intermediate JSON documents and for the index output.
# They are defined here, under Marqo-specific names, so this cell does not rely
# on `output_temp_dir` / `output_index_dir` that are only assigned further down
# in the BM25 section (which made the cell fail on a fresh kernel).
# NOTE(review): the directory names mirror the "system_index_ance" convention
# used for the ANCE index -- adjust if a different location is expected.
output_temp_dir_marqo = os.path.join(os.getcwd(), "temp", "diy", "system_index_marqo")
output_index_dir_marqo = os.path.join(os.getcwd(), "indexes", "diy", "system_index_marqo")

MarqoBuilder = MarqoIndexBuilder()

# Convert every dataset's taskmaps into JSON documents in the temp directory.
for taskmap_dir, dataset_name in zip(dataset_paths, dataset_names):
    MarqoBuilder.build_json_docs(
        input_dir=taskmap_dir,
        output_dir=output_temp_dir_marqo,
        dataset_name=dataset_name,
    )

# Build the index from all JSON documents produced above.
MarqoBuilder.build_index(
    input_dir=output_temp_dir_marqo,
    output_dir=output_index_dir_marqo,
)

##### Query index example

In [None]:
import pprint

# Run a free-text query against the Marqo index and pretty-print the response.
results = MarqoBuilder.query_index("I want pasta")
pprint.pprint(results)

##### Filter usage example

In [None]:
import pprint

# Same kind of query, with the filter string passed as the second argument.
results = MarqoBuilder.query_index_filter(
    'I want pasta.',
    'Domain:(wikihow)',
)
pprint.pprint(results)

### Relevance judgements

In [None]:
import pandas as pd
import jsonlines

In [None]:
# Example cooking requests; their position in this list is used as the query
# number when the run file is generated.
queries = [
    "I want pizza pepperoni.",
    "I would like to make spaghetti bolognese.",
    "I want to prepare smoked salmon.",
]

In [None]:
## Get rank and score from the marqo index and save these in a run file

# One record per (query, hit) pair. Ranks are 1-based and follow the order in
# which the index returned the hits.
run = []
for queryid, query in enumerate(queries):
    results = MarqoBuilder.query_index(query)
    for rank, doc in enumerate(results["hits"], start=1):
        run.append({
            "query_id": f'query-{queryid}',
            "doc_id": doc["_id"],
            "score": doc["_score"],
            "rank": rank,
        })

# jsonlines.Writer(open('qrels/run.jsonl', 'w')).write_all(run)

# Write one line per record: "<query_id> Q0 <doc_id> <rank> <score> <tag>".
# Joining with "\n" leaves no trailing newline after the last line (as before)
# and, unlike indexing `lines[-1]`, also works when `run` is empty.
# NOTE(review): the run tag is "t5-maxp" although the hits come from the Marqo
# index -- confirm this is the intended tag.
with open("qrels/run.run", "w") as f:
    f.write("\n".join(
        f'{entry["query_id"]} Q0 {entry["doc_id"]} {entry["rank"]} {entry["score"]} t5-maxp'
        for entry in run
    ))

In [None]:
# qrel_reader = jsonlines.Reader(open("qrels/qrels.jsonl", "r"))
# qrels = pd.DataFrame([line for line in qrel_reader])
# qrels.head()

#### IR MEASURES
- nDCG – normalized Discounted Cumulative Gain: rewards rankings that place highly relevant documents near the top. Each document's graded relevance is discounted logarithmically by its rank, so a relevant document that appears lower in the result list contributes less; the sum is normalized by the score of the ideal ranking.
- Precision – the fraction of the retrieved documents that are relevant to the user's information need.
- Recall – the fraction of the documents relevant to the query that are successfully retrieved.


In [None]:
import ir_measures
# Import only the measures that are used instead of `from ir_measures import *`,
# so it is clear where each name comes from and nothing else is shadowed.
from ir_measures import nDCG, Precision, Recall

# NOTE(review): the extension ".qrles" looks like a typo for ".qrels" --
# confirm against the actual file name on disk before changing it.
qrels = ir_measures.read_trec_qrels('qrels/qrels.qrles')
# Reads back the run file written by the run-generation cell.
run = ir_measures.read_trec_run('qrels/run.run')

# Aggregate nDCG, precision and recall at cutoff 3 over all queries; as the last
# expression of the cell, the result is displayed.
ir_measures.calc_aggregate([nDCG@3, Precision@3, Recall@3], qrels, run)

## Pyserini Index Builder

### BM25

In [4]:
from index_builder.pyserini_bm25_builder import PyseriniBM25Builder
# Locations for the sparse (BM25) pipeline: intermediate JSON documents go to
# <cwd>/temp/..., the index itself to <cwd>/indexes/....
_cwd = os.getcwd()
output_temp_dir = os.path.join(_cwd, "temp", "diy", "system_index_sparse")
output_index_dir = os.path.join(_cwd, "indexes", "diy", "system_index_sparse")

In [5]:
# The name `PyseriniBM25Builder` is rebound from the class to an instance here,
# and the index-generation cell calls `PyseriniBM25Builder.build_index(...)` on
# it. Instantiating only while the name still refers to the class keeps this
# cell re-runnable: previously a second run tried to call the instance itself.
if isinstance(PyseriniBM25Builder, type):
    PyseriniBM25Builder = PyseriniBM25Builder()
bm25_builder = PyseriniBM25Builder

# Convert every dataset's taskmaps into JSON documents in the temp directory.
for taskmap_dir, dataset_name in zip(dataset_paths, dataset_names):
    bm25_builder.build_json_docs(
        input_dir=taskmap_dir,
        output_dir=output_temp_dir,
        dataset_name=dataset_name,
    )

In [6]:
# Build the BM25 index from the JSON documents written to the temp directory.
PyseriniBM25Builder.build_index(
    input_dir=output_temp_dir,
    output_dir=output_index_dir,
)

pyserini.index is deprecated, please use pyserini.index.lucene.
2022-12-06 22:43:16,043 INFO  [main] index.IndexCollection (IndexCollection.java:643) - Setting log level to INFO
2022-12-06 22:43:16,046 INFO  [main] index.IndexCollection (IndexCollection.java:646) - Starting indexer...
2022-12-06 22:43:16,047 INFO  [main] index.IndexCollection (IndexCollection.java:648) - DocumentCollection path: /home/philip/task-search-quality/temp/system_index_sparse
2022-12-06 22:43:16,048 INFO  [main] index.IndexCollection (IndexCollection.java:649) - CollectionClass: JsonCollection
2022-12-06 22:43:16,048 INFO  [main] index.IndexCollection (IndexCollection.java:650) - Generator: DefaultLuceneDocumentGenerator
2022-12-06 22:43:16,049 INFO  [main] index.IndexCollection (IndexCollection.java:651) - Threads: 8
2022-12-06 22:43:16,050 INFO  [main] index.IndexCollection (IndexCollection.java:652) - Language: en
2022-12-06 22:43:16,050 INFO  [main] index.IndexCollection (IndexCollection.java:653) - Stemm

In [7]:
from pyserini.search.lucene import LuceneSearcher    
import json

# Open the index built above.
searcher = LuceneSearcher(index_dir=output_index_dir)

last_utterance = "pasta"
top_k = 5

hits = searcher.search(q=last_utterance, k=top_k)

# Collect the raw stored document, id and score of every hit. `docs_id` and
# `docs_score` used to be created but never filled; they now stay aligned with
# `docs`, one entry per hit.
docs = []
docs_id = []
docs_score = []
for hit in hits:
    doc_id, doc_score = hit.docid, hit.score
    doc = searcher.doc(docid=hit.docid)
    docs.append(doc.raw())
    docs_id.append(doc_id)
    docs_score.append(doc_score)
    print(doc_id, doc_score)

# Parse the top hit (if any) and pull out its 'recipe_document_json' field.
for doc_string in docs[:1]:
    doc_json = json.loads(doc_string)
    taskmap_json = doc_json['recipe_document_json']
    # print(taskmap_json)


2022-12-06 22:43:42.684134: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


cooking+recipe1m+a79f84d3a1c9e6f79fbb85a6caa69229 4.268499851226807
cooking+recipe1m+07e4c1ff548fdc39d1f5a0810d18624d 4.241499900817871
cooking+recipe1m+daa02e9a1f1031f769e827579ebcb800 4.151500225067139
cooking+recipe1m+92b20bd9a21fe484a9c05a5fd30ddff1 4.116000175476074
cooking+recipe1m+315dea41870e387fc3c9467207daf1e9 4.078400135040283


### Ance Encoding

In [None]:
from index_builder.pyserini_ance_builder import PyseriniAnceBuilder

# Locations for the ANCE pipeline: intermediate JSON documents under
# <cwd>/temp/..., the index itself under <cwd>/indexes/....
_ance_index_name = "system_index_ance"
output_temp_dir_ance, output_index_dir_ance = (
    os.path.join(os.getcwd(), top_level, "diy", _ance_index_name)
    for top_level in ("temp", "indexes")
)

In [None]:
PyseriniAnce = PyseriniAnceBuilder()

# Convert every dataset's taskmaps into JSON documents in the ANCE temp directory.
for dataset_name, taskmap_dir in zip(dataset_names, dataset_paths):
    PyseriniAnce.build_json_docs(
        input_dir=taskmap_dir,
        output_dir=output_temp_dir_ance,
        dataset_name=dataset_name,
    )

In [None]:
# Build the ANCE index from the JSON documents written to the ANCE temp directory.
PyseriniAnce.build_index(
    input_dir=output_temp_dir_ance,
    output_dir=output_index_dir_ance,
)

In [None]:
from pyserini.search.faiss import FaissSearcher, AnceQueryEncoder
# Query encoder loaded from the "castorini/ance-msmarco-passage" checkpoint name.
encoder = AnceQueryEncoder("castorini/ance-msmarco-passage")

# Dense searcher over the ANCE index; note that this rebinds `searcher`.
searcher = FaissSearcher(index_dir=output_index_dir_ance, query_encoder=encoder)

In [None]:
hits = searcher.search("pasta")

# Print up to ten hits as "<rank> <docid> <score>". Slicing instead of indexing
# a fixed range avoids an IndexError when fewer than ten hits are returned.
for position, hit in enumerate(hits[:10], start=1):
    print(f'{position:2} {hit.docid} {hit.score}' )

### BM25 + MonoT5

In [2]:
# Put ./pygaggle at the front of the module search path. This must run before
# the imports below so that they can resolve `pygaggle` from that directory
# (presumably a local checkout of the library -- confirm).
sys.path.insert(0, './pygaggle')
from pygaggle.rerank.base import Query, Text, hits_to_texts
from pygaggle.rerank.transformer import MonoT5

# Reranker constructed with its default arguments.
reranker =  MonoT5()

In [5]:
from pyserini.search.lucene import LuceneSearcher
# NOTE: this rebinds `output_temp_dir` / `output_index_dir` to paths without the
# "diy" component, i.e. a different location than the one used in the BM25
# section of this notebook.
output_temp_dir, output_index_dir = (
    os.path.join(os.getcwd(), top_level, "system_index_sparse")
    for top_level in ("temp", "indexes")
)

# First-stage retrieval: search the sparse index with the query text and convert
# the hits into the text objects that the reranker consumes.
query = Query('pasta')
searcher = LuceneSearcher(index_dir=output_index_dir)
hits = searcher.search(query.text)
texts = hits_to_texts(hits)

In [9]:
# Print up to ten first-stage results as "<rank> <docid> <score> <text>".
# Slicing instead of indexing a fixed range avoids an IndexError when the
# search returned fewer than ten results.
for position, candidate in enumerate(texts[:10], start=1):
    print(f'{position:2} {candidate.metadata["docid"]:15} {candidate.score:.5f} {candidate.text}')


 1 cooking+seriouseats+18ea3af098ac7f360159d38caa0f528c 2.24510 {
  "id" : "cooking+seriouseats+18ea3af098ac7f360159d38caa0f528c",
  "contents" : "Parisian Gnocchi Soufflé Recipe. 2 tablepsoons unsalted butter 1 recipe Parisian Gnocchi 1/2 cup grated Parmesan cheese 1 tablespoon chopped parsley 1 tablespoon chopped fresh chives [Pasta Mains, Vegetarian Mains, Baked Pasta, Gnocchis] How to Make Parisian Gnocchi | The Food Lab              Featured VideoAdjust rack to 6 to 8 inches below broiler element and preheat broiler to high. Heat butter in a 12-inch cast iron skillet or two 10-inch cast iron skillets over medium-high heat until foaming subsides and butter starts to brown. Add gnocchi in a single layer and toss to coat with browned butter. Cover with cheese, then transfer to broiler. Cook until puffed and golden brown, about 10 minutes. Sprinkle with herbs and serve. ",
  "recipe_document_json" : {
    "taskmapId" : "cooking+seriouseats+18ea3af098ac7f360159d38caa0f528c",
    "title

In [11]:
# Rescore the first-stage results with the reranker.
reranked = reranker.rerank(query, texts)

# Print up to ten reranked results as "<rank> <docid> <score> <text>", in the
# order returned by `rerank`. Slicing instead of indexing a fixed range avoids
# an IndexError when fewer than ten results are available.
for position, candidate in enumerate(reranked[:10], start=1):
    print(f'{position:2} {candidate.metadata["docid"]:15} {candidate.score:.5f} {candidate.text}')

 1 cooking+seriouseats+f43cc8996bac792085c65e0700299538 -3.44174 {
  "id" : "cooking+seriouseats+f43cc8996bac792085c65e0700299538",
  "contents" : "Parisian Gnocchi Recipe. 1 cup (8 ounces) water 8 tablespoons (1 stick, 4 ounces) unsalted butter 3/4 teaspoon (about 0.15 ounces) kosher salt 1 1/4 cups (6.25 ounces) all-purpose flour 1 tablespoon Dijon mustard 1/2 cup (about 1 ounce) freshly grated Parmesan cheese 3 large eggs 2 tablespoons chopped fresh parsley leaves 2 tablespoons finely sliced chives Olive oil Stand mixer [Gnocchis, Pasta Mains] How to Make Parisian Gnocchi | The Food Lab              Featured VideoBring water, butter, and salt to a boil in a medium saucepan over high heat. Add flour all at once and stir with a wooden spoon until a smooth dough forms. Reduce heat to medium-low and continue to stir, beating dough forcefully and rapidly to prevent it from sticking to the pot. Continue cooking until dough pulls away from sides of pot leaving a thin layer and steams sligh