In [2]:
import os
import sys

# Put ./compiled_protobufs at the front of the import path. The membership
# check keeps the cell idempotent: re-running it no longer stacks duplicate
# entries in sys.path.
if './compiled_protobufs' not in sys.path:
    sys.path.insert(0, './compiled_protobufs')

In [3]:
# Names of the datasets to process; one path per dataset is derived below as
# <cwd>/bin/<dataset>/taskmap.
dataset_names = ["recipe1mln"]
_bin_root = os.path.join(os.getcwd(), "bin")
dataset_paths = [os.path.join(_bin_root, name, "taskmap") for name in dataset_names]

## Loading queries

In [9]:
## Load queries
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the query CSV and keep only its first 10 rows.
cooking_queries = pd.read_csv('datasets/queries/cooking_queries.csv').head(10)
print("Cooking and DIY queries loaded.")

# Disabled: 80/10/10 train/test/validation split of the queries.
# q_cooking_train, q_rem = train_test_split(cooking_queries, test_size=0.2, random_state=2022)
# q_cooking_test, q_cooking_validation = train_test_split(q_rem, test_size=0.5, random_state=2022)
# print(f"Cooking queries: training set size {len(q_cooking_train)}, test set size {len(q_cooking_test)}, validation set size {len(q_cooking_validation)}")


Cooking and DIY queries loaded.


## Marqo Index Builder

In [None]:
from index_builder.marqo_index_builder import MarqoIndexBuilder

In [None]:
# Output locations for the Marqo build.
# The original cell used `output_temp_dir` / `output_index_dir`, which are only
# assigned later in the notebook (BM25 section), so a top-to-bottom run raised
# NameError here -- and with leftover kernel state the Marqo build would have
# written into the BM25 directories.
# NOTE(review): the "system_index_marqo" directory name mirrors the naming used
# for the other indexes in this notebook; adjust if another layout is expected.
output_temp_dir_marqo = os.path.join(os.getcwd(), "temp", "food", "system_index_marqo")
output_index_dir_marqo = os.path.join(os.getcwd(), "indexes", "food", "system_index_marqo")

MarqoBuilder = MarqoIndexBuilder()

# Convert every dataset's taskmaps into JSON docs under the temp directory.
for taskmap_dir, dataset_name in zip(dataset_paths, dataset_names):
    MarqoBuilder.build_json_docs(input_dir=taskmap_dir,
                                 output_dir=output_temp_dir_marqo,
                                 dataset_name=dataset_name)

# Build the index from the JSON docs produced above.
MarqoBuilder.build_index(input_dir=output_temp_dir_marqo,
                         output_dir=output_index_dir_marqo)

##### Query index example

In [None]:
import pprint

# Run a free-text query through the builder and pretty-print what it returns.
results = MarqoBuilder.query_index("I want pasta")
pprint.pprint(results)

##### Filter usage example

In [None]:
import pprint

# Same kind of query, with a filter expression passed as the second argument.
results = MarqoBuilder.query_index_filter(
    'I want pasta.',
    'Domain:(wikihow)',
)
pprint.pprint(results)

### Relevance judgements

In [None]:
import pandas as pd
import jsonlines

In [None]:
# Example queries; their position in this list becomes the numeric part of the
# "query-<n>" id when the run file is written.
queries = [
    "I want pizza pepperoni.",
    "I would like to make spaghetti bolognese.",
    "I want to prepare smoked salmon.",
]

In [None]:
## Get rank and score from the marqo index and save these in a run file

# Collect one record per (query, hit). Ranks are 1-based and follow the order
# in which the hits are returned.
run = []
for queryid, query in enumerate(queries):
    results = MarqoBuilder.query_index(query)
    for rank, doc in enumerate(results["hits"], start=1):
        run.append({
            "query_id": f'query-{queryid}',
            "doc_id": doc["_id"],
            "score": doc["_score"],
            "rank": rank,
        })

# jsonlines.Writer(open('qrels/run.jsonl', 'w')).write_all(run)

# Write the records as "<query_id> Q0 <doc_id> <rank> <score> <tag>" lines.
# Joining with "\n" yields the same file as before (no trailing newline) but
# no longer raises IndexError when `run` is empty.
# NOTE(review): the run tag "t5-maxp" is kept as-is even though the scores come
# from the Marqo index -- confirm the intended tag.
with open("qrels/run.run", "w") as f:
    f.write("\n".join(
        f'{line["query_id"]} Q0 {line["doc_id"]} {line["rank"]} {line["score"]} t5-maxp'
        for line in run
    ))

In [None]:
# qrel_reader = jsonlines.Reader(open("qrels/qrels.jsonl", "r"))
# qrels = pd.DataFrame([line for line in qrel_reader])
# qrels.head()

#### IR MEASURES
- nDCG - normalized Discounted Cumulative Gain - rewards rankings that place highly relevant documents near the top: each document's graded relevance is discounted logarithmically by its rank, so highly relevant documents appearing lower in the result list are penalized
- precision - fraction of the documents retrieved that are relevant to the user's information need
- recall - fraction of the documents that are relevant to the query that are successfully retrieved


In [None]:
import ir_measures
# Import only the measures used below instead of `from ir_measures import *`,
# so it is clear where each name comes from and the namespace stays clean.
from ir_measures import nDCG, Precision, Recall

# NOTE(review): the extension "qrles" looks like a typo for "qrels" -- confirm
# against the file on disk before renaming; the path is left unchanged here.
qrels = ir_measures.read_trec_qrels('qrels/qrels.qrles')
# NOTE: this rebinds `run` (previously the list of dicts written to run.run)
# to whatever read_trec_run returns.
run = ir_measures.read_trec_run('qrels/run.run')

# Aggregate scores at cutoff 3; as the last expression, the result is displayed.
ir_measures.calc_aggregate([nDCG@3, Precision@3, Recall@3], qrels, run)

## Pyserini Index Builder

### BM25

#### Recipes

In [4]:
from index_builder.pyserini_bm25_builder import PyseriniBM25Builder
# Directories for the sparse (BM25) index: `output_temp_dir` is passed as the
# output of build_json_docs / input of build_index, `output_index_dir` as the
# output of build_index.
_sparse_subdir = ("food", "system_index_sparse")
output_temp_dir = os.path.join(os.getcwd(), "temp", *_sparse_subdir)
output_index_dir = os.path.join(os.getcwd(), "indexes", *_sparse_subdir)

In [5]:
# The original statement `PyseriniBM25Builder = PyseriniBM25Builder()` replaced
# the class with an instance, so running this cell a second time failed because
# the instance is not callable. Importing the class under an alias keeps the
# instance available under the name `PyseriniBM25Builder` (the build_index cell
# relies on it) while making this cell safe to re-run.
from index_builder.pyserini_bm25_builder import PyseriniBM25Builder as _PyseriniBM25BuilderClass

PyseriniBM25Builder = _PyseriniBM25BuilderClass()

# Convert every dataset's taskmaps into JSON docs under the temp directory.
for taskmap_dir, dataset_name in zip(dataset_paths, dataset_names):
    PyseriniBM25Builder.build_json_docs(input_dir=taskmap_dir,
                                        output_dir=output_temp_dir,
                                        dataset_name=dataset_name)

: 

: 

In [None]:
# Build the BM25 index from the JSON docs written to the temp directory.
PyseriniBM25Builder.build_index(
    input_dir=output_temp_dir,
    output_dir=output_index_dir,
)

pyserini.index is deprecated, please use pyserini.index.lucene.
2022-12-13 16:35:07,756 INFO  [main] index.IndexCollection (IndexCollection.java:643) - Setting log level to INFO
2022-12-13 16:35:07,759 INFO  [main] index.IndexCollection (IndexCollection.java:646) - Starting indexer...
2022-12-13 16:35:07,759 INFO  [main] index.IndexCollection (IndexCollection.java:648) - DocumentCollection path: /home/philip/task-search-quality/temp/food/system_index_sparse
2022-12-13 16:35:07,760 INFO  [main] index.IndexCollection (IndexCollection.java:649) - CollectionClass: JsonCollection
2022-12-13 16:35:07,760 INFO  [main] index.IndexCollection (IndexCollection.java:650) - Generator: DefaultLuceneDocumentGenerator
2022-12-13 16:35:07,761 INFO  [main] index.IndexCollection (IndexCollection.java:651) - Threads: 8
2022-12-13 16:35:07,761 INFO  [main] index.IndexCollection (IndexCollection.java:652) - Language: en
2022-12-13 16:35:07,762 INFO  [main] index.IndexCollection (IndexCollection.java:653) - 

In [14]:
from pyserini.search.lucene import LuceneSearcher
import json

# Open the sparse index built above.
searcher = LuceneSearcher(index_dir=output_index_dir)

last_utterance = "pasta"
top_k = 5

hits = searcher.search(q=last_utterance, k=top_k)

# Parallel lists: raw document string, document id and score for each hit.
# The original cell created `docs_id` / `docs_score` but never filled them.
docs = []
docs_id = []
docs_score = []
for hit in hits:
    print(hit)
    doc_id, doc_score = hit.docid, hit.score
    doc = searcher.doc(docid=doc_id)
    docs.append(doc.raw())
    docs_id.append(doc_id)
    docs_score.append(doc_score)
    print(doc_id, doc_score)

# Decode the top document only and pull out its 'recipe_document_json' field.
for doc_string in docs[0:1]:
    doc_json = json.loads(doc_string)
    taskmap_json = doc_json['recipe_document_json']
    # print(taskmap_json)


<io.anserini.search.SimpleSearcher$Result at 0x7f04f2138d10 jclass=io/anserini/search/SimpleSearcher$Result jself=<LocalRef obj=0x1ff4a130 at 0x7f06311d9910>>
cooking+recipe1m+a79f84d3a1c9e6f79fbb85a6caa69229 4.2845001220703125
<io.anserini.search.SimpleSearcher$Result at 0x7f04fdd098b0 jclass=io/anserini/search/SimpleSearcher$Result jself=<LocalRef obj=0x1ff4a148 at 0x7f04fdd4a690>>
cooking+recipe1m+a79f84d3a1c9e6f79fbb85a6caa69229 4.284499168395996
<io.anserini.search.SimpleSearcher$Result at 0x7f04fdd09810 jclass=io/anserini/search/SimpleSearcher$Result jself=<LocalRef obj=0x1ff4a150 at 0x7f04fdd4aa70>>
cooking+recipe1m+07e4c1ff548fdc39d1f5a0810d18624d 4.256899833679199
<io.anserini.search.SimpleSearcher$Result at 0x7f04fdd09860 jclass=io/anserini/search/SimpleSearcher$Result jself=<LocalRef obj=0x1ff4a158 at 0x7f04fdd4aab0>>
cooking+recipe1m+07e4c1ff548fdc39d1f5a0810d18624d 4.256898880004883
<io.anserini.search.SimpleSearcher$Result at 0x7f04fdd09900 jclass=io/anserini/search/Simpl

### Ance Encoding

In [None]:
from index_builder.pyserini_ance_builder import PyseriniAnceBuilder

# Directories for the ANCE index: temp JSON docs and the final index.
_ance_subdir = ("food", "system_index_ance")
output_temp_dir_ance = os.path.join(os.getcwd(), "temp", *_ance_subdir)
output_index_dir_ance = os.path.join(os.getcwd(), "indexes", *_ance_subdir)

In [None]:
PyseriniAnce = PyseriniAnceBuilder()

# Convert every dataset's taskmaps into JSON docs under the ANCE temp directory.
for dataset_name, taskmap_dir in zip(dataset_names, dataset_paths):
    PyseriniAnce.build_json_docs(
        input_dir=taskmap_dir,
        output_dir=output_temp_dir_ance,
        dataset_name=dataset_name,
    )

In [None]:
# Build the ANCE index from the JSON docs written to the ANCE temp directory.
PyseriniAnce.build_index(
    input_dir=output_temp_dir_ance,
    output_dir=output_index_dir_ance,
)

In [None]:
from pyserini.search.faiss import FaissSearcher, AnceQueryEncoder

# Query encoder paired with the dense index; note that `searcher` is rebound
# here from the Lucene searcher to the Faiss one.
encoder = AnceQueryEncoder("castorini/ance-msmarco-passage")
searcher = FaissSearcher(index_dir=output_index_dir_ance, query_encoder=encoder)

In [None]:
hits = searcher.search("pasta")

# Print up to the first 10 hits as "<rank> <docid> <score>". Iterating over the
# slice instead of range(0, 10) avoids an IndexError when fewer than 10 hits
# come back.
for position, hit in enumerate(hits[:10], start=1):
    print(f'{position:2} {hit.docid} {hit.score}' )

### BM25 + MonoT5

In [1]:
# This cell was executed first in its kernel session (In [1]) yet used `sys`
# without importing it; importing here makes the section runnable on its own.
import sys

# Put the local ./pygaggle checkout on the import path; the membership check
# keeps re-runs from adding duplicate entries.
if './pygaggle' not in sys.path:
    sys.path.insert(0, './pygaggle')
from pygaggle.rerank.base import Query, Text, hits_to_texts
from pygaggle.rerank.transformer import MonoT5

# MonoT5 reranker constructed with its default arguments.
reranker = MonoT5()

2022-12-14 22:32:27 [INFO] loader: Loading faiss with AVX2 support.
2022-12-14 22:32:27 [INFO] loader: Successfully loaded faiss with AVX2 support.
2022-12-14 22:32:30.261209: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2022-12-14 22:32:33.468323: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-12-14 22:32:33.468453: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2022-12-14 22:32:33.468530: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (instance-alexa): /proc/driver/nvidia/version does not exist


In [2]:
# `os` is used below but was not imported in this kernel session (the cell ran
# as In [2] right after the MonoT5 setup cell).
import os

from pyserini.search.lucene import LuceneSearcher

# Same locations as in the BM25 section. The original line here dropped the
# "food" component from `output_temp_dir`, silently re-pointing the variable
# that the BM25 build_index cell reads as its input directory.
output_temp_dir = os.path.join(os.getcwd(), "temp", "food", "system_index_sparse")
output_index_dir = os.path.join(os.getcwd(), "indexes", "food", "system_index_sparse")

# First-stage retrieval: BM25 hits for the query, converted to pygaggle texts
# so they can be handed to the reranker.
query = Query('pasta')
searcher = LuceneSearcher(index_dir=output_index_dir)
hits = searcher.search(query.text)
texts = hits_to_texts(hits)

In [3]:
# Print up to the first 10 first-stage results as "<rank> <docid> <score> <text>".
# Iterating over the slice instead of range(0, 10) avoids an IndexError when
# fewer than 10 texts are available.
for position, text in enumerate(texts[:10], start=1):
    print(f'{position:2} {text.metadata["docid"]:15} {text.score:.5f} {text.text}')


 1 cooking+recipe1m+e913e7d20377b736080a0e7df5f7a37a 2.86270 {
  "id" : "cooking+recipe1m+e913e7d20377b736080a0e7df5f7a37a",
  "contents" : "Girls Night Pasta. 16 ounces, weight Rotini Pasta, Cooked 1 whole Roasted Chicken, Picked 1 cup Sun-Dried Tomatoes, Diced 1/2 cups Grated Parmesan Cheese 2 ounces, weight Pine Nuts, Toasted 1 cup Pesto Sauce (or To Taste) 1 cup Reserved Pasta Water (add To Taste) 2 dashes Salt And Pepper, to taste Start off by boiling some salted pasta water, and cook the pasta until almost al dente. Reserve 1 cup of pasta water, then drain the pasta and set it aside (the pasta will finish cooking when you stir in the other ingredients). Pick the chicken, dice the sundried tomatoes, and grate the Parmesan. Lightly toast the pine nuts in a skillet, watching carefully so that they dont burn. Over low heat, stir the pesto, tomatoes, chicken, and pine nuts into the pasta until incorporated and the pasta is al dente. If the pasta looks dry, add in a splash of pasta wat

In [4]:
# Rerank the first-stage texts with MonoT5 and print up to the first 10 results
# as "<rank> <docid> <score> <text>". Iterating over the slice instead of
# range(0, 10) avoids an IndexError when fewer than 10 results are available.
reranked = reranker.rerank(query, texts)
for position, text in enumerate(reranked[:10], start=1):
    print(f'{position:2} {text.metadata["docid"]:15} {text.score:.5f} {text.text}')

 1 cooking+recipe1m+0e53f5660777a767609d56ff2694bb5c -0.58902 {
  "id" : "cooking+recipe1m+0e53f5660777a767609d56ff2694bb5c",
  "contents" : "Pasta with Spinach and Sausage. 1/2 pounds Gluten-free Pasta (I Used Macaroni) 1/2 pounds Wheat Pasta 1 Tablespoon Olive Oil 1 whole Onion, Minced 3 cloves Garlic, Thinly Sliced 1 pound Ground Sausage 3 cups Spinach, Chopped 1/4 cups Asiago Cheese, Shredded Fill 2 pots with water and bring to a boil. When water is boiling add the gluten-free pasta to one pot and the wheat pasta to the other. Cook each pasta according to the time listed on the package. If/when you stir the pasta, be sure to use a separate spoon for each pot. Drain pasta when it has finished cooking (I drain the gluten-free pasta in a colander and the wheat pasta using the lid of the pan it was cooked in to avoid contamination) and return each to their respective pot. While the pasta is cooking, heat the olive oil in a large skillet over medium heat. Add the onion and garlic and sa