In [None]:
import os
import sys

# Put ./compiled_protobufs at the front of the module search path
# (presumably so the generated protobuf modules can be imported -- confirm).
sys.path.insert(0, './compiled_protobufs')

In [None]:
# Names of the datasets to index; each one is looked up under ./bin/<name>/taskmap
# relative to the current working directory.
dataset_names = ["seriouseats", "wikihow"]
dataset_paths = list(
    map(
        lambda name: os.path.join(os.getcwd(), "bin", name, "taskmap"),
        dataset_names,
    )
)

## Loading datasets

In [None]:
## Load queries
import pandas as pd
from sklearn.model_selection import train_test_split


def split_train_test_validation(queries, seed=2022):
    """Split ``queries`` into train / test / validation parts (80/10/10).

    Args:
        queries: the table of queries to split (here a DataFrame from read_csv).
        seed: ``random_state`` passed to both ``train_test_split`` calls so the
            split is reproducible.

    Returns:
        A ``(train, test, validation)`` tuple.
    """
    # First carve off 20% of the data, then halve that remainder into test/validation.
    train, held_out = train_test_split(queries, test_size=0.2, random_state=seed)
    test, validation = train_test_split(held_out, test_size=0.5, random_state=seed)
    return train, test, validation


# import queries
cooking_queries = pd.read_csv('dataset/queries/cooking_queries.csv')
diy_queries = pd.read_csv('dataset/queries/diy_queries.csv')
print("Cooking and DIY queries loaded.")

# split train/test/validation - 80/10/10
q_cooking_train, q_cooking_test, q_cooking_validation = split_train_test_validation(cooking_queries)
print(f"Cooking queries: training set size {len(q_cooking_train)}, test set size {len(q_cooking_test)}, validation set size {len(q_cooking_validation)}")

q_diy_train, q_diy_test, q_diy_validation = split_train_test_validation(diy_queries)
print(f"DIY queries: training set size {len(q_diy_train)}, test set size {len(q_diy_test)}, validation set size {len(q_diy_validation)}")


## Marqo Index Builder

In [None]:
from index_builder.marqo_index_builder import MarqoIndexBuilder

In [None]:
# Working directories for the Marqo index. They are defined in this cell because
# the shared names output_temp_dir / output_index_dir are first assigned later,
# in the Pyserini BM25 section, so relying on them here fails on a fresh kernel.
# NOTE(review): the "system_index_marqo" folder name follows the pattern of the
# other index sections -- adjust if a different location is expected.
output_temp_dir_marqo = os.path.join(os.getcwd(), "temp", "system_index_marqo")
output_index_dir_marqo = os.path.join(os.getcwd(), "indexes", "system_index_marqo")

MarqoBuilder = MarqoIndexBuilder()

# Convert every dataset's taskmaps into JSON docs in the shared temp directory.
for taskmap_dir, dataset_name in zip(dataset_paths, dataset_names):
    MarqoBuilder.build_json_docs(input_dir=taskmap_dir,
                                 output_dir=output_temp_dir_marqo,
                                 dataset_name=dataset_name)

# Build the index from the JSON docs produced above.
MarqoBuilder.build_index(input_dir=output_temp_dir_marqo,
                         output_dir=output_index_dir_marqo)

##### Query index example

In [None]:
import pprint

# Run a free-text query through the builder and pretty-print the raw response.
results = MarqoBuilder.query_index("I want pasta")
pprint.pprint(results)

##### Filter usage example

In [None]:
import pprint

# Same kind of query, this time passing a filter string as the second argument
# (presumably restricting hits to the wikihow domain -- confirm against the builder).
results = MarqoBuilder.query_index_filter('I want pasta.', 'Domain:(wikihow)')
pprint.pprint(results)

### Relevance judgements

In [None]:
import pandas as pd
import jsonlines

In [None]:
# Example utterances; their position in this list becomes the numeric part of
# the "query-<n>" id used when the run file is written.
queries = [
    "I want pizza pepperoni.",
    "I would like to make spaghetti bolognese.",
    "I want to prepare smoked salmon.",
]

In [None]:
## Get rank and score from the marqo index and save these in a run file

run = []
for queryid, query in enumerate(queries):
    results = MarqoBuilder.query_index(query)
    # Ranks are 1-based and follow the order in which the hits are returned.
    for rank, doc in enumerate(results["hits"], start=1):
        run.append({
            "query_id": f'query-{queryid}',
            "doc_id": doc["_id"],
            "score": doc["_score"],
            "rank": rank,
        })

# jsonlines.Writer(open('qrels/run.jsonl', 'w')).write_all(run)

# One line per hit: "<query_id> Q0 <doc_id> <rank> <score> <run tag>".
# NOTE(review): the run tag is "t5-maxp" although the scores come from the
# Marqo index -- confirm this is the intended tag.
lines = [
    f'{line["query_id"]} Q0 {line["doc_id"]} {line["rank"]} {line["score"]} t5-maxp'
    for line in run
]

# Joining with "\n" writes no trailing newline after the last entry and, unlike
# indexing lines[-1], also works when no hits were collected.
with open("qrels/run.run", "w") as f:
    f.write("\n".join(lines))

In [None]:
# qrel_reader = jsonlines.Reader(open("qrels/qrels.jsonl", "r"))
# qrels = pd.DataFrame([line for line in qrel_reader])
# qrels.head()

#### IR MEASURES
- nDCG - normalized Discounted Cumulative Gain: highly relevant documents that appear lower in a search result list are penalized, because the graded relevance value is discounted logarithmically with the rank position
- Precision - fraction of the retrieved documents that are relevant to the user's information need
- Recall - fraction of the documents relevant to the query that are successfully retrieved


In [None]:
import ir_measures
# The wildcard import is what brings nDCG, Precision and Recall (used below) into scope.
# NOTE(review): consider importing those names explicitly once it is confirmed
# that no other cell relies on further names from this wildcard import.
from ir_measures import *

# NOTE(review): 'qrels.qrles' looks like a typo for 'qrels.qrels' -- confirm the
# actual file name on disk before changing it.
qrels = ir_measures.read_trec_qrels('qrels/qrels.qrles')
# Rebinds `run` (previously the list of dicts built in the run-file cell) to the
# result of reading back the run file written there.
run = ir_measures.read_trec_run('qrels/run.run')

# Aggregate each measure at cutoff 3 over the loaded qrels and run.
ir_measures.calc_aggregate([nDCG@3, Precision@3, Recall@3], qrels, run)

## Pyserini Index Builder

### BM25

In [None]:
from index_builder.pyserini_bm25_builder import PyseriniBM25Builder
# Temp (JSON docs) and final index directories for the sparse BM25 index,
# both resolved against the current working directory.
output_temp_dir, output_index_dir = (
    os.path.join(os.getcwd(), parent, "system_index_sparse")
    for parent in ("temp", "indexes")
)

In [None]:
# This cell binds the name PyseriniBM25Builder to a builder *instance* (the
# index-building cell calls build_index on that name). Instantiating through an
# aliased import of the class keeps the cell re-runnable: calling
# PyseriniBM25Builder() directly would fail on a second run, because the name
# would then already refer to the instance rather than the class.
from index_builder.pyserini_bm25_builder import PyseriniBM25Builder as PyseriniBM25BuilderClass

PyseriniBM25Builder = PyseriniBM25BuilderClass()

# Convert every dataset's taskmaps into JSON docs in the shared temp directory.
for taskmap_dir, dataset_name in zip(dataset_paths, dataset_names):
    PyseriniBM25Builder.build_json_docs(input_dir=taskmap_dir,
                                        output_dir=output_temp_dir,
                                        dataset_name=dataset_name)

In [None]:
# Generate index.
# Generate index.
PyseriniBM25Builder.build_index(
    input_dir=output_temp_dir,
    output_dir=output_index_dir,
)

In [None]:
from pyserini.search.lucene import LuceneSearcher
import json

# Open the index built in the previous step.
searcher = LuceneSearcher(index_dir=output_index_dir)

last_utterance = "pasta"
top_k = 5

hits = searcher.search(q=last_utterance, k=top_k)

# Parallel lists: raw stored document, document id and retrieval score per hit.
docs = []
docs_id = []
docs_score = []
for hit in hits:
    doc_id, doc_score = hit.docid, hit.score
    doc = searcher.doc(docid=doc_id)
    docs.append(doc.raw())
    # Record id and score as well, so the two lists above are actually filled.
    docs_id.append(doc_id)
    docs_score.append(doc_score)
    print(doc_id, doc_score)

# Decode only the top hit (the slice is simply empty when there are no hits).
for doc_string in docs[0:1]:
    doc_json = json.loads(doc_string)
    taskmap_json = doc_json['recipe_document_json']
    # print(taskmap_json)


### Ance Encoding

In [None]:
from index_builder.pyserini_ance_builder import PyseriniAnceBuilder

# Temp (JSON docs) and final index directories for the ANCE index,
# both resolved against the current working directory.
output_temp_dir_ance, output_index_dir_ance = (
    os.path.join(os.getcwd(), parent, "system_index_ance")
    for parent in ("temp", "indexes")
)

In [None]:
PyseriniAnce = PyseriniAnceBuilder()

# Convert every dataset's taskmaps into JSON docs in the ANCE temp directory.
for dataset_name, taskmap_dir in zip(dataset_names, dataset_paths):
    PyseriniAnce.build_json_docs(
        input_dir=taskmap_dir,
        output_dir=output_temp_dir_ance,
        dataset_name=dataset_name,
    )

In [None]:
# Generate index.
PyseriniAnce.build_index(
    input_dir=output_temp_dir_ance,
    output_dir=output_index_dir_ance,
)

In [None]:
from pyserini.search.faiss import FaissSearcher, AnceQueryEncoder
# Query encoder identified by "castorini/ance-msmarco-passage", paired with the
# ANCE index directory. Note that this rebinds `searcher`.
encoder = AnceQueryEncoder("castorini/ance-msmarco-passage")
searcher = FaissSearcher(index_dir=output_index_dir_ance, query_encoder=encoder)

In [None]:
hits = searcher.search("pasta")

# Print at most the first ten hits with a 1-based rank. Iterating over a slice
# (instead of indexing positions 0..9) avoids an IndexError when the search
# returns fewer than ten results.
for rank, hit in enumerate(hits[:10], start=1):
    print(f'{rank:2} {hit.docid} {hit.score}' )

### BM25 + MonoT5

In [1]:
# Put ./pygaggle at the front of the module search path; this has to run before
# the pygaggle imports below (presumably a local checkout -- confirm).
sys.path.insert(0, './pygaggle')
from pygaggle.rerank.base import Query, Text
from pygaggle.rerank.transformer import MonoT5

# Reranker created with MonoT5's default constructor arguments.
reranker =  MonoT5()

2022-12-05 15:38:21 [INFO] loader: Loading faiss with AVX2 support.
2022-12-05 15:38:21 [INFO] loader: Successfully loaded faiss with AVX2 support.
2022-12-05 15:38:23.925567: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2022-12-05 15:38:26.733245: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-12-05 15:38:26.733322: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2022-12-05 15:38:26.733359: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (instance-alexa): /proc/driver/nvidia/version does not exist


In [2]:
query = Query('pasta')