In [9]:
import sys
import os
import json
from google.protobuf.json_format import Parse
sys.path.insert(0, 'compiled_protobufs')
from taskmap_pb2 import TaskMap
from pyserini.search import LuceneSearcher
import csv
from pprint import pprint
import pandas as pd

In [7]:
# Ranking-model parameters, keyed by model name.
searcher_config = dict(
    BM25=dict(b=0.4, k1=0.9),
    # Disabled 'BM25+RM3' variant, kept for reference:
    # 'BM25+RM3': {
    #     'BM25': {'b': 0.4, 'k1': 0.9},
    #     'RM3': {'fb_terms': 10, 'fb_docs': 10, 'original_query_weight': 0.5},
    # }
)

In [33]:
# Index directory for the cooking domain, resolved against the current working directory.
taskmap_cooking_index_path = os.path.join(
    os.getcwd(),
    "indexes",
    "food",
    "system_index_sparse",
)

def get_pyserini_docs(searcher, query, k, query_id):
    """Run `query` against `searcher` and package every hit as a judgment record.

    Args:
        searcher: object whose `search(q=..., k=...)` yields hits exposing
            `raw` (a JSON string) and `score`.
        query: query text handed to the searcher and copied into every record.
        k: number of hits requested from the searcher.
        query_id: identifier copied into every record under "query-id".

    Returns:
        A list with one dict per hit, holding the keys "doc-id", "doc-title",
        "doc-url", "score" (rounded to 3 decimals), "query-id", "query" and
        "taskgraph" (the parsed TaskMap message).
    """
    def build_record(hit):
        # The stored document is JSON; the TaskMap sits under 'recipe_document_json'.
        stored_doc = json.loads(hit.raw)
        # Parse() consumes JSON text, so the nested dict is serialised again first.
        parsed_taskmap = Parse(json.dumps(stored_doc['recipe_document_json']), TaskMap())
        return {
            "doc-id": parsed_taskmap.taskmap_id,
            "doc-title": parsed_taskmap.title,
            "doc-url": parsed_taskmap.source_url,
            "score": round(float(hit.score), 3),
            "query-id": query_id,
            "query": query,
            "taskgraph": parsed_taskmap,
        }

    return [build_record(hit) for hit in searcher.search(q=query, k=k)]

def get_dict_keys_list(judgments, fieldnames):
    """Project every judgment onto the given field names.

    Args:
        judgments: iterable of dict-like records.
        fieldnames: keys to copy from each record, in the desired order.

    Returns:
        A list of new dicts, one per record, containing only `fieldnames`.

    Raises:
        KeyError: if a record lacks one of the requested keys.
    """
    return [
        {field: record[field] for field in fieldnames}
        for record in judgments
    ]
    

In [34]:
# CSV files under datasets/judgments, resolved against the working directory.
annotations_path = os.path.join(
    os.getcwd(), "datasets", "judgments", "cooking-annotations2.csv"
)
annotation_pairs = os.path.join(
    os.getcwd(), "datasets", "judgments", "annotation-pairs.csv"
)

# Only the first 10 rows of the query file are kept.
queries_path = os.path.join(
    os.getcwd(), "datasets", "queries", "cooking_queries.csv"
)
cooking_queries = pd.read_csv(queries_path).head(10)

In [35]:
# Open the index and take the BM25 parameters from `searcher_config`, so the
# values are declared in one place instead of being repeated here.
searcher = LuceneSearcher(index_dir=taskmap_cooking_index_path)
searcher.set_bm25(**searcher_config['BM25'])

# Collect the top-5 hits for every query; the DataFrame index is used as the query id.
new_judgments = []
for query_id, query_row in cooking_queries.iterrows():
    new_judgments.extend(
        get_pyserini_docs(searcher, query_row["target query"], k=5, query_id=query_id)
    )

# Number the judgments sequentially across all queries.
for judgment_id, judgment in enumerate(new_judgments):
    judgment["judgment-id"] = judgment_id

In [37]:
# save taskgraph + query pair for analysis
# newline='' is required by the csv module: without it, platforms that write
# \r\n get an extra \r per row and newlines embedded in quoted fields are not
# handled correctly.
with open(annotation_pairs, 'w', newline='') as csvfile:
    fieldnames = ["judgment-id", "query", "query-id", "taskgraph"]
    print(fieldnames)
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(get_dict_keys_list(new_judgments, fieldnames))

# save annotations values
with open(annotations_path, 'w', newline='') as csvfile:
    fieldnames = ['judgment-id', 'doc-id', "query-id", "relevance", "justification"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    # Only the first three fields are supplied; DictWriter fills the missing
    # "relevance" and "justification" columns with its default empty value.
    writer.writerows(get_dict_keys_list(new_judgments, fieldnames[:-2]))


['judgment-id', 'query', 'query-id', 'taskgraph']


In [67]:
class TurkEmptyJudgmentParser:
    """Extracts and prints the human-readable parts of a judgment's taskgraph."""

    def parse_taskgraph(self, judgment):
        """Pull the title, requirement names and step texts out of a judgment.

        Args:
            judgment: mapping with a "taskgraph" entry exposing `title`,
                `steps` and `requirement_list`.

        Returns:
            A dict with the keys "taskgraph title", "taskgraph_requirements"
            (list of requirement names) and "taskgraph_steps" (list of step texts).

        Raises:
            IndexError: if a step has no paragraphs.
        """
        taskgraph = judgment["taskgraph"]
        # Only the first paragraph of each step's screen is used as the step text.
        taskgraph_steps = [step.response.screen.paragraphs[0] for step in taskgraph.steps]
        taskgraph_requirements = [requirement.name for requirement in taskgraph.requirement_list]
        taskgraph_title = taskgraph.title
        taskgraph_turk_info = {
            "taskgraph title": taskgraph_title,
            "taskgraph_requirements": taskgraph_requirements,
            "taskgraph_steps": taskgraph_steps,
        }
        return taskgraph_turk_info

    def print_taskgraph_info(self, judgment):
        """Print a judgment's ids, query, title, requirements and numbered steps.

        Args:
            judgment: mapping with the keys "judgment-id", "query-id", "query",
                "doc-id" and "taskgraph".
        """
        turk_info = self.parse_taskgraph(judgment)
        # Fixed: a stray `searcher` token in this unpacking made the cell a syntax error.
        judgment_id, query_id, query, doc_id = (
            judgment["judgment-id"],
            judgment["query-id"],
            judgment["query"],
            judgment["doc-id"],
        )
        title, requirements, steps = (
            turk_info["taskgraph title"],
            turk_info["taskgraph_requirements"],
            turk_info["taskgraph_steps"],
        )
        print("-" * 20)
        print(f"Judgment id: {judgment_id}")
        print(f'Query id: {query_id}, Query: {query}')
        print(f'Document id: {doc_id}, Title: {title}')
        print(f'Requirements: ')
        for requirement in requirements:
            print(f'  - {requirement}')
        print(f'Steps:')
        # Steps are numbered from 1 in the printed output.
        for idx, step in enumerate(steps):
            print(f'{idx+1}. {step}')
        print("-" * 20)

In [68]:
# Print the first collected judgment as a quick check of the formatter.
judgment_parser = TurkEmptyJudgmentParser()
judgment_parser.print_taskgraph_info(new_judgments[0])

# Disabled: parse every judgment (the parsed result would be discarded).
# for judgment in new_judgments:
#     judgment_parser.parse_taskgraph(judgment)

--------------------
Judgment id: 0
Query id: 0, Query: risotto without mushrooms
Document id: cooking+recipe1m+7c95cd05e17e43fd47582727f015a283, Title: Sausage and Mushroom Risotto
Requirements: 
  - 1 12 cups arborio rice
  - 1 tablespoon olive oil
  - 3 (14 ounce) cans chicken broth, reduced sodium (you may not need all of it)
  - 12 lb Italian sausage, casings removed (I use hot variety)
  - 12 lb baby bella mushroom, quartered
  - 14 onion, minced
  - herbs, your choice (I even like plain without, but rosemary, thyme for example are good)
  - black pepper
  - 12 cup parmesan cheese, grated
Steps:
1. In a small saucepan simmer broth.
2. Use medium pot.
3. On medium heat add olive oil to heat; brown meat while breaking up sausage, add onions.
4. When about half way done add mushrooms and saute until soft.
5. Stir in risotto.
6. Add one ladle full of broth.
7. Add herbs if using.
8. Reduce heat to simmer.
9. As risotto absorbs broth continue adding one ladle full at a time while stir

In [None]:
from models_indexes.bm25_model import BM25Model
