In [1]:
import sys

# Put the 'compiled_protobufs' directory at the front of the import search
# path. The membership check keeps the cell idempotent: re-running it in a
# live kernel no longer prepends a duplicate entry each time.
if 'compiled_protobufs' not in sys.path:
    sys.path.insert(0, 'compiled_protobufs')

In [None]:
""" Generate Taskgraphs from datasets """

from models_datasets.recipe_1m_model import AbstractModelDataset, Recipe1MModel
from models_datasets.wikihow_model import WikihowModel

# Domain name -> dataset model class. Each class is instantiated with no
# arguments in the loop below.
dataset_models = dict(COOKING=Recipe1MModel, DIY=WikihowModel)

# For every registered domain: instantiate its dataset model, announce it,
# then call convert_to_taskgraphs() on the instance.
for model_name in dataset_models:
    model = dataset_models[model_name]
    dataset_model: AbstractModelDataset = model()
    print(f"Dataset Model: {model_name}")
    dataset_model.convert_to_taskgraphs()

In [6]:
import pandas as pd

""" Preprocess and load queries """

# Preprocessing step (currently disabled):
# from query_pipeline import query_pipeline
# query_pipeline.preprocess_queries()

# One CSV of queries per domain; the keys are the domain names that the
# later cells use to look up `queries[domain]`.
query_files = {
    "COOKING": 'queries/cooking.csv',
    "DIY": 'queries/diy.csv',
}
queries = {domain: pd.read_csv(csv_path) for domain, csv_path in query_files.items()}

# NOTE(review): the recorded output of this cell shows an 'Unnamed: 0' column,
# which usually means the CSV was written together with its index -- confirm
# whether reading with index_col=0 is wanted.
queries["COOKING"].head()


Unnamed: 0,id,raw query,domain,knowledge,complex,many points of view,different key entities,seasonal,regional,target query
0,query-0,how to spatchcock a turkey,cooking,n,y,n,n,y,usa,spatchcock turkey
1,query-1,I want an easy to make dessert for christmas,cooking,n,y,y,n,y,-,easi dessert christma
2,query-2,any recommendations for a gluten free appetizer,cooking,y,n,y,y,n,-,gluten free appet
3,query-3,how to make a green goddess salad,cooking,y,y,n,y,n,usa,green goddess salad
4,query-4,how to make a classic english trifle,cooking,y,n,n,y,y,uk,classic english trifl


In [None]:
# Preview the first rows of the DIY entry of `queries` (last expression of the
# cell, so the notebook displays it).
queries["DIY"].head()

In [4]:
from models_indexes.bm25_model import BM25Model
from models_indexes.ance_model import AnceModel
from models_indexes.colbert_model import ColbertModel
from models_indexes.marqo_model import MarqoModel
from models_indexes.abstract_model import AbstractModel

# domain -> {model name -> index model instance}. Filled by the loop below and
# iterated by the "build indexes", "create run files" and "create empty
# judgments" cells.
models = {}

# Only the DIY domain is active; use ["COOKING"] instead to run the cooking domain.
for domain in ["DIY"]:
    models[domain] = dict(
        ance=AnceModel(domain=domain),
        colbert=ColbertModel(domain=domain),
    )
    # Disabled configurations, kept for reference:
    #   "bm25"        : BM25Model(domain = domain),
    #   "bm25+rm3"    : BM25Model(domain = domain, rm3=True),
    #   "bm25+t5"     : BM25Model(domain = domain, t5=True),
    #   "bm25+rm3+t5" : BM25Model(domain = domain, rm3=True, t5=True),
    #   "marqo"       : MarqoModel(domain = domain),

In [2]:
from models_indexes.marqo_model import MarqoModel

# MarqoModel instance for the DIY domain. Constructed here so the cell no
# longer depends on a `model` variable left over from another cell.
model = MarqoModel(domain="DIY")

# Index building stays disabled; uncomment when the index has to be (re)built.
# model.build_index()

# Create the run file for the DIY queries held in `queries`.
# The previous call could not even be parsed (`raw=False""` is a SyntaxError)
# and passed the pandas module `pd`, whereas every other call to this method
# in the notebook passes a queries DataFrame.
# NOTE(review): `raw=False` is kept from the original call -- confirm that
# convert_search_results_to_run accepts this keyword.
model.convert_search_results_to_run(queries["DIY"], raw=False)
# model.search("")

Generate index...
/home/ubuntu/task-search-quality/indexes/temp/diy/system_index_marqo/wikihow-taskmaps.json
Invalid HTTP request received.

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
134146   342  100    30  147035  448k  30000   437M --:--:-- --:--:-- --:--:--  437M


In [None]:
import json

# JSON file inspected by this cell (relative to the current working directory).
path = "indexes/temp/cooking/system_index_marqo/recipe1m+-taskmaps.json"
# path2 = "indexes/temp/cooking/system_index_marqo/recipe1m+-taskmaps copy.json"

# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform's locale encoding, so the same file could load differently (or
# fail) on another machine.
with open(path, "r", encoding="utf-8") as f:
    obj = json.load(f)

# with open(path2, "r", encoding="utf-8") as f:
#     obj2 = json.load(f)

# Number of top-level entries in the loaded JSON document.
print(len(obj))
# print(len(obj2))

# Scratch (disabled): keep only the tail of the data and overwrite the file.
# obj = obj[995500:]

# with open(path, 'w') as f:
#     f.write(json.dumps(obj, indent=4))

In [None]:
# build indexes
# Walk the `models` registry (domain -> {model name -> model}) and call
# build_index() on every registered model, in registration order.
print("Build indexes")
for domain in models:
    index_models = models[domain]
    for model_name in index_models:
        model = index_models[model_name]
        model.build_index()

In [7]:
# create run files
# For each domain in `models`, fetch that domain's query table from `queries`
# and pass it to convert_search_results_to_run() of every registered model.
print("Creating run files ")
for domain in models:
    index_models = models[domain]
    qs = queries[domain]
    print(f"DOMAIN {domain}")
    for model_name in index_models:
        model = index_models[model_name]
        model.convert_search_results_to_run(qs)
        
        

Creating run files 
DOMAIN DIY


2023-01-28 12:19:18,140 logger:'marqo' INFO search (tensor): took 0.293s to send query and received 50 results from Marqo (roundtrip). Marqo itself took 0.225s to execute the search.
2023-01-28 12:19:18 [INFO] index: search (tensor): took 0.293s to send query and received 50 results from Marqo (roundtrip). Marqo itself took 0.225s to execute the search.
2023-01-28 12:19:18,321 logger:'marqo' INFO search (tensor): took 0.180s to send query and received 50 results from Marqo (roundtrip). Marqo itself took 0.174s to execute the search.
2023-01-28 12:19:18 [INFO] index: search (tensor): took 0.180s to send query and received 50 results from Marqo (roundtrip). Marqo itself took 0.174s to execute the search.
2023-01-28 12:19:18,497 logger:'marqo' INFO search (tensor): took 0.174s to send query and received 50 results from Marqo (roundtrip). Marqo itself took 0.168s to execute the search.
2023-01-28 12:19:18 [INFO] index: search (tensor): took 0.174s to send query and received 50 results from

Run file saved at /home/ubuntu/task-search-quality/measurements/diy/run_files/mpnet-base.run


In [None]:
# create empty judgments
# For each domain in `models`, fetch that domain's query table from `queries`
# and call create_empty_judgments() on every registered model.
# NOTE(review): k=50 and n=10 are passed through unchanged; their meaning is
# defined by create_empty_judgments, which is not visible here.
print("Creating empty judgments")
for domain in models:
    index_models = models[domain]
    qs = queries[domain]
    print(f"DOMAIN {domain}")
    for model_name in index_models:
        model = index_models[model_name]
        model.create_empty_judgments(qs, k=50, n=10)

In [None]:
# Standalone MarqoModel for the DIY domain, separate from the `models` registry.
from models_indexes.marqo_model import MarqoModel
marqo_diy = MarqoModel(domain = "DIY")
# Build the index for this model.
# NOTE(review): presumably slow and dependent on a reachable Marqo service --
# confirm before including this cell in a full "Run All".
marqo_diy.build_index()
# marqo_diy.search("cleaning ")

In [None]:
""" Build indexes from datasets """

# from models_indexes.bm25_model import BM25Model

# bm25 = BM25Model(domain = "DIY", t5=True)
# bm25.build_index()

# from models_indexes.ance_model import AnceModel
# ance: AnceModel = AnceModel(domain = "DIY")
# ance.build_index()

# # bm25_cooking = BM25Model(domain = "DIY", t5=True)
# # bm25_cooking.build_index()

# from models_indexes.colbert_model import ColbertModel
# colbert: ColbertModel = ColbertModel(domain = "DIY")
# colbert.build_index()

# query = queries["COOKING"]["target query"][0]
# query = "pizza"
# colbert_cooking.search(query)
# colbert_diy.convert_search_results_to_run()
# colbert_diy.create_empty_judgments()

# from models_indexes.ance_model import AnceModel
# ance: AnceModel = AnceModel(domain = "COOKING")
# ance.build_index()
# ance.search("easi dessert christma")

# from models_indexes.colbert_model import ColbertModel
# colbert: ColbertModel = ColbertModel(domain = "COOKING")
# colbert.build_index()
# colbert.search("pizza")

# from models_indexes.colbert_model import ColbertModel
# colbert: ColbertModel = ColbertModel(domain = "DIY")
# colbert.build_index()
# colbert.search("kitchen")


In [None]:
# Standalone ColbertModel for the COOKING domain: build its index, then run a
# sample search.
from models_indexes.colbert_model import ColbertModel
colbert: ColbertModel = ColbertModel(domain = "COOKING")
colbert.build_index()
# Last expression of the cell, so the notebook displays whatever search() returns.
colbert.search("fish")

# marqo_cooking = MarqoModel(domain = "COOKING")
# marqo_cooking.build_index()
# marqo_cooking.search("fish")

# queries_path = os.path.join(os.getcwd(), "queries", "cooking.csv")

# pd_cooking_queries = pd.read_csv(queries_path).iloc[:10]
# bm25_food.convert_search_results_to_run(pd_cooking_queries)
# bm25_food.create_empty_judgments(pd_cooking_queries, 5)
# queries_path = os.path.join(os.getcwd(), "queries", "diy.csv")
# pd_diy_queries = pd.read_csv(queries_path).iloc[:10]
# ance_diy.build_index()
# ance_diy.convert_search_results_to_run(pd_diy_queries)

In [None]:
# ance_cooking = AnceModel(domain = "COOKING")
# ance_cooking.build_index()

In [None]:
# from models_indexes.bm25_model import BM25Model
# bm25_cooking = BM25Model(domain = "COOKING")
# bm25_cooking.build_index()

# queries_path = os.path.join(os.getcwd(), "queries", "cooking.csv")
# pd_cooking_queries = pd.read_csv(queries_path).iloc[:10]
# query = "granola"
# bm25_cooking.search(query)