In [3]:
import sys

"""Fetch protobuffs"""
# Put the local 'compiled_protobufs' directory (resolved relative to the
# current working directory) at the front of the import search path.
# The membership check keeps this cell idempotent: re-running it does not
# stack duplicate entries on sys.path.
if 'compiled_protobufs' not in sys.path:
    sys.path.insert(0, 'compiled_protobufs')

In [4]:
"""Install pygaggle requirements"""

import IPython
import sys
from pygaggle.rerank.transformer import MonoT5
!{sys.executable} -m pip install -r pygaggle/requirements.txt
IPython.Application.instance().kernel.do_shutdown(True)  

## Initialize dataset

In [None]:
""" Generate Taskgraphs from Datasets"""

from models_datasets.recipe_1m_model import AbstractModelDataset
from models_datasets.recipe_1m_model import Recipe1MModel
from models_datasets.wikihow_model import WikihowModel

# Domain label -> dataset model class used to produce that domain's taskgraphs.
dataset_models = dict(COOKING=Recipe1MModel, DIY=WikihowModel)

# Instantiate each dataset model in turn and let it convert its data.
for model_name in dataset_models:
    model = dataset_models[model_name]
    dataset_model: AbstractModelDataset = model()
    print(f"Dataset Model: {model_name}")
    dataset_model.convert_to_taskgraphs()

In [5]:
import pandas as pd

""" Preprocess and Load Queries """

# from query_pipeline import query_pipeline
# query_pipeline.preprocess_queries()

# Per-domain query tables, read from the CSV files under queries/.
# NOTE(review): the preview shows an "Unnamed: 0" column, i.e. the CSVs carry a
# saved index column; consider index_col=0 if nothing downstream relies on it.
queries = dict(
    COOKING=pd.read_csv('queries/cooking.csv'),
    DIY=pd.read_csv('queries/diy.csv'),
)

# Preview the first rows of the cooking queries.
queries["COOKING"].head()

Unnamed: 0,id,raw query,domain,knowledge,complex,many points of view,different key entities,seasonal,regional,target query,short query
0,query-0,how to spatchcock a turkey,cooking,n,y,n,n,y,usa,spatchcock turkey,spatchcock a turkey
1,query-1,i want an easy to make dessert for christmas,cooking,n,y,y,n,y,-,easi dessert christma,easy to make dessert for christmas
2,query-2,any recommendations for a gluten free appetizer,cooking,y,n,y,y,n,-,gluten free appet,gluten free appetizer
3,query-3,how to make a green goddess salad,cooking,y,y,n,y,n,usa,green goddess salad,green goddess salad
4,query-4,how to make a classic english trifle,cooking,y,n,n,y,y,uk,classic english trifl,classic english trifle


## Initialize Models and Generate Runs

In [3]:
""" Optimize BM25 + RM3"""

import numpy as np
from models_indexes.bm25_grid_search import GridSearchCV

best_params = {}

# Parameter grid to search over. It does not depend on the domain, so it is
# built once rather than on every loop iteration.
# The float grids are derived from integer ranges (i / 10) instead of
# np.arange with a 0.1 step: accumulating a float step yields values such as
# 0.30000000000000004, whereas i / 10 gives the closest float to each decimal.
# The number of grid points is unchanged (49 for k1, 9 for b and the weight).
param_grid = {
    'k1': np.arange(1, 50) / 10,                     # 0.1, 0.2, ..., 4.9
    'b': np.arange(1, 10) / 10,                      # 0.1, 0.2, ..., 0.9
    'fb_terms': range(5, 101, 5),                    # 5, 10, ..., 100
    'fb_docs': range(5, 51, 5),                      # 5, 10, ..., 50
    'original_query_weight': np.arange(1, 10) / 10,  # 0.1, 0.2, ..., 0.9
}

# Only the COOKING domain is tuned here; add "DIY" to the list to tune it too.
for domain in ["COOKING"]:

    grid_search = GridSearchCV(domain = domain, queries = queries[domain], params = param_grid, cv=5)
    # Store whatever predict() returns as the tuned parameters for this domain.
    best_params[domain] = grid_search.predict()

    print(f'Best parameters for {domain}: k1 = {grid_search.best_k1}, b = {grid_search.best_b}')

# Display the collected results as the cell output.
best_params

  from .autonotebook import tqdm as notebook_tqdm
2023-03-16 20:42:09.030311: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-16 20:42:10.014539: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-03-16 20:42:10.014636: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


Generating run... k1=0.9, b=0.1
MAP scores: [0.15242315472663948, 0.15242969618158728, 0.15956860220448485, 0.14421153820853308, 0.152348027859856]
Generating run... k1=0.9, b=0.2
MAP scores: [0.1603446157896936, 0.16148065933437442, 0.1680478058027824, 0.15292824929830232, 0.1584846810193088]
Generating run... k1=0.9, b=0.30000000000000004
MAP scores: [0.16651991709636424, 0.1683681991952717, 0.17454891438932937, 0.15994263522605148, 0.16399935893960083]
Generating run... k1=0.9, b=0.4


In [4]:
""""Best parameters for BM25 + RM3"""

# Hard-coded BM25 parameters per domain.
# NOTE(review): presumably the outcome of an earlier tuning run - confirm.
_bm25_diy = {'best_k1': 2.4, 'best_b': 0.58}
_bm25_cooking = {'best_k1': 0.6, 'best_b': 0.4}
best_params_bm25 = {'DIY': _bm25_diy, 'COOKING': _bm25_cooking}
del _bm25_diy, _bm25_cooking

# BM25 + RM3 parameters: the same k1 / b values as above, extended with the
# three feedback settings. Each entry is a fresh dict, not an alias of the
# BM25-only entry.
best_params_bm25_rm3 = {
    'DIY': {
        **best_params_bm25['DIY'],
        'best_fb_terms': 57.0,
        'best_fb_docs': 9.0,
        'best_original_query_weight': 0.48,
    },
    'COOKING': {
        **best_params_bm25['COOKING'],
        'best_fb_terms': 79.0,
        'best_fb_docs': 16.0,
        'best_original_query_weight': 0.68,
    },
}

In [5]:
from models_indexes.bm25_model import BM25Model
from models_indexes.ance_model import AnceModel
from models_indexes.colbert_model import ColbertModel
from models_indexes.marqo_model import MarqoModel
from models_indexes.abstract_model import AbstractModel
from models_indexes.hybrid_model import HybridModel


""""Initialize Models"""

# models[domain][run_name] -> model instance for that retrieval configuration.
models = {}
for domain in  ["DIY", "COOKING"]:
    models[domain] = {
        "bm25" : BM25Model(domain = domain),
        # Use the hard-coded BM25 parameters, which exist for both domains.
        # `best_params` (filled by the grid-search cell) only ever receives a
        # "COOKING" entry, so indexing it with "DIY" would raise KeyError.
        "bm25-tuned" : BM25Model(domain = domain, params = best_params_bm25[domain]),
        "bm25+rm3" : BM25Model(domain = domain, rm3=True),
        "bm25+rm3-tuned" : BM25Model(domain = domain, rm3=True, params = best_params_bm25_rm3[domain]),
        "bm25+t5" : BM25Model(domain = domain, t5=True),
        "bm25+rm3+t5" : BM25Model(domain = domain, rm3=True, t5=True),
        "ance+t5": AnceModel(domain = domain, t5=True),
        "colbert+t5": ColbertModel(domain = domain, t5=True),
        "marqo": MarqoModel(domain = domain),
        "marqo+t5": MarqoModel(domain = domain, t5=True),
        "hybrid": HybridModel(domain = domain, rm3=True, t5=False),
        "hybrid+t5": HybridModel(domain = domain, rm3=True, t5=True)
    }

In [9]:
"""Build Indexes"""
# Walk every domain and build the index of each model configured for it.
for domain in models:
    index_models = models[domain]
    for model_name in index_models:
        model = index_models[model_name]
        model.build_index()

'Build indexes'

In [8]:
"""Create run files simulation"""
print("Creating run files ")
# For each domain, hand that domain's queries to every configured model so it
# can convert its search results into a run.
for domain in models:
    index_models = models[domain]
    qs = queries[domain]
    print(f"DOMAIN {domain}")
    for model_name in index_models:
        model = index_models[model_name]
        model.convert_search_results_to_run(qs)

'Create run files simulation'

In [None]:
""""Fielded retrieval with Marqo"""

from itertools import combinations

# Fields available as filters for each domain.
fields = {
    "DIY": ["Title", "Steps", "Tags"],
    "COOKING": ["Title", "Steps", "Requirements"],
}

for domain in  ["DIY", "COOKING"]:
    # Construct the model for the domain being processed, so that each domain's
    # queries and fields are paired with a model of that same domain (a fixed
    # "COOKING" here would also be used for the DIY pass).
    model = MarqoModel(domain = domain, t5 = False)
    model.build_index()
    qs = queries[domain]

    # Produce one run per non-empty subset of the domain's fields: every single
    # field, then every pair, then all fields together. combinations() yields
    # the subsets in the order of the field list, i.e. [f1], [f2], [f3],
    # [f1, f2], [f1, f3], [f2, f3], [f1, f2, f3].
    domain_fields = fields[domain]
    for subset_size in range(1, len(domain_fields) + 1):
        for field_subset in combinations(domain_fields, subset_size):
            model.convert_search_results_to_run_attributes(qs, filters=list(field_subset))