# TF4ces Search Engine
## User Queries

Set up system paths so that the project's modules can be imported from this notebook.

In [1]:
from pathlib import Path
import sys

# Repository root: two directory levels above the notebook's working directory.
# It is put on sys.path so the project packages imported below (`config`, `src`)
# can be resolved.
__WORKSPACE__ = Path.cwd().parent.parent

# Guard the append so that re-running this cell does not pile duplicate
# entries onto sys.path.
if str(__WORKSPACE__) not in sys.path:
    sys.path.append(str(__WORKSPACE__))

__WORKSPACE__

In [None]:
# Native imports
from copy import deepcopy
from IPython.core.display import HTML

# Third-party imports
import pandas as pd

# User imports
from config.conf import __ALL_MODELS__
from src.TF4ces_search_engine.data.data_gathering import DataGathering
from src.main_ensemble import TF4cesSearchEnsemble
from src.utils.ensemble_strategy import EnsembleStrategy

In [None]:
from tqdm.autonotebook import tqdm

### Configs

In [3]:
# Model Names
# NOTE(review): these lookups depend on the ordering of `__ALL_MODELS__` in
# config.conf (presumably tfidf, bm25, mpnet, roberta) — confirm if that list changes.
TFIDF_MODEL = __ALL_MODELS__[0]
BM25_MODEL = __ALL_MODELS__[1]
MPNET_MODEL = __ALL_MODELS__[2]
ROBERTA_MODEL = __ALL_MODELS__[3]

# Dataset Configs
DATASET_NAME = 'lotte'
DATASET_CATEGORY = 'lifestyle'
SPLIT = 'test'

# Path Configs
# The dataset name/category segments are derived from the constants above so that
# switching dataset only requires editing one place; for the current values the
# resulting paths are identical to the previously hard-coded ones.
TFIDF_MODEL_PATH = __WORKSPACE__ / "models" / "tfidf" / DATASET_NAME / DATASET_CATEGORY / "tfidf.v0.0.1.pkl"
MPNET_EMB_PATH = __WORKSPACE__ / "dataset" / "embeddings_test" / "test_v0.0.1" / "all-mpnet-base-v2" / DATASET_NAME / DATASET_CATEGORY
ROBERTA_EMB_PATH = __WORKSPACE__ / "dataset" / "embeddings_test" / "test_v0.0.1" / "all-roberta-large-v1" / DATASET_NAME / DATASET_CATEGORY

In [4]:
# How many documents the filter stage keeps, and how many each voter retrieves.
FILTER_TOP_N = 3000
VOTING_TOP_N = 100

# Filter stage: model name -> settings for that model.
# (Disabled alternative: BM25_MODEL with {'model_path': None}.)
FILTER_MODEL = {TFIDF_MODEL: dict(model_path=TFIDF_MODEL_PATH)}

# Voter stage: model name -> location of its embeddings.
VOTER_MODELS = {
    voter_name: dict(emb_path=voter_emb_path)
    for voter_name, voter_emb_path in (
        (MPNET_MODEL, MPNET_EMB_PATH),
        (ROBERTA_MODEL, ROBERTA_EMB_PATH),
    )
}

#### Path Checks

In [5]:
drive_link = "https://drive.google.com/file/d/1x-lOZMwBK5Ea9eCDE18gQCTnukZ16xQT/view?usp=share_link"

# Fail early if any voter embeddings are absent, and name exactly which ones:
# previously the message always blamed the MPNET path, even when only the
# ROBERTA embeddings were missing.
missing_embeddings = [
    (model_name, emb_path)
    for model_name, emb_path in (
        (MPNET_MODEL, MPNET_EMB_PATH),
        (ROBERTA_MODEL, ROBERTA_EMB_PATH),
    )
    if not emb_path.exists()
]

if missing_embeddings:
    missing_report = "\n".join(
        f"  - {model_name}: '{emb_path}'" for model_name, emb_path in missing_embeddings
    )
    # FileNotFoundError is a subclass of Exception, so existing handlers still catch it.
    raise FileNotFoundError(
        f"Embeddings are missing for the following model(s):\n{missing_report}\n\n"
        f" Please download embeddings from here '{drive_link}'."
    )

### Step 1 : Data Gathering

In [6]:
# Fetch the documents for the configured dataset category and split. The gatherer
# is only needed for this single call, so it is used inline rather than being
# bound to a name and deleted afterwards.
docs_obj = DataGathering(dataset_name=DATASET_NAME).get_documents(
    dataset_category=DATASET_CATEGORY,
    dataset_split=SPLIT,
)

In [7]:
# Sample doc: display the first entry of docs_obj to check the record layout.
docs_obj[0]

{'document': 'Normal double-acting baking powder makes CO2 (thus giving a rising effect) in two ways: when it gets wet, and when it is heated. Baking soda only makes CO2 when it gets wet. From Wikipedia: The acid in a baking powder can be either fast-acting or slow-acting.[6] A fast-acting acid reacts in a wet mixture with baking soda at room temperature, and a slow-acting acid will not react until heated in an oven. Baking powders that contain both fast- and slow-acting acids are double acting; those that contain only one acid are single acting. By providing a second rise in the oven, double-acting baking powders increase the reliability of baked goods by rendering the time elapsed between mixing and baking less critical, and this is the type most widely available to consumers today.'}

### Step 2 : Init the Ensemble

In [8]:
# Build the search ensemble from the filter/voter model configs and top-N limits.
# docs_obj is passed as a deep copy, so the ensemble receives its own copy and the
# notebook-level docs_obj (read later by get_prediction_dfs) is a separate object.
ensemble = TF4cesSearchEnsemble(
    filter_model_dict=FILTER_MODEL,
    voter_models_dict=VOTER_MODELS,
    docs_obj=deepcopy(docs_obj),
    filter_top_n=FILTER_TOP_N,
    voting_top_n=VOTING_TOP_N,
)
# Print an overview of the configured models, document count and top-N settings.
ensemble.summary()

----------------------------------------------------------------------
		TF4ces Search Engine
----------------------------------------------------------------------
Filter Model	: ['tfidf']
Voter Models	: ['all-mpnet-base-v2', 'all-roberta-large-v1']
Num of Docs	: 119461
Filter Top N	: 3000
Voting Top N	: 100
----------------------------------------------------------------------



### Step 3 : Load models

In [9]:
# Load the filter model. bl_train=False presumably loads the saved model from its
# configured path instead of fitting a new one — TODO confirm against
# TF4cesSearchEnsemble.load_filter_model.
ensemble.load_filter_model(bl_train=False)
# Load the voter models.
ensemble.load_voter_models()

Model [TF-IDF] : Loaded with vocab_size (166387) from : '/Users/ataago/Documents/git/TF4ces/TF4ces-search-engine/models/tfidf/lotte/lifestyle/tfidf.v0.0.1.pkl'
Model [all-mpnet-base-v2] : Loaded on 'mps'
Model [all-roberta-large-v1] : Loaded on 'mps'


### Step 4: Query Helper Methods

In [10]:
def get_prediction_dfs(preds, user_queries_obj, top_k=5, docs=None):
    """Build one DataFrame of top-ranked document texts per query.

    Args:
        preds: One sequence of predicted doc ids per query, paired positionally
            with the entries of `user_queries_obj` (in iteration order).
        user_queries_obj: Mapping of query id -> dict holding the query text
            under the 'query' key.
        top_k: Number of leading predictions to keep per query (default 5,
            matching the previous hard-coded behaviour).
        docs: Indexable collection where `docs[doc_id]['document']` is the
            document text. Defaults to the notebook-level `docs_obj`.

    Returns:
        dict mapping query id -> single-column DataFrame whose column name is
        the query text, whose index is the predicted doc ids and whose values
        are the corresponding document texts.
    """
    if docs is None:
        # Fall back to the corpus loaded in the data-gathering step.
        docs = docs_obj

    pred_dfs = dict()
    for (q_id, q_data), pred_doc_ids in zip(user_queries_obj.items(), preds):
        top_doc_ids = list(pred_doc_ids[:top_k])
        pred_dfs[q_id] = pd.DataFrame(
            # Materialise the texts eagerly rather than handing pandas a lazy map object.
            [docs[doc_id]['document'] for doc_id in top_doc_ids],
            columns=[q_data['query']],
            index=top_doc_ids,
        )
    return pred_dfs

def query_it(query):
    """Run one free-text query through the ensemble and display its top documents.

    The query is wrapped in the queries-object layout (query id -> query text plus
    an empty 'rel_doc_ids' list), filtered by the filter model, re-ranked by the
    voter models and combined with the TXIOI ensemble strategy. The resulting
    table is rendered as HTML; nothing is returned.

    Args:
        query: The user's query string.
    """
    # Single-entry queries object; the lone query gets id 0.
    user_queries_obj = {
        q_id: {'query': q_text, 'rel_doc_ids': list()}
        for q_id, q_text in enumerate([query])
    }

    # Filter stage: works on a deep copy so the original queries object can be
    # reused unchanged by the later steps.
    q_ids, filtered_rel_doc_ids = ensemble.filter_docs(
        queries_obj=deepcopy(user_queries_obj),
        bl_eval=False,
    )

    # Voter stage, then ensemble voting.
    # NOTE(review): in the recorded outputs only the first query returns on-topic
    # documents; later queries load fewer embeddings and return unrelated results.
    # Confirm that the incremental loader keeps embeddings aligned with doc ids
    # (the non-incremental alternative is
    # `ensemble.load_embeddings(filtered_rel_doc_ids=filtered_rel_doc_ids)`).
    ensemble.load_embeddings_if_not_present(filtered_rel_doc_ids=filtered_rel_doc_ids)
    ensemble.find_most_relevant_docs(
        q_ids=q_ids,
        filtered_rel_doc_ids=filtered_rel_doc_ids,
        queries_obj=user_queries_obj,
    )
    preds = ensemble.ensemble_voting(ensemble_strategy=EnsembleStrategy.TXIOI)

    # Render the table for query id 0 (the only query) as HTML.
    result_tables = get_prediction_dfs(preds=preds, user_queries_obj=user_queries_obj)
    display(HTML(result_tables[0].to_html()))


### Test Query

In [11]:
# First test query, run against a freshly initialised ensemble.
query_it("are clear pomegranate seeds good to eat?")

Preprocessed data loaded from : /Users/ataago/Documents/git/TF4ces/TF4ces-search-engine/dataset/preprocessed/ensemble/tfidf


Pre-Processing Queries:   0%|          | 0/1 [00:00<?, ?it/s]

Filter Model [tfidf] : Retrieving top 3000 Docs for queries(1) documents(119461).
Model [TF-IDF] : Vector embeddings generated for queries(1) and docs (119461)
Model [TF-IDF] : Finding cosine similarities between Queries & Docs...


Voting Model [all-mpnet-base-v2] : Loading Embeddings:   0%|          | 0/3000 [00:00<?, ?it/s]

Voting Model [all-roberta-large-v1] : Loading Embeddings:   0%|          | 0/3000 [00:00<?, ?it/s]

Retrieve Top 100 docs using Voter Model [all-mpnet-base-v2]:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieve Top 100 docs using Voter Model [all-roberta-large-v1]:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,are clear pomegranate seeds good to eat?
32117,"I just opened one up from the tree in my yard. I was sad when I found white seeds on the inside. I was under the impression the seeds were red. The tree was here when I moved here so this fruit is a new experience for me. I tasted the white seeds and was surprised at the sweet flavor. Not tart in taste at all, that's what I was expecting from the google search I did. Only one of the pomegranates on the tree was split open so I am thinking I will watch carefully and give them a smidge longer, it's September and from what I read October is a good month for the fruit."
16020,"Apparently it's a different variety of pomegranate. They are good, but taste sweeter than the deep red seeds, which have a great tart flavor."
14700,Some pomegranate varieties produce pink or white seeds so yours sounds perfectly normal. I have had white seeds and they are as delicious as the red ones. Go for it.
9538,"There are a few seeds which are good to eat. These generally get sold in the supermarket (pumpkin seeds, apricot kernels). If you buy the fruit containing them, you can keep the seeds. If you want to store them, you should dry them first. Spread the cleaned seeds in a single layer on paper and put it in a warm dry place (not in direct sunlight), and wait a few days. You should deshell them right before consuming, that way they keep longer. If you want to eat them right away, don't dry them. They taste better when fresh. Some should be roasted in the shell before eaten (pumpkin seeds, sunflower seeds). There is little sense in eating seeds not commonly sold in supermarkets, like melon seeds or apple pips. They don't have any nutritional value for humans, or taste bad (e.g. very bitter), or both. It is even dangerous to experiment too much. For example, peach kernels contain poisonous cyanide compounds. So stick to what is commonly eaten, there is a reason the other seeds aren't used."
24746,"Got pomegranate with pale pink/white seed. I was surprised when i first opened it because i have been always getting the dark red seed varity. I thought i got a bad fruit. After I taste it, those pale pink/white seed taste sweeter and it is less tart than the drak red varity. It is also more juicy. I LOVED it!!"


In [12]:
# NOTE(review): the recorded results for this query are off-topic (thickeners,
# parsley, oats) — investigate before relying on follow-up queries.
query_it("do nutmeg and cinnamon go together?")

Preprocessed data loaded from : /Users/ataago/Documents/git/TF4ces/TF4ces-search-engine/dataset/preprocessed/ensemble/tfidf


Pre-Processing Queries:   0%|          | 0/1 [00:00<?, ?it/s]

Filter Model [tfidf] : Retrieving top 3000 Docs for queries(1) documents(119461).
Model [TF-IDF] : Vector embeddings generated for queries(1) and docs (119461)
Model [TF-IDF] : Finding cosine similarities between Queries & Docs...


Voting Model [all-mpnet-base-v2] : Loading Embeddings:   0%|          | 0/2885 [00:00<?, ?it/s]

Voting Model [all-roberta-large-v1] : Loading Embeddings:   0%|          | 0/2885 [00:00<?, ?it/s]

Retrieve Top 100 docs using Voter Model [all-mpnet-base-v2]:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieve Top 100 docs using Voter Model [all-roberta-large-v1]:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,do nutmeg and cinnamon go together?
33083,"Powdered sugar doesn't seem like a great thickener to me. It takes a lot to thicken a small amount of liquid, and as you've noticed, that means it'll end up pretty sweet. As an example, you can make a glaze (e.g. for cinnamon rolls) with a cup of powdered sugar and only a few tablespoons of liquid. Generally, people thicken things using starch, most commonly flour and cornstarch, but also things like potato starch, arrowroot powder, tapioca/cassava starch. They're stronger thickeners, and don't really have much flavor of their own. So I'd use as much sugar as you want to balance the pomegranate's sourness, and then use starch to thicken. You probably won't need extra sour at that point, but if you want it, you could certainly add something like lemon juice or vinegar."
39748,"Probably harmless. I'd go back to the store and tell them about it. If unsure, do not eat them and find another store."
33164,"I just used them in my chicken noodle soup - some cooked, some at the very last minute - and it was delicious! So much better than spending money on fresh parsley, which usually goes bad in my fridge before I can use all of it."
41814,"I'm pretty sure it's just crushing the husks a bit so they crack open - that's how I do it when I see ""lightly crushed"" for cardamom pods. It gives access to the seeds inside so flavor can infuse out of the pod and into the dish. The whole pod should be visible in the recipe, and removed before eating (would be a woody bite, else). If you crush the husk in smaller bits, which would be needed to get to & crush the seeds, it would be harder to find and fish out, and more likely some huskish bit would find its way into and leave its texture in a bite. Its similar to how cinnamon would be used in big chunks of a stick that can be fished out, or else ground really finely into dust, but not left in a dish in in-between-sized little fragments - the texture is just not desirable If the recipe wanted crushed or ground seeds to release the flavor, it would have asked for seeds instead of (or as well as) a whole pod."
41685,"Yes, they're called unstabilized oats, and they can go rancid, so must be refrigerated."


In [13]:
# NOTE(review): the recorded results for this query are off-topic (mostly about
# seeds, none about flipping eggs) — investigate before relying on follow-up queries.
query_it("How do I flip an egg when preparing it over easy?")

Preprocessed data loaded from : /Users/ataago/Documents/git/TF4ces/TF4ces-search-engine/dataset/preprocessed/ensemble/tfidf


Pre-Processing Queries:   0%|          | 0/1 [00:00<?, ?it/s]

Filter Model [tfidf] : Retrieving top 3000 Docs for queries(1) documents(119461).
Model [TF-IDF] : Vector embeddings generated for queries(1) and docs (119461)
Model [TF-IDF] : Finding cosine similarities between Queries & Docs...


Voting Model [all-mpnet-base-v2] : Loading Embeddings:   0%|          | 0/2707 [00:00<?, ?it/s]

Voting Model [all-roberta-large-v1] : Loading Embeddings:   0%|          | 0/2707 [00:00<?, ?it/s]

Retrieve Top 100 docs using Voter Model [all-mpnet-base-v2]:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieve Top 100 docs using Voter Model [all-roberta-large-v1]:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,How do I flip an egg when preparing it over easy?
32334,"Pumpkin seeds roasted with a little of salty water is a great (and commonly sold and home made in MÃ©xico) snack. They are also used along with sunflower seeds in preparing some hot sauces and Mole (the hard way). Seeds of Cantaloupe (melon) and Watermelon can be blendend along with their edible parts to strengthen flavor and keep freshness when preparing fruit flavored/fresh water (""Agua fresca""). When watermelon seeds are really small, you can eat and chew them along with the fruit: they are usually sweet."
3710,"I think it depends on the dish. If it is a biryani or other rice dish, I'd be ok leaving them in because they are easy for the eater to see and remove. But if it is a wet curry for example, I'd probably do what Manako says - extract the seeds at the beginning rather than try to fish around for the pod later."
5414,"These are the seeds: These are the leaves of the more common variety (there are many others): While dried seeds are full of aroma and flavour, dried leaves are not. BTW, it is VERY easy to grow cilantro (as parsley) in a pot, just use the seeds ..."
927,"When they're still young, I grill them -- trim ends cut into planks about 1/4"" to 3/8"" thick (~1cm) toss in olive oil sprinkle with salt grill over direct heat flip when you develop good char marks. pull from the grill once the other side is slightly charred. As they get older, the seed cavity starts developing -- you can cut the sides off, leaving the middle, but it's just not as sweet."
10991,"I wouldn't trust any store brand to omit seeds entirely; it's just not going to be a priority for them. I'd suggest making your own instead. It's fairly easy and fast to seed tomatoes using a chinois, and to seed peppers using a knife. And it'll taste better, too!"


## Thank you