In [1]:
import pandas as pd
from collections import defaultdict

In [2]:
# 1. Read the TREC24 product search golden set (includes the QUERY field) from parquet
goldens_df = pd.read_parquet(path="/notebook/trec24_golden.parquet")

# 2. Extract the mainterm from each QUERY using a simple heuristic:
#   - If the query contains a signal word (e.g., "for", "with", "without"), the mainterm is
#     the substring before the first signal word.
#   - Otherwise the mainterm is None.
#   Result: query_to_mainterm[<QUERY_ID as str>] == {"query": <str>, "mainterm": <str or None>}
query_to_mainterm = defaultdict(dict)
supplemental_signals = {'for', 'with', 'without'}

# Walk the two columns in lockstep instead of indexing with goldens_df.QUERY_ID[i]:
# Series[...] with an integer is a label lookup, so it only works when the index
# happens to be the default 0..n-1 range.
for raw_query_id, raw_query in zip(goldens_df["QUERY_ID"], goldens_df["QUERY"]):
    query_id = str(raw_query_id)
    query = str(raw_query)
    tokens = query.split()
    mainterm = None
    for position, token in enumerate(tokens):
        # Strip commas and periods in a single pass so mixed punctuation such as
        # "for,." is still recognised as a signal word.
        if token.lower().strip(',.') in supplemental_signals:
            # NOTE: a query that starts with a signal word yields an empty-string
            # mainterm here (falsy, but not None).
            mainterm = ' '.join(tokens[:position])
            break
    query_to_mainterm[query_id]["query"] = query
    query_to_mainterm[query_id]["mainterm"] = mainterm

# 3. Attach a SOFTBOOST column to goldens_df for Cortex Search:
#    one {"phrase": mainterm} entry when the query has a mainterm, else an empty list.
def _softboost_for(query_id):
    """Return the soft-boost list for the given QUERY_ID value."""
    boosted_phrase = query_to_mainterm[str(query_id)]["mainterm"]
    if not boosted_phrase:
        return []
    return [{"phrase": boosted_phrase}]


goldens_df["SOFTBOOST"] = goldens_df["QUERY_ID"].apply(_softboost_for)

# 4. Sanity check: among the first 20 entries, show every query that produced a mainterm.
for position, entry in enumerate(query_to_mainterm.values()):
    if position >= 20:
        break
    if not entry["mainterm"]:
        continue
    print(f'query: {entry["query"]}')
    print(f'mainterm: {entry["mainterm"]}')
    print()

query: plantar fasciiti brace with ball
mainterm: plantar fasciiti brace

query: refill ink kit for printer
mainterm: refill ink kit

query: key fob cover for 4 runner
mainterm: key fob cover

query: phone cover for iphone 8 plus
mainterm: phone cover

query: wireless game controller for ipad
mainterm: wireless game controller

query: replacement cushion for headphone
mainterm: replacement cushion

