In [None]:
pip install langdetect pandas numpy scikit-learn tqdm umap-learn


Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993223 sha256=e5d1a18188a4243450f204a80bdd30ab781ca657d9bc86ac34ff7d47079eede8
  Stored in directory: /root/.cache/pip/wheels/c1/67/88/e844b5b022812e15a52e4eaa38a1e709e99f06f6639d7e3ba7
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [None]:
pwd

'/content'

In [None]:
ls sample_data/data/


search_queries.xlsx


In [None]:
import pandas as pd

# Input workbook, sample size and sampling seed for this notebook run.
DATA_PATH = "sample_data/data/search_queries.xlsx"
SAMPLE_N = 1000
SEED = 42

df = pd.read_excel(DATA_PATH)

# Resolve the query column: match a known header name case-insensitively,
# otherwise fall back to the first column of the sheet.
candidates = {str(col).strip().lower(): col for col in df.columns}
query_col = next(
    (
        candidates[name]
        for name in ("query", "queries", "search query", "search queries")
        if name in candidates
    ),
    df.columns[0],
)

# Keep only the query text: drop missing values, trim whitespace, then discard
# empty strings and exact duplicates.
df_raw = (
    df[[query_col]]
    .rename(columns={query_col: "raw_query"})
    .dropna()
    .assign(raw_query=lambda frame: frame["raw_query"].astype(str).str.strip())
    .loc[lambda frame: frame["raw_query"].str.len() > 0]
    .drop_duplicates("raw_query")
    .reset_index(drop=True)
)

# Seeded random sample, capped at the number of unique queries available.
sample_size = min(SAMPLE_N, len(df_raw))
df_sample = df_raw.sample(n=sample_size, random_state=SEED).reset_index(drop=True)

print("Using column:", query_col)
print("Unique queries:", len(df_raw))
print("Sample size:", len(df_sample))

df_sample.head(10)


Using column: Search queries
Unique queries: 156551
Sample size: 1000


Unnamed: 0,raw_query
0,"hilton head, south carolina"
1,grand park cozumel
2,pet friendly hotels puerto rico
3,airbnb williamstown kentucky
4,gorge campground
5,must do in paris
6,ireland cottages
7,lodge asheville
8,ciudad de tulum
9,condé nast santa fe hotels


In [None]:
import re, json
import numpy as np
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple
from tqdm import tqdm

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import IncrementalPCA

from sentence_transformers import SentenceTransformer

# Confidence thresholds: a score below CONF_LOW is labelled "low", a score
# below CONF_MED is labelled "medium", anything else "high".
CONF_LOW = 0.40
CONF_MED = 0.70
# Seed handed to the clustering step as its random_state.
RANDOM_SEED = 42

# Sentence-transformers checkpoint used to embed canonical queries.
DEFAULT_ST_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# Default batch size of embed_texts().
EMBED_BATCH_SIZE = 256
CLUSTER_BATCH_SIZE = 4096
PCA_COMPONENTS = 64  # set to None to skip PCA
PCA_BATCH_SIZE = 4096

# Ontology: canonical name -> surface variants that should be rewritten to it.
CANONICAL_MAP: Dict[str, List[str]] = {
    # --- Lodging and bundled products ---
    "vacation_rental": [
        "vacation rental", "holiday rental", "home rental", "house rental",
        "apartment rental", "condo rental", "villa rental", "cabin rental",
        "properties for rent", "property for rent", "lodging", "accommodations", "places to stay",
        "short term rental", "short-term rental", "vacation home", "rental home",
    ],
    "hotel": [
        "hotel", "hotels", "motel", "motels", "inn", "inns", "resort", "resorts",
        "bed and breakfast", "bed & breakfast", "b&b", "bnb",
        "guest house", "guesthouse", "hostel", "hostels", "lodge", "lodges",
    ],
    "package": [
        "vacation package", "travel package", "package deals", "bundle deals",
        "flight+hotel", "flight and hotel", "hotel and flight", "all inclusive package", "all-inclusive package",
    ],
    "all_inclusive": ["all inclusive", "all-inclusive"],
    "cruise": ["cruise", "cruises", "shore excursion", "shore excursions"],

    # --- Ground / water transport ---
    "car_rental": ["car rental", "rent a car", "rental car", "car hire", "vehicle rental", "auto rental"],
    "train": ["train", "trains", "rail", "railway", "amtrak", "eurail", "train tickets"],
    "bus": ["bus", "buses", "coach", "greyhound", "flixbus", "bus tickets"],
    "ferry": ["ferry", "ferries", "water taxi"],

    # --- Activities (every name not listed in apply_canonicalization's non_activity set) ---
    "beach": ["beach", "beaches"],
    "hiking": ["hiking", "hike", "nature trail"],
    "snowboarding": ["snowboarding", "snowboard"],
    "skiing": ["skiing", "ski"],
    "tour": ["tour", "tours", "guided tour", "walking tour"],
    "museum": ["museum", "museums"],
    "sightseeing": ["sightseeing", "things to do", "attractions", "tourist attractions"],

    # --- Query modifiers ---
    "deal_modifier": ["deals", "deal", "discount", "promo", "coupon", "sale", "last minute", "last-minute", "cheap"],
    "time_modifier": ["tonight", "today", "tomorrow", "weekend", "this weekend", "next weekend"],
}

# Reverse index: surface variant -> canonical name.
VARIANT_TO_CANON: Dict[str, str] = {
    variant: canonical_name
    for canonical_name, variant_list in CANONICAL_MAP.items()
    for variant in variant_list
}

# Brand names are folded into the generic product they stand for.
BRAND_TO_GENERIC = dict(airbnb="vacation_rental", vrbo="vacation_rental", homeaway="vacation_rental")
# Whole-token abbreviations expanded by normalize_query().
ABBREV_MAP = dict(nyc="new york city", lax="los angeles", sfo="san francisco")

# Whole-word regexes that detect a product vertical in a query.
#
# These patterns are evaluated against the *canonical* query, i.e. after
# apply_canonicalization() has rewritten surface variants to their canonical
# token. The "package" and "car_rental" patterns therefore also accept the
# canonical tokens `package` and `car_rental` (an underscore is not matched by
# `\s*`); without them a query such as "car rental miami" is rewritten to
# "car_rental miami" and no longer recognised. "stays" already accepts the
# canonical `vacation_rental` token for the same reason.
REGEX_SIGNALS = {
    "package": r"\b(vacation\s*package|travel\s*package|package\s*deal|package\s*deals|bundle|bundles|flight\s*\+\s*hotel|flight\s*and\s*hotel|hotel\s*and\s*flight|all\s*inclusive\s*package|package)\b",
    "flights": r"\b(flight|flights|airfare|plane\s*ticket|air\s*ticket|nonstop|non-stop|round\s*trip|one\s*way|multi\s*city|layover)\b",
    "car_rental": r"\b(car\s*rental|rent\s*a\s*car|rental\s*car|car\s*hire|vehicle\s*rental|car_rental)\b",
    "train": r"\b(train|trains|rail|railway|amtrak|eurail|train\s*ticket|train\s*tickets)\b",
    "bus": r"\b(bus|buses|coach|greyhound|flixbus|bus\s*ticket|bus\s*tickets)\b",
    "ferry": r"\b(ferry|ferries|water\s*taxi)\b",
    "cruise": r"\b(cruise|cruises|shore\s*excursion|shore\s*excursions)\b",
    "stays": r"\b(hotel|hotels|motel|inn|resort|bnb|b&b|bed\s*and\s*breakfast|vacation_rental|lodging|accommodations|places\s*to\s*stay|hostel|guesthouse)\b",
}

# Lower-case US state names plus a handful of postal abbreviations.
US_STATES = {
    "alabama","alaska","arizona","arkansas","california","colorado","connecticut","delaware",
    "florida","georgia","hawaii","idaho","illinois","indiana","iowa","kansas","kentucky","louisiana",
    "maine","maryland","massachusetts","michigan","minnesota","mississippi","missouri","montana",
    "nebraska","nevada","new hampshire","new jersey","new mexico","new york","north carolina",
    "north dakota","ohio","oklahoma","oregon","pennsylvania","rhode island","south carolina",
    "south dakota","tennessee","texas","utah","vermont","virginia","washington","west virginia",
    "wisconsin","wyoming",
    # optional abbreviations
    "ca","ny","tx","fl","wa","il","pa","ma","ga","nc","sc"
}

# Lower-case country names / abbreviations treated as a geographic hint.
COUNTRY_HINTS = {
    "usa","us","united states","canada","mexico","uk","united kingdom",
    "france","italy","spain","japan","india"
}


def _compile_place_pattern(names):
    """Build a single whole-word regex that matches any of the given place names."""
    # Longest names first so a multi-word name wins over a shorter alternative.
    ordered = sorted(names, key=lambda name: (-len(name), name))
    return re.compile(r"\b(?:" + "|".join(re.escape(name) for name in ordered) + r")\b")


_US_STATE_RE = _compile_place_pattern(US_STATES)
_COUNTRY_RE = _compile_place_pattern(COUNTRY_HINTS)

# REGEX_SIGNALS entries whose match means the query names a product.
_PRODUCT_SIGNAL_KEYS = ("stays", "flights", "package", "cruise", "car_rental", "train", "bus", "ferry")


def is_destination_only_query(q: str) -> bool:
    """Return True when a query names a place but no travel product.

    A query qualifies when it has at least two whitespace-separated tokens
    (commas count as separators), matches none of the product patterns in
    REGEX_SIGNALS, and carries at least one geographic hint: a US state, a
    country hint, or a comma.

    Place names are matched as whole words. Plain substring matching would let
    "us" fire inside "must" or "houston", and per-token matching can never see
    multi-word names such as "north carolina".

    Args:
        q: Query text; None is treated as an empty string.

    Returns:
        True if the query looks like a bare destination, else False.
    """
    q = (q or "").lower().strip()
    toks = q.replace(",", " ").split()

    # explicit geo signals
    has_state = _US_STATE_RE.search(q) is not None
    has_country = _COUNTRY_RE.search(q) is not None
    has_comma = "," in q

    # product signals (shared with vertical routing)
    has_product = any(re.search(REGEX_SIGNALS[key], q) for key in _PRODUCT_SIGNAL_KEYS)

    return (not has_product) and (has_state or has_country or has_comma) and len(toks) >= 2


def is_english(text: str) -> Tuple[bool, float]:
    """Decide whether a query is English and report the detector's confidence.

    Uses langdetect when it is importable and can process the text. If the
    import or the detection raises, falls back to the share of ASCII letters
    among the non-whitespace characters.

    Args:
        text: Query text; None or blank input is reported as non-English.

    Returns:
        (is_english, confidence). With langdetect, is_english requires the top
        language to be "en" with probability >= 0.80 and confidence is that top
        probability. In the fallback, is_english requires a letter ratio
        >= 0.60 and confidence is the ratio.
    """
    t = (text or "").strip()
    if not t:
        return False, 0.0
    try:
        from langdetect import DetectorFactory, detect_langs

        # langdetect samples randomly; pin its seed so repeated runs of the
        # notebook classify the same query the same way.
        DetectorFactory.seed = 0

        langs = detect_langs(t)
        if not langs:
            return False, 0.0
        top = langs[0]
        return (top.lang == "en" and top.prob >= 0.80), float(top.prob)
    except Exception:
        # Deliberate best-effort: covers a missing langdetect install as well
        # as detection errors on text without usable features.
        letters = re.findall(r"[A-Za-z]", t)
        nonspace = re.findall(r"\S", t)
        ratio = len(letters) / max(1, len(nonspace))
        return (ratio >= 0.60), float(ratio)

@dataclass
class RunPolicy:
    """Options for a single Phase 0 extraction pass over one query."""
    # When True, normalize_query() drops the words listed in FILLER_WORDS.
    remove_fillers: bool

# Low-information words that normalize_query() may strip on request.
FILLER_WORDS = {"best","top","near"}

def normalize_query(q: str, remove_fillers: bool) -> str:
    """Lower-case and tidy a raw query ahead of canonicalization.

    Steps: lower-case and trim, protect "b&b", expand "&" to "and", expand
    whole-token abbreviations from ABBREV_MAP, optionally drop FILLER_WORDS,
    and collapse whitespace to single spaces.

    Args:
        q: Raw query text; None is treated as an empty string.
        remove_fillers: Drop tokens listed in FILLER_WORDS when True.

    Returns:
        The normalized query string.
    """
    q = (q or "").lower().strip()
    # "b&b" has to be rewritten before "&" is expanded; otherwise it becomes
    # "b and b", which no lodging variant or stays pattern recognises.
    q = re.sub(r"\bb\s*&\s*b\b", "bnb", q)
    q = q.replace("&", " and ")
    # split() already collapses runs of whitespace.
    toks = [ABBREV_MAP.get(t, t) for t in q.split()]
    if remove_fillers:
        toks = [t for t in toks if t not in FILLER_WORDS]
    return " ".join(toks)

def find_variants(text: str) -> List[Tuple[str, str]]:
    """List every known surface variant that occurs in ``text`` as whole words.

    Variants with more words are reported first, so callers that substitute in
    order handle multi-word phrases before their single-word parts.

    Args:
        text: Query text to scan.

    Returns:
        (variant, canonical_name) pairs in descending order of variant word count.
    """
    by_word_count = sorted(
        VARIANT_TO_CANON,
        key=lambda variant: len(variant.split()),
        reverse=True,
    )
    return [
        (variant, VARIANT_TO_CANON[variant])
        for variant in by_word_count
        if re.search(r"\b" + re.escape(variant) + r"\b", text)
    ]

def apply_canonicalization(cleaned_q: str) -> Tuple[str, Optional[List[str]], List[str], Dict[str, Any]]:
    """Rewrite a normalized query into canonical vocabulary and extract signals.

    1. Brand names in BRAND_TO_GENERIC are replaced by their generic token
       (recorded as RC_BOUNDED_INFERENCE).
    2. Every variant reported by find_variants() is replaced by its canonical
       name, unless the query mentions variants of both "hotel" and
       "vacation_rental"; then the text is left alone and
       RC_CANONICAL_AMBIGUOUS is recorded.
    3. Canonical names outside the lodging/transport/modifier set are
       reported as activity types (recorded as RC_BOUNDED_INFERENCE).

    Args:
        cleaned_q: Query text as produced by normalize_query().

    Returns:
        (canonical_query, activity_types or None, sorted unique reason codes,
        signals), where signals holds the 0/1 flags "has_deal_modifier" and
        "has_time_modifier".
    """
    reason_codes: List[str] = []
    signals: Dict[str, Any] = {}
    canonical_q = cleaned_q

    for brand, generic in BRAND_TO_GENERIC.items():
        brand_re = r"\b" + re.escape(brand) + r"\b"
        if re.search(brand_re, canonical_q):
            canonical_q = re.sub(brand_re, generic, canonical_q)
            reason_codes.append("RC_BOUNDED_INFERENCE")

    # Snapshot the text before variants are rewritten. The modifier flags must
    # be read from this text: once "cheap" or "weekend" has been replaced by
    # "deal_modifier" / "time_modifier", the word-boundary patterns below can
    # no longer match and the flags would always be 0.
    modifier_text = canonical_q

    matches = find_variants(canonical_q)

    lodging_hits = [c for _, c in matches if c in {"hotel", "vacation_rental"}]
    if len(set(lodging_hits)) >= 2:
        reason_codes.append("RC_CANONICAL_AMBIGUOUS")
    else:
        for v, c in matches:
            canonical_q = re.sub(r"\b" + re.escape(v) + r"\b", c, canonical_q)

    non_activity = {"hotel","vacation_rental","package","all_inclusive","cruise","car_rental","train","bus","ferry","deal_modifier","time_modifier"}
    activity_types: List[str] = []
    for _, c in matches:
        if c not in non_activity and c not in activity_types:
            activity_types.append(c)
    if activity_types:
        reason_codes.append("RC_BOUNDED_INFERENCE")

    signals["has_deal_modifier"] = int(re.search(r"\b(deals?|cheap|discount|promo|coupon|sale|last\s*minute|last-minute)\b", modifier_text) is not None)
    signals["has_time_modifier"] = int(re.search(r"\b(tonight|today|tomorrow|weekend|this\s*weekend|next\s*weekend)\b", modifier_text) is not None)
    return canonical_q, (activity_types if activity_types else None), sorted(set(reason_codes)), signals

def route_vertical(canonical_q: str, activity_types: Optional[List[str]]) -> Tuple[str, List[str]]:
    """Pick the primary product vertical for a canonical query.

    Regex-based verticals are tried in a fixed priority order (packages,
    flights, ground transport, cruises, stays). If none match, a query with
    detected activity types routes to "activities", a bare destination routes
    to "destination_only", and everything else is "unknown".

    Args:
        canonical_q: Canonicalized query text.
        activity_types: Activity names found during canonicalization, or None.

    Returns:
        (vertical, reason_codes) for the first rule that applies.
    """
    # (vertical, REGEX_SIGNALS keys) in priority order; first hit wins.
    priority_rules = (
        ("packages", ("package",)),
        ("flights", ("flights",)),
        ("ground_transport", ("car_rental", "train", "bus", "ferry")),
        ("cruises", ("cruise",)),
        ("stays", ("stays",)),
    )
    for vertical, signal_keys in priority_rules:
        if any(re.search(REGEX_SIGNALS[key], canonical_q) for key in signal_keys):
            return vertical, ["RC_VERTICAL_FROM_REGEX"]

    # Activities from ontology
    if activity_types:
        return "activities", ["RC_BOUNDED_INFERENCE"]

    # Destination-only routing
    if is_destination_only_query(canonical_q):
        return "destination_only", ["RC_DESTINATION_ONLY"]

    return "unknown", ["RC_VERTICAL_UNKNOWN"]


def detect_non_travel_ambiguity(canonical_q: str, vertical: str) -> bool:
    """Flag queries that probably are not a travel-product search.

    A query is flagged when it looks like a street address (a number of three
    or more digits together with a street-type word), when it is exactly a
    bare holiday name, or when it has at most two tokens and no vertical was
    found.

    Args:
        canonical_q: Canonicalized query text.
        vertical: Vertical chosen by route_vertical().

    Returns:
        True if any of the ambiguity heuristics fires.
    """
    query = canonical_q.strip()

    looks_like_street_address = (
        re.search(r"\b\d{3,}\b", query) is not None
        and re.search(r"\b(st|rd|ave|dr|road|blvd|lane|ln)\b", query) is not None
    )
    is_bare_holiday = query in {"christmas","thanksgiving","easter"}
    is_short_unknown = vertical == "unknown" and len(query.split()) <= 2

    return looks_like_street_address or is_bare_holiday or is_short_unknown

def confidence_score(vertical: str, canonical_q: str, reason_codes: List[str]) -> float:
    """Score how confident the routing is, as a float in [0, 1].

    Starts from a per-vertical base value (0.20 for any vertical without an
    entry), caps it at 0.70 when bounded inference was used, and caps it at
    0.40 for queries with at most one token.

    Args:
        vertical: Vertical chosen by route_vertical().
        canonical_q: Canonicalized query text.
        reason_codes: Reason codes collected so far for the query.

    Returns:
        The confidence score.
    """
    base_by_vertical = {
        "stays": 0.80,
        "activities": 0.60,
        "flights": 0.75,
        "ground_transport": 0.70,
        "packages": 0.70,
        "cruises": 0.70,
    }
    score = base_by_vertical.get(vertical, 0.20)

    caps = []
    if "RC_BOUNDED_INFERENCE" in reason_codes:
        caps.append(0.70)
    if len(canonical_q.split()) <= 1:
        caps.append(0.40)
    for cap in caps:
        score = min(score, cap)

    return float(max(0.0, min(1.0, score)))

def local_phase0_single_run(raw_query: str, policy: RunPolicy) -> Dict[str, Any]:
    """Run one full Phase 0 pass (normalize, canonicalize, route, score) on a query.

    Args:
        raw_query: Query text as entered.
        policy: Options for this pass; only ``remove_fillers`` is read.

    Returns:
        Dict with keys "canonical_query", "vertical_primary", "activity_type",
        "intent_confidence_score", "intent_confidence_label", "reason_codes"
        (sorted, unique) and "signals".
    """
    cleaned = normalize_query(raw_query, policy.remove_fillers)
    canonical_q, activity_types, canon_codes, signals = apply_canonicalization(cleaned)
    vertical, routing_codes = route_vertical(canonical_q, activity_types)

    code_set = set(canon_codes) | set(routing_codes)
    if detect_non_travel_ambiguity(canonical_q, vertical):
        code_set.add("RC_NON_TRAVEL_AMBIGUITY")
    reason_codes = sorted(code_set)

    score = confidence_score(vertical, canonical_q, reason_codes)
    if score < CONF_LOW:
        label = "low"
    elif score < CONF_MED:
        label = "medium"
    else:
        label = "high"

    return {
        "canonical_query": canonical_q,
        "vertical_primary": vertical,
        "activity_type": activity_types,
        "intent_confidence_score": score,
        "intent_confidence_label": label,
        "reason_codes": reason_codes,
        "signals": signals,
    }

def phase0_extract_all(df_raw: pd.DataFrame) -> pd.DataFrame:
    """Run Phase 0 extraction over every query in ``df_raw["raw_query"]``.

    Queries not detected as English get a fixed low-confidence "unknown" row.
    English queries are processed twice, with and without filler words, and
    the pass with the higher confidence score is kept (ties keep the pass
    that retained fillers).

    Args:
        df_raw: Frame with a "raw_query" column of strings.

    Returns:
        One row per input query. "activity_type" and "reason_codes" are stored
        as JSON strings.
    """
    records = []
    for query in tqdm(df_raw["raw_query"].tolist(), desc="Phase0 local"):
        english_ok, _ = is_english(query)

        if english_ok:
            with_fillers = local_phase0_single_run(query, RunPolicy(remove_fillers=False))
            # lightweight second run (remove weak fillers)
            without_fillers = local_phase0_single_run(query, RunPolicy(remove_fillers=True))
            if with_fillers["intent_confidence_score"] >= without_fillers["intent_confidence_score"]:
                best = with_fillers
            else:
                best = without_fillers

            flags = best["signals"]
            record = {
                "raw_query": query,
                "canonical_query": best["canonical_query"],
                "vertical_primary": best["vertical_primary"],
                "activity_type": json.dumps(best["activity_type"], ensure_ascii=False),
                "intent_confidence_score": best["intent_confidence_score"],
                "intent_confidence_label": best["intent_confidence_label"],
                "reason_codes": json.dumps(best["reason_codes"], ensure_ascii=False),
                "has_deal_modifier": int(flags.get("has_deal_modifier", 0)),
                "has_time_modifier": int(flags.get("has_time_modifier", 0)),
            }
        else:
            record = {
                "raw_query": query,
                "canonical_query": query.lower(),
                "vertical_primary": "unknown",
                "activity_type": json.dumps(None),
                "intent_confidence_score": 0.0,
                "intent_confidence_label": "low",
                "reason_codes": json.dumps(["RC_NON_ENGLISH","RC_LOW_CONFIDENCE_DUE_TO_LANGUAGE"]),
                "has_deal_modifier": 0,
                "has_time_modifier": 0,
            }

        records.append(record)

    return pd.DataFrame(records)

def has_reason_code(rc_json: str, code: str) -> bool:
    """Tell whether ``code`` is contained in a JSON-encoded reason-code value.

    Args:
        rc_json: JSON text, normally an encoded list of reason codes.
        code: Reason code to look for.

    Returns:
        Result of ``code in <decoded value>``; False if decoding or the
        membership test raises.
    """
    try:
        decoded = json.loads(rc_json)
        return code in decoded
    except Exception:
        # Undecodable or non-container input simply means "code not present".
        return False

def assign_ambiguity_bucket(df_phase0: pd.DataFrame) -> pd.Series:
    """Label each Phase 0 row as "ambiguous" or "clear".

    A row is "ambiguous" when its confidence score is missing or below
    CONF_LOW, when its reason codes include RC_NON_TRAVEL_AMBIGUITY or
    RC_NON_ENGLISH, or when its canonical query has at most one token.

    Args:
        df_phase0: Frame with "intent_confidence_score", "reason_codes"
            (JSON strings) and "canonical_query" columns.

    Returns:
        Series of "ambiguous"/"clear" labels sharing the index of ``df_phase0``,
        as the return annotation promises (np.where alone yields a bare
        ndarray that has lost the index).
    """
    low_conf = df_phase0["intent_confidence_score"].fillna(0.0) < CONF_LOW
    non_travel = df_phase0["reason_codes"].apply(lambda s: has_reason_code(s, "RC_NON_TRAVEL_AMBIGUITY"))
    non_english = df_phase0["reason_codes"].apply(lambda s: has_reason_code(s, "RC_NON_ENGLISH"))
    too_short = df_phase0["canonical_query"].fillna("").apply(lambda s: len(str(s).split()) <= 1)

    is_ambiguous = low_conf | non_travel | non_english | too_short
    return pd.Series(
        np.where(is_ambiguous, "ambiguous", "clear"),
        index=df_phase0.index,
    )

def build_symbolic_features(df_phase0: pd.DataFrame) -> pd.DataFrame:
    """Derive the symbolic (non-embedding) feature columns from Phase 0 output.

    Adds:
      * "conf_bucket": "low" / "medium" / "high" from the confidence score,
        using left-closed bins so a score of exactly CONF_LOW is "medium" and
        exactly CONF_MED is "high", the same rule that produces
        "intent_confidence_label".
      * "used_bounded_inference": 1 if RC_BOUNDED_INFERENCE is among the row's
        reason codes, else 0.
      * "activity_primary": first entry of the JSON-encoded activity list, or
        "none".

    Args:
        df_phase0: Frame produced by phase0_extract_all(); it is not modified.

    Returns:
        New frame restricted to the query text and symbolic feature columns.
    """
    out = df_phase0.copy()
    out["conf_bucket"] = pd.cut(
        out["intent_confidence_score"].fillna(0.0),
        bins=[-0.01, CONF_LOW, CONF_MED, 1.01],
        labels=["low", "medium", "high"],
        # Left-closed intervals: [CONF_LOW, CONF_MED) is "medium". The default
        # right-closed bins would put 0.40 in "low" and 0.70 in "medium",
        # contradicting intent_confidence_label for those common scores.
        right=False,
    ).astype(str)

    out["used_bounded_inference"] = out["reason_codes"].apply(lambda s: int(has_reason_code(s, "RC_BOUNDED_INFERENCE")))

    def first_activity(a_json: str) -> str:
        """Return the first activity in a JSON-encoded value, or "none"."""
        try:
            a = json.loads(a_json) if a_json else None
            if isinstance(a, list) and len(a) > 0:
                return a[0]
            if isinstance(a, str):
                return a
            return "none"
        except Exception:
            # Malformed JSON is treated the same as "no activity".
            return "none"

    out["activity_primary"] = out["activity_type"].apply(first_activity)

    return out[[
        "raw_query","canonical_query","vertical_primary","activity_primary",
        "conf_bucket","used_bounded_inference","intent_confidence_score",
        "has_deal_modifier","has_time_modifier"
    ]]

def fit_symbolic_encoder(sym_df: pd.DataFrame) -> ColumnTransformer:
    """Fit the encoder that turns symbolic feature columns into a numeric matrix.

    Categorical columns are imputed with their most frequent value and one-hot
    encoded (unseen categories are ignored at transform time); numeric columns
    have missing values replaced by 0.0. All other columns are dropped.

    Args:
        sym_df: Frame produced by build_symbolic_features().

    Returns:
        The fitted ColumnTransformer.
    """
    categorical_columns = ["vertical_primary", "activity_primary", "conf_bucket"]
    numeric_columns = ["used_bounded_inference", "intent_confidence_score", "has_deal_modifier", "has_time_modifier"]

    categorical_steps = Pipeline(steps=[
        ("imp", SimpleImputer(strategy="most_frequent")),
        ("oh", OneHotEncoder(handle_unknown="ignore")),
    ])
    numeric_steps = Pipeline(steps=[
        ("imp", SimpleImputer(strategy="constant", fill_value=0.0)),
    ])

    encoder = ColumnTransformer(
        transformers=[
            ("cat", categorical_steps, categorical_columns),
            ("num", numeric_steps, numeric_columns),
        ],
        remainder="drop",
    )
    encoder.fit(sym_df)
    return encoder

def embed_texts(model: SentenceTransformer, texts: List[str], batch_size: int = EMBED_BATCH_SIZE) -> np.ndarray:
    """Encode texts with ``model`` and return the embeddings as a float32 array.

    Args:
        model: Sentence-transformers model used for encoding.
        texts: Strings to embed.
        batch_size: Batch size forwarded to ``model.encode``.

    Returns:
        float32 array of the embeddings, requested with
        ``normalize_embeddings=True``.
    """
    vectors = model.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=True,
        normalize_embeddings=True,
    )
    return np.asarray(vectors, dtype=np.float32)


In [None]:
import re, json
import numpy as np
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple
from tqdm import tqdm

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import IncrementalPCA

from sentence_transformers import SentenceTransformer

# Confidence thresholds: a score below CONF_LOW is labelled "low", a score
# below CONF_MED is labelled "medium", anything else "high".
CONF_LOW = 0.40
CONF_MED = 0.70
# Seed handed to the clustering step as its random_state.
RANDOM_SEED = 42

# Sentence-transformers checkpoint used to embed canonical queries.
DEFAULT_ST_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# Default batch size of embed_texts().
EMBED_BATCH_SIZE = 256
CLUSTER_BATCH_SIZE = 4096
PCA_COMPONENTS = 64  # set to None to skip PCA
PCA_BATCH_SIZE = 4096

# Ontology: canonical name -> surface variants that should be rewritten to it.
CANONICAL_MAP: Dict[str, List[str]] = {
    # --- Lodging and bundled products ---
    "vacation_rental": [
        "vacation rental", "holiday rental", "home rental", "house rental",
        "apartment rental", "condo rental", "villa rental", "cabin rental",
        "properties for rent", "property for rent", "lodging", "accommodations", "places to stay",
        "short term rental", "short-term rental", "vacation home", "rental home",
    ],
    "hotel": [
        "hotel", "hotels", "motel", "motels", "inn", "inns", "resort", "resorts",
        "bed and breakfast", "bed & breakfast", "b&b", "bnb",
        "guest house", "guesthouse", "hostel", "hostels", "lodge", "lodges",
    ],
    "package": [
        "vacation package", "travel package", "package deals", "bundle deals",
        "flight+hotel", "flight and hotel", "hotel and flight", "all inclusive package", "all-inclusive package",
    ],
    "all_inclusive": ["all inclusive", "all-inclusive"],
    "cruise": ["cruise", "cruises", "shore excursion", "shore excursions"],

    # --- Ground / water transport ---
    "car_rental": ["car rental", "rent a car", "rental car", "car hire", "vehicle rental", "auto rental"],
    "train": ["train", "trains", "rail", "railway", "amtrak", "eurail", "train tickets"],
    "bus": ["bus", "buses", "coach", "greyhound", "flixbus", "bus tickets"],
    "ferry": ["ferry", "ferries", "water taxi"],

    # --- Activities (every name not listed in apply_canonicalization's non_activity set) ---
    "beach": ["beach", "beaches"],
    "hiking": ["hiking", "hike", "nature trail"],
    "snowboarding": ["snowboarding", "snowboard"],
    "skiing": ["skiing", "ski"],
    "tour": ["tour", "tours", "guided tour", "walking tour"],
    "museum": ["museum", "museums"],
    "sightseeing": ["sightseeing", "things to do", "attractions", "tourist attractions"],

    # --- Query modifiers ---
    "deal_modifier": ["deals", "deal", "discount", "promo", "coupon", "sale", "last minute", "last-minute", "cheap"],
    "time_modifier": ["tonight", "today", "tomorrow", "weekend", "this weekend", "next weekend"],
}

# Reverse index: surface variant -> canonical name.
VARIANT_TO_CANON: Dict[str, str] = {
    variant: canonical_name
    for canonical_name, variant_list in CANONICAL_MAP.items()
    for variant in variant_list
}

# Brand names are folded into the generic product they stand for.
BRAND_TO_GENERIC = dict(airbnb="vacation_rental", vrbo="vacation_rental", homeaway="vacation_rental")
# Whole-token abbreviations expanded by normalize_query().
ABBREV_MAP = dict(nyc="new york city", lax="los angeles", sfo="san francisco")

# Whole-word regexes that detect a product vertical in a query.
#
# These patterns are evaluated against the *canonical* query, i.e. after
# apply_canonicalization() has rewritten surface variants to their canonical
# token. The "package" and "car_rental" patterns therefore also accept the
# canonical tokens `package` and `car_rental` (an underscore is not matched by
# `\s*`); without them a query such as "car rental miami" is rewritten to
# "car_rental miami" and no longer recognised. "stays" already accepts the
# canonical `vacation_rental` token for the same reason.
REGEX_SIGNALS = {
    "package": r"\b(vacation\s*package|travel\s*package|package\s*deal|package\s*deals|bundle|bundles|flight\s*\+\s*hotel|flight\s*and\s*hotel|hotel\s*and\s*flight|all\s*inclusive\s*package|package)\b",
    "flights": r"\b(flight|flights|airfare|plane\s*ticket|air\s*ticket|nonstop|non-stop|round\s*trip|one\s*way|multi\s*city|layover)\b",
    "car_rental": r"\b(car\s*rental|rent\s*a\s*car|rental\s*car|car\s*hire|vehicle\s*rental|car_rental)\b",
    "train": r"\b(train|trains|rail|railway|amtrak|eurail|train\s*ticket|train\s*tickets)\b",
    "bus": r"\b(bus|buses|coach|greyhound|flixbus|bus\s*ticket|bus\s*tickets)\b",
    "ferry": r"\b(ferry|ferries|water\s*taxi)\b",
    "cruise": r"\b(cruise|cruises|shore\s*excursion|shore\s*excursions)\b",
    "stays": r"\b(hotel|hotels|motel|inn|resort|bnb|b&b|bed\s*and\s*breakfast|vacation_rental|lodging|accommodations|places\s*to\s*stay|hostel|guesthouse)\b",
}

def is_english(text: str) -> Tuple[bool, float]:
    """Decide whether a query is English and report the detector's confidence.

    Uses langdetect when it is importable and can process the text. If the
    import or the detection raises, falls back to the share of ASCII letters
    among the non-whitespace characters.

    Args:
        text: Query text; None or blank input is reported as non-English.

    Returns:
        (is_english, confidence). With langdetect, is_english requires the top
        language to be "en" with probability >= 0.80 and confidence is that top
        probability. In the fallback, is_english requires a letter ratio
        >= 0.60 and confidence is the ratio.
    """
    t = (text or "").strip()
    if not t:
        return False, 0.0
    try:
        from langdetect import DetectorFactory, detect_langs

        # langdetect samples randomly; pin its seed so repeated runs of the
        # notebook classify the same query the same way.
        DetectorFactory.seed = 0

        langs = detect_langs(t)
        if not langs:
            return False, 0.0
        top = langs[0]
        return (top.lang == "en" and top.prob >= 0.80), float(top.prob)
    except Exception:
        # Deliberate best-effort: covers a missing langdetect install as well
        # as detection errors on text without usable features.
        letters = re.findall(r"[A-Za-z]", t)
        nonspace = re.findall(r"\S", t)
        ratio = len(letters) / max(1, len(nonspace))
        return (ratio >= 0.60), float(ratio)

@dataclass
class RunPolicy:
    """Options for a single Phase 0 extraction pass over one query."""
    # When True, normalize_query() drops the words listed in FILLER_WORDS.
    remove_fillers: bool

# Low-information words that normalize_query() may strip on request.
FILLER_WORDS = {"best","top","near"}

def normalize_query(q: str, remove_fillers: bool) -> str:
    """Lower-case and tidy a raw query ahead of canonicalization.

    Steps: lower-case and trim, protect "b&b", expand "&" to "and", expand
    whole-token abbreviations from ABBREV_MAP, optionally drop FILLER_WORDS,
    and collapse whitespace to single spaces.

    Args:
        q: Raw query text; None is treated as an empty string.
        remove_fillers: Drop tokens listed in FILLER_WORDS when True.

    Returns:
        The normalized query string.
    """
    q = (q or "").lower().strip()
    # "b&b" has to be rewritten before "&" is expanded; otherwise it becomes
    # "b and b", which no lodging variant or stays pattern recognises.
    q = re.sub(r"\bb\s*&\s*b\b", "bnb", q)
    q = q.replace("&", " and ")
    # split() already collapses runs of whitespace.
    toks = [ABBREV_MAP.get(t, t) for t in q.split()]
    if remove_fillers:
        toks = [t for t in toks if t not in FILLER_WORDS]
    return " ".join(toks)

def find_variants(text: str) -> List[Tuple[str, str]]:
    """List every known surface variant that occurs in ``text`` as whole words.

    Variants with more words are reported first, so callers that substitute in
    order handle multi-word phrases before their single-word parts.

    Args:
        text: Query text to scan.

    Returns:
        (variant, canonical_name) pairs in descending order of variant word count.
    """
    by_word_count = sorted(
        VARIANT_TO_CANON,
        key=lambda variant: len(variant.split()),
        reverse=True,
    )
    return [
        (variant, VARIANT_TO_CANON[variant])
        for variant in by_word_count
        if re.search(r"\b" + re.escape(variant) + r"\b", text)
    ]

def apply_canonicalization(cleaned_q: str) -> Tuple[str, Optional[List[str]], List[str], Dict[str, Any]]:
    """Rewrite a normalized query into canonical vocabulary and extract signals.

    1. Brand names in BRAND_TO_GENERIC are replaced by their generic token
       (recorded as RC_BOUNDED_INFERENCE).
    2. Every variant reported by find_variants() is replaced by its canonical
       name, unless the query mentions variants of both "hotel" and
       "vacation_rental"; then the text is left alone and
       RC_CANONICAL_AMBIGUOUS is recorded.
    3. Canonical names outside the lodging/transport/modifier set are
       reported as activity types (recorded as RC_BOUNDED_INFERENCE).

    Args:
        cleaned_q: Query text as produced by normalize_query().

    Returns:
        (canonical_query, activity_types or None, sorted unique reason codes,
        signals), where signals holds the 0/1 flags "has_deal_modifier" and
        "has_time_modifier".
    """
    reason_codes: List[str] = []
    signals: Dict[str, Any] = {}
    canonical_q = cleaned_q

    for brand, generic in BRAND_TO_GENERIC.items():
        brand_re = r"\b" + re.escape(brand) + r"\b"
        if re.search(brand_re, canonical_q):
            canonical_q = re.sub(brand_re, generic, canonical_q)
            reason_codes.append("RC_BOUNDED_INFERENCE")

    # Snapshot the text before variants are rewritten. The modifier flags must
    # be read from this text: once "cheap" or "weekend" has been replaced by
    # "deal_modifier" / "time_modifier", the word-boundary patterns below can
    # no longer match and the flags would always be 0.
    modifier_text = canonical_q

    matches = find_variants(canonical_q)

    lodging_hits = [c for _, c in matches if c in {"hotel", "vacation_rental"}]
    if len(set(lodging_hits)) >= 2:
        reason_codes.append("RC_CANONICAL_AMBIGUOUS")
    else:
        for v, c in matches:
            canonical_q = re.sub(r"\b" + re.escape(v) + r"\b", c, canonical_q)

    non_activity = {"hotel","vacation_rental","package","all_inclusive","cruise","car_rental","train","bus","ferry","deal_modifier","time_modifier"}
    activity_types: List[str] = []
    for _, c in matches:
        if c not in non_activity and c not in activity_types:
            activity_types.append(c)
    if activity_types:
        reason_codes.append("RC_BOUNDED_INFERENCE")

    signals["has_deal_modifier"] = int(re.search(r"\b(deals?|cheap|discount|promo|coupon|sale|last\s*minute|last-minute)\b", modifier_text) is not None)
    signals["has_time_modifier"] = int(re.search(r"\b(tonight|today|tomorrow|weekend|this\s*weekend|next\s*weekend)\b", modifier_text) is not None)
    return canonical_q, (activity_types if activity_types else None), sorted(set(reason_codes)), signals

def route_vertical(canonical_q: str, activity_types: Optional[List[str]]) -> Tuple[str, List[str]]:
    """Pick the primary product vertical for a canonical query.

    Regex-based verticals are tried in a fixed priority order (packages,
    flights, ground transport, cruises, stays). If none match, a query with
    detected activity types routes to "activities", a bare destination routes
    to "destination_only", and everything else is "unknown".

    This definition is the one in effect after the whole notebook has run, so
    it keeps the destination-only rule that the earlier definition of this
    function introduced; dropping it here would silently disable that routing.

    Args:
        canonical_q: Canonicalized query text.
        activity_types: Activity names found during canonicalization, or None.

    Returns:
        (vertical, reason_codes) for the first rule that applies.
    """
    q = canonical_q

    def hits(signal_key: str) -> bool:
        """True when the REGEX_SIGNALS pattern for ``signal_key`` matches the query."""
        return re.search(REGEX_SIGNALS[signal_key], q) is not None

    if hits("package"):
        return "packages", ["RC_VERTICAL_FROM_REGEX"]
    if hits("flights"):
        return "flights", ["RC_VERTICAL_FROM_REGEX"]
    if hits("car_rental") or hits("train") or hits("bus") or hits("ferry"):
        return "ground_transport", ["RC_VERTICAL_FROM_REGEX"]
    if hits("cruise"):
        return "cruises", ["RC_VERTICAL_FROM_REGEX"]
    if hits("stays"):
        return "stays", ["RC_VERTICAL_FROM_REGEX"]

    # Activities from ontology
    if activity_types:
        return "activities", ["RC_BOUNDED_INFERENCE"]

    # Destination-only routing: a place is named but no product.
    if is_destination_only_query(q):
        return "destination_only", ["RC_DESTINATION_ONLY"]

    return "unknown", ["RC_VERTICAL_UNKNOWN"]

def detect_non_travel_ambiguity(canonical_q: str, vertical: str) -> bool:
    """Flag queries that probably are not a travel-product search.

    A query is flagged when it looks like a street address (a number of three
    or more digits together with a street-type word), when it is exactly a
    bare holiday name, or when it has at most two tokens and no vertical was
    found.

    Args:
        canonical_q: Canonicalized query text.
        vertical: Vertical chosen by route_vertical().

    Returns:
        True if any of the ambiguity heuristics fires.
    """
    query = canonical_q.strip()

    looks_like_street_address = (
        re.search(r"\b\d{3,}\b", query) is not None
        and re.search(r"\b(st|rd|ave|dr|road|blvd|lane|ln)\b", query) is not None
    )
    is_bare_holiday = query in {"christmas","thanksgiving","easter"}
    is_short_unknown = vertical == "unknown" and len(query.split()) <= 2

    return looks_like_street_address or is_bare_holiday or is_short_unknown

def confidence_score(vertical: str, canonical_q: str, reason_codes: List[str]) -> float:
    """Score how confident the routing is, as a float in [0, 1].

    Starts from a per-vertical base value (0.20 for any vertical without an
    entry), caps it at 0.70 when bounded inference was used, and caps it at
    0.40 for queries with at most one token.

    Args:
        vertical: Vertical chosen by route_vertical().
        canonical_q: Canonicalized query text.
        reason_codes: Reason codes collected so far for the query.

    Returns:
        The confidence score.
    """
    base_by_vertical = {
        "stays": 0.80,
        "activities": 0.60,
        "flights": 0.75,
        "ground_transport": 0.70,
        "packages": 0.70,
        "cruises": 0.70,
    }
    score = base_by_vertical.get(vertical, 0.20)

    caps = []
    if "RC_BOUNDED_INFERENCE" in reason_codes:
        caps.append(0.70)
    if len(canonical_q.split()) <= 1:
        caps.append(0.40)
    for cap in caps:
        score = min(score, cap)

    return float(max(0.0, min(1.0, score)))

def local_phase0_single_run(raw_query: str, policy: RunPolicy) -> Dict[str, Any]:
    """Run one full Phase 0 pass (normalize, canonicalize, route, score) on a query.

    Args:
        raw_query: Query text as entered.
        policy: Options for this pass; only ``remove_fillers`` is read.

    Returns:
        Dict with keys "canonical_query", "vertical_primary", "activity_type",
        "intent_confidence_score", "intent_confidence_label", "reason_codes"
        (sorted, unique) and "signals".
    """
    cleaned = normalize_query(raw_query, policy.remove_fillers)
    canonical_q, activity_types, canon_codes, signals = apply_canonicalization(cleaned)
    vertical, routing_codes = route_vertical(canonical_q, activity_types)

    code_set = set(canon_codes) | set(routing_codes)
    if detect_non_travel_ambiguity(canonical_q, vertical):
        code_set.add("RC_NON_TRAVEL_AMBIGUITY")
    reason_codes = sorted(code_set)

    score = confidence_score(vertical, canonical_q, reason_codes)
    if score < CONF_LOW:
        label = "low"
    elif score < CONF_MED:
        label = "medium"
    else:
        label = "high"

    return {
        "canonical_query": canonical_q,
        "vertical_primary": vertical,
        "activity_type": activity_types,
        "intent_confidence_score": score,
        "intent_confidence_label": label,
        "reason_codes": reason_codes,
        "signals": signals,
    }

def phase0_extract_all(df_raw: pd.DataFrame) -> pd.DataFrame:
    """Run Phase 0 extraction over every query in ``df_raw["raw_query"]``.

    Queries not detected as English get a fixed low-confidence "unknown" row.
    English queries are processed twice, with and without filler words, and
    the pass with the higher confidence score is kept (ties keep the pass
    that retained fillers).

    Args:
        df_raw: Frame with a "raw_query" column of strings.

    Returns:
        One row per input query. "activity_type" and "reason_codes" are stored
        as JSON strings.
    """
    records = []
    for query in tqdm(df_raw["raw_query"].tolist(), desc="Phase0 local"):
        english_ok, _ = is_english(query)

        if english_ok:
            with_fillers = local_phase0_single_run(query, RunPolicy(remove_fillers=False))
            # lightweight second run (remove weak fillers)
            without_fillers = local_phase0_single_run(query, RunPolicy(remove_fillers=True))
            if with_fillers["intent_confidence_score"] >= without_fillers["intent_confidence_score"]:
                best = with_fillers
            else:
                best = without_fillers

            flags = best["signals"]
            record = {
                "raw_query": query,
                "canonical_query": best["canonical_query"],
                "vertical_primary": best["vertical_primary"],
                "activity_type": json.dumps(best["activity_type"], ensure_ascii=False),
                "intent_confidence_score": best["intent_confidence_score"],
                "intent_confidence_label": best["intent_confidence_label"],
                "reason_codes": json.dumps(best["reason_codes"], ensure_ascii=False),
                "has_deal_modifier": int(flags.get("has_deal_modifier", 0)),
                "has_time_modifier": int(flags.get("has_time_modifier", 0)),
            }
        else:
            record = {
                "raw_query": query,
                "canonical_query": query.lower(),
                "vertical_primary": "unknown",
                "activity_type": json.dumps(None),
                "intent_confidence_score": 0.0,
                "intent_confidence_label": "low",
                "reason_codes": json.dumps(["RC_NON_ENGLISH","RC_LOW_CONFIDENCE_DUE_TO_LANGUAGE"]),
                "has_deal_modifier": 0,
                "has_time_modifier": 0,
            }

        records.append(record)

    return pd.DataFrame(records)

def has_reason_code(rc_json: str, code: str) -> bool:
    """Tell whether ``code`` is contained in a JSON-encoded reason-code value.

    Args:
        rc_json: JSON text, normally an encoded list of reason codes.
        code: Reason code to look for.

    Returns:
        Result of ``code in <decoded value>``; False if decoding or the
        membership test raises.
    """
    try:
        decoded = json.loads(rc_json)
        return code in decoded
    except Exception:
        # Undecodable or non-container input simply means "code not present".
        return False

def assign_ambiguity_bucket(df_phase0: pd.DataFrame) -> pd.Series:
    """Label each Phase 0 row as "ambiguous" or "clear".

    A row is "ambiguous" when its confidence score is missing or below
    CONF_LOW, when its reason codes include RC_NON_TRAVEL_AMBIGUITY or
    RC_NON_ENGLISH, or when its canonical query has at most one token.

    Args:
        df_phase0: Frame with "intent_confidence_score", "reason_codes"
            (JSON strings) and "canonical_query" columns.

    Returns:
        Series of "ambiguous"/"clear" labels sharing the index of ``df_phase0``,
        as the return annotation promises (np.where alone yields a bare
        ndarray that has lost the index).
    """
    low_conf = df_phase0["intent_confidence_score"].fillna(0.0) < CONF_LOW
    non_travel = df_phase0["reason_codes"].apply(lambda s: has_reason_code(s, "RC_NON_TRAVEL_AMBIGUITY"))
    non_english = df_phase0["reason_codes"].apply(lambda s: has_reason_code(s, "RC_NON_ENGLISH"))
    too_short = df_phase0["canonical_query"].fillna("").apply(lambda s: len(str(s).split()) <= 1)

    is_ambiguous = low_conf | non_travel | non_english | too_short
    return pd.Series(
        np.where(is_ambiguous, "ambiguous", "clear"),
        index=df_phase0.index,
    )

def build_symbolic_features(df_phase0: pd.DataFrame) -> pd.DataFrame:
    """Derive the symbolic (non-embedding) feature columns from Phase 0 output.

    Adds:
      * "conf_bucket": "low" / "medium" / "high" from the confidence score,
        using left-closed bins so a score of exactly CONF_LOW is "medium" and
        exactly CONF_MED is "high", the same rule that produces
        "intent_confidence_label".
      * "used_bounded_inference": 1 if RC_BOUNDED_INFERENCE is among the row's
        reason codes, else 0.
      * "activity_primary": first entry of the JSON-encoded activity list, or
        "none".

    Args:
        df_phase0: Frame produced by phase0_extract_all(); it is not modified.

    Returns:
        New frame restricted to the query text and symbolic feature columns.
    """
    out = df_phase0.copy()
    out["conf_bucket"] = pd.cut(
        out["intent_confidence_score"].fillna(0.0),
        bins=[-0.01, CONF_LOW, CONF_MED, 1.01],
        labels=["low", "medium", "high"],
        # Left-closed intervals: [CONF_LOW, CONF_MED) is "medium". The default
        # right-closed bins would put 0.40 in "low" and 0.70 in "medium",
        # contradicting intent_confidence_label for those common scores.
        right=False,
    ).astype(str)

    out["used_bounded_inference"] = out["reason_codes"].apply(lambda s: int(has_reason_code(s, "RC_BOUNDED_INFERENCE")))

    def first_activity(a_json: str) -> str:
        """Return the first activity in a JSON-encoded value, or "none"."""
        try:
            a = json.loads(a_json) if a_json else None
            if isinstance(a, list) and len(a) > 0:
                return a[0]
            if isinstance(a, str):
                return a
            return "none"
        except Exception:
            # Malformed JSON is treated the same as "no activity".
            return "none"

    out["activity_primary"] = out["activity_type"].apply(first_activity)

    return out[[
        "raw_query","canonical_query","vertical_primary","activity_primary",
        "conf_bucket","used_bounded_inference","intent_confidence_score",
        "has_deal_modifier","has_time_modifier"
    ]]

def fit_symbolic_encoder(sym_df: pd.DataFrame) -> ColumnTransformer:
    """Fit the encoder that turns symbolic feature columns into a numeric matrix.

    Categorical columns are imputed with their most frequent value and one-hot
    encoded (unseen categories are ignored at transform time); numeric columns
    have missing values replaced by 0.0. All other columns are dropped.

    Args:
        sym_df: Frame produced by build_symbolic_features().

    Returns:
        The fitted ColumnTransformer.
    """
    categorical_columns = ["vertical_primary", "activity_primary", "conf_bucket"]
    numeric_columns = ["used_bounded_inference", "intent_confidence_score", "has_deal_modifier", "has_time_modifier"]

    categorical_steps = Pipeline(steps=[
        ("imp", SimpleImputer(strategy="most_frequent")),
        ("oh", OneHotEncoder(handle_unknown="ignore")),
    ])
    numeric_steps = Pipeline(steps=[
        ("imp", SimpleImputer(strategy="constant", fill_value=0.0)),
    ])

    encoder = ColumnTransformer(
        transformers=[
            ("cat", categorical_steps, categorical_columns),
            ("num", numeric_steps, numeric_columns),
        ],
        remainder="drop",
    )
    encoder.fit(sym_df)
    return encoder

def embed_texts(model: SentenceTransformer, texts: List[str], batch_size: int = EMBED_BATCH_SIZE) -> np.ndarray:
    """Encode ``texts`` with ``model`` and return the result as a float32 array.

    The model is asked for normalized embeddings and always shows a progress
    bar while encoding.

    Args:
        model: Sentence-transformers model used for encoding.
        texts: Strings to embed.
        batch_size: Batch size forwarded to ``model.encode``.

    Returns:
        ``np.ndarray`` of dtype float32 holding whatever ``model.encode`` produced.
    """
    encode_options = {
        "batch_size": batch_size,
        "show_progress_bar": True,
        "normalize_embeddings": True,
    }
    vectors = model.encode(texts, **encode_options)
    return np.asarray(vectors, dtype=np.float32)


In [None]:
# Number of persona clusters fitted on the "clear" queries.
K_PERSONAS = 5

# Phase 0 signals
df_phase0 = phase0_extract_all(df_sample)

# Ambiguity bucket
df_phase0["ambiguity_bucket"] = assign_ambiguity_bucket(df_phase0)

# Symbolic features (the encoder is fitted on every row, clear and ambiguous alike)
sym = build_symbolic_features(df_phase0)
sym_encoder = fit_symbolic_encoder(sym)

# Sentence-transformers model
st_model = SentenceTransformer(DEFAULT_ST_MODEL)

# Cluster only "clear" to avoid poisoning personas
clear_mask = (df_phase0["ambiguity_bucket"] == "clear")
df_clear = df_phase0[clear_mask].reset_index(drop=True)
sym_clear = sym[clear_mask].reset_index(drop=True)

print("Clear:", len(df_clear), "Ambiguous:", (~clear_mask).sum())

# Build full X for 1k (safe to do in-memory)
emb = embed_texts(st_model, df_clear["canonical_query"].fillna("").astype(str).tolist(), batch_size=EMBED_BATCH_SIZE)

# The encoder may hand back a sparse matrix; densify it so it can be stacked
# next to the embeddings.
sym_mat = sym_encoder.transform(sym_clear)
sym_mat = sym_mat.toarray() if hasattr(sym_mat, "toarray") else np.asarray(sym_mat)
sym_mat = sym_mat.astype(np.float32, copy=False)

# One row per clear query: [sentence embedding | encoded symbolic features]
X = np.hstack([emb, sym_mat]).astype(np.float32, copy=False)

# Optional PCA
# IncrementalPCA rejects n_components larger than either the number of rows or
# the number of columns, so guard on the smaller dimension of X rather than on
# the row count alone; otherwise fall back to the unreduced matrix.
ipca = None
if PCA_COMPONENTS is not None and 0 < PCA_COMPONENTS <= min(X.shape):
    ipca = IncrementalPCA(n_components=PCA_COMPONENTS)
    ipca.fit(X)
    Xr = ipca.transform(X).astype(np.float32, copy=False)
else:
    Xr = X

# Cluster
km = MiniBatchKMeans(n_clusters=K_PERSONAS, random_state=RANDOM_SEED, batch_size=1024, n_init="auto")
labels_clear = km.fit_predict(Xr)

# Attach labels back: ambiguous = -1
# labels_clear follows df_clear's row order, which is df_phase0's order
# restricted to clear_mask, so a boolean-mask assignment lines them up.
persona_cluster = np.full(len(df_phase0), -1, dtype=np.int32)
persona_cluster[clear_mask.values] = labels_clear
df_phase0["persona_cluster"] = persona_cluster

df_phase0.head(10)


Phase0 local: 100%|██████████| 1000/1000 [00:07<00:00, 128.68it/s]


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Clear: 186 Ambiguous: 814


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,raw_query,canonical_query,vertical_primary,activity_type,intent_confidence_score,intent_confidence_label,reason_codes,has_deal_modifier,has_time_modifier,ambiguity_bucket,persona_cluster
0,"hilton head, south carolina","hilton head, south carolina",unknown,,0.2,low,"[""RC_VERTICAL_UNKNOWN""]",0,0,ambiguous,-1
1,grand park cozumel,grand park cozumel,unknown,,0.0,low,"[""RC_NON_ENGLISH"", ""RC_LOW_CONFIDENCE_DUE_TO_L...",0,0,ambiguous,-1
2,pet friendly hotels puerto rico,pet friendly hotel puerto rico,stays,,0.8,high,"[""RC_VERTICAL_FROM_REGEX""]",0,0,clear,0
3,airbnb williamstown kentucky,vacation_rental williamstown kentucky,stays,,0.7,high,"[""RC_BOUNDED_INFERENCE"", ""RC_VERTICAL_FROM_REG...",0,0,clear,2
4,gorge campground,gorge campground,unknown,,0.2,low,"[""RC_NON_TRAVEL_AMBIGUITY"", ""RC_VERTICAL_UNKNO...",0,0,ambiguous,-1
5,must do in paris,must do in paris,unknown,,0.0,low,"[""RC_NON_ENGLISH"", ""RC_LOW_CONFIDENCE_DUE_TO_L...",0,0,ambiguous,-1
6,ireland cottages,ireland cottages,unknown,,0.2,low,"[""RC_NON_TRAVEL_AMBIGUITY"", ""RC_VERTICAL_UNKNO...",0,0,ambiguous,-1
7,lodge asheville,hotel asheville,stays,,0.8,high,"[""RC_VERTICAL_FROM_REGEX""]",0,0,clear,0
8,ciudad de tulum,ciudad de tulum,unknown,,0.0,low,"[""RC_NON_ENGLISH"", ""RC_LOW_CONFIDENCE_DUE_TO_L...",0,0,ambiguous,-1
9,condé nast santa fe hotels,condé nast santa fe hotels,unknown,,0.0,low,"[""RC_NON_ENGLISH"", ""RC_LOW_CONFIDENCE_DUE_TO_L...",0,0,ambiguous,-1


In [None]:
def summarize_clusters(df_phase0: pd.DataFrame, sym: pd.DataFrame) -> Dict[int, Dict[str, Any]]:
    """Profile each persona cluster from the symbolic features of its queries.

    ``persona_cluster`` and ``ambiguity_bucket`` are taken from ``df_phase0``
    and attached to a copy of ``sym`` by position, so both frames must have
    the same number of rows in the same order. Rows with a negative cluster id
    are excluded.

    Args:
        df_phase0: Frame providing ``persona_cluster`` and ``ambiguity_bucket``.
        sym: Symbolic feature frame aligned row-for-row with ``df_phase0``.

    Returns:
        Dict keyed by cluster id in ascending order. Each value holds the
        cluster size, value counts for vertical / activity / confidence bucket,
        the mean of the three 0/1 flag columns, and up to 12 raw queries.
    """
    labelled = sym.assign(
        persona_cluster=df_phase0["persona_cluster"].values,
        ambiguity_bucket=df_phase0["ambiguity_bucket"].values,
    )

    def profile(rows: pd.DataFrame) -> Dict[str, Any]:
        # Summary statistics for the rows of a single cluster.
        return {
            "size": int(len(rows)),
            "top_vertical": rows["vertical_primary"].value_counts().head(5).to_dict(),
            "top_activity": rows["activity_primary"].value_counts().head(8).to_dict(),
            "conf_bucket": rows["conf_bucket"].value_counts().to_dict(),
            "bounded_inference_rate": float(rows["used_bounded_inference"].mean()),
            "deal_modifier_rate": float(rows["has_deal_modifier"].mean()),
            "time_modifier_rate": float(rows["has_time_modifier"].mean()),
            "sample_queries": rows["raw_query"].head(12).tolist(),
        }

    # Only non-negative ids denote real clusters.
    assigned_ids = labelled.loc[labelled["persona_cluster"] >= 0, "persona_cluster"]
    return {
        int(cluster_id): profile(labelled[labelled["persona_cluster"] == cluster_id])
        for cluster_id in sorted(assigned_ids.unique())
    }

# Build the per-cluster profiles and show them as the cell output.
summaries = summarize_clusters(df_phase0, sym)
summaries


{0: {'size': 101,
  'top_vertical': {'stays': 100, 'flights': 1},
  'top_activity': {'none': 101},
  'conf_bucket': {'high': 101},
  'bounded_inference_rate': 0.0,
  'deal_modifier_rate': 0.0,
  'time_modifier_rate': 0.0,
  'sample_queries': ['pet friendly hotels puerto rico',
   'lodge asheville',
   'pepin wi bed and breakfast',
   'queenstown lodging',
   'whistler bc condo rental',
   'bed and breakfast rockingham',
   'resorts to stay at',
   'pet friendly hotel cashiers nc',
   'newport city inn & suites',
   'lahaina front street hotel',
   'whitefish montana lodges',
   'apex mountain resort']},
 1: {'size': 34,
  'top_vertical': {'activities': 34},
  'top_activity': {'beach': 34},
  'conf_bucket': {'medium': 34},
  'bounded_inference_rate': 1.0,
  'deal_modifier_rate': 0.0,
  'time_modifier_rate': 0.0,
  'sample_queries': ['beach towns brazil',
   'miramar beach crystal view',
   'orlando beach rentals',
   'romantic florida beach vacation',
   'hawaii big island beaches',
   

In [None]:
# df_phase0.to_parquet("sample1k_query_personas.parquet", index=False)
# with open("sample1k_cluster_summaries.json", "w") as f:
#     json.dump(summaries, f, indent=2)
# print("Saved sample1k_query_personas.parquet and sample1k_cluster_summaries.json")

In [None]:
import pandas as pd
import numpy as np
import json

# Work on a copy so the diagnostics below cannot alter the clustered frame.
df_out = df_phase0.copy()

print("Rows:", len(df_out))
print("Unique raw queries:", df_out["raw_query"].nunique())
print("Ambiguous rate:", (df_out["ambiguity_bucket"]=="ambiguous").mean().round(3))
# Use the same has_reason_code helper that build_symbolic_features relies on,
# so every place that inspects reason_codes interprets the column the same way.
print("Non-English rate:", df_out["reason_codes"].apply(lambda s: has_reason_code(s, "RC_NON_ENGLISH")).mean().round(3))

# Distribution of verticals
df_out["vertical_primary"].value_counts(dropna=False).head(20)


Rows: 1000
Unique raw queries: 1000
Ambiguous rate: 0.814
Non-English rate: 0.613


Unnamed: 0_level_0,count
vertical_primary,Unnamed: 1_level_1
unknown,814
stays,143
activities,42
flights,1


In [None]:
# (rows, columns) of the diagnostics copy.
df_out.shape

(1000, 11)

In [None]:
# Preview the first rows, including the persona_cluster label attached by the clustering cell.
# NOTE(review): the saved outputs disagree on row 3's label (2 in the clustering
# cell, 4 here), so they were produced by different runs - restart and run all
# to confirm the labels are reproducible.
df_out.head()

Unnamed: 0,raw_query,canonical_query,vertical_primary,activity_type,intent_confidence_score,intent_confidence_label,reason_codes,has_deal_modifier,has_time_modifier,ambiguity_bucket,persona_cluster
0,"hilton head, south carolina","hilton head, south carolina",unknown,,0.2,low,"[""RC_VERTICAL_UNKNOWN""]",0,0,ambiguous,-1
1,grand park cozumel,grand park cozumel,unknown,,0.0,low,"[""RC_NON_ENGLISH"", ""RC_LOW_CONFIDENCE_DUE_TO_L...",0,0,ambiguous,-1
2,pet friendly hotels puerto rico,pet friendly hotel puerto rico,stays,,0.8,high,"[""RC_VERTICAL_FROM_REGEX""]",0,0,clear,0
3,airbnb williamstown kentucky,vacation_rental williamstown kentucky,stays,,0.7,high,"[""RC_BOUNDED_INFERENCE"", ""RC_VERTICAL_FROM_REG...",0,0,clear,4
4,gorge campground,gorge campground,unknown,,0.2,low,"[""RC_NON_TRAVEL_AMBIGUITY"", ""RC_VERTICAL_UNKNO...",0,0,ambiguous,-1


In [None]:
# First 25 rows whose vertical_primary is "unknown".
df_out.loc[df_out["vertical_primary"].eq("unknown")].head(25)

Unnamed: 0,raw_query,canonical_query,vertical_primary,activity_type,intent_confidence_score,intent_confidence_label,reason_codes,has_deal_modifier,has_time_modifier,ambiguity_bucket,persona_cluster
0,"hilton head, south carolina","hilton head, south carolina",unknown,,0.2,low,"[""RC_VERTICAL_UNKNOWN""]",0,0,ambiguous,-1
1,grand park cozumel,grand park cozumel,unknown,,0.0,low,"[""RC_NON_ENGLISH"", ""RC_LOW_CONFIDENCE_DUE_TO_L...",0,0,ambiguous,-1
4,gorge campground,gorge campground,unknown,,0.2,low,"[""RC_NON_TRAVEL_AMBIGUITY"", ""RC_VERTICAL_UNKNO...",0,0,ambiguous,-1
5,must do in paris,must do in paris,unknown,,0.0,low,"[""RC_NON_ENGLISH"", ""RC_LOW_CONFIDENCE_DUE_TO_L...",0,0,ambiguous,-1
6,ireland cottages,ireland cottages,unknown,,0.2,low,"[""RC_NON_TRAVEL_AMBIGUITY"", ""RC_VERTICAL_UNKNO...",0,0,ambiguous,-1
8,ciudad de tulum,ciudad de tulum,unknown,,0.0,low,"[""RC_NON_ENGLISH"", ""RC_LOW_CONFIDENCE_DUE_TO_L...",0,0,ambiguous,-1
9,condé nast santa fe hotels,condé nast santa fe hotels,unknown,,0.0,low,"[""RC_NON_ENGLISH"", ""RC_LOW_CONFIDENCE_DUE_TO_L...",0,0,ambiguous,-1
10,camping yosemite,camping yosemite,unknown,,0.0,low,"[""RC_NON_ENGLISH"", ""RC_LOW_CONFIDENCE_DUE_TO_L...",0,0,ambiguous,-1
11,st john's airbnb,st john's airbnb,unknown,,0.0,low,"[""RC_NON_ENGLISH"", ""RC_LOW_CONFIDENCE_DUE_TO_L...",0,0,ambiguous,-1
12,honeymoon gatlinburg tn,honeymoon gatlinburg tn,unknown,,0.0,low,"[""RC_NON_ENGLISH"", ""RC_LOW_CONFIDENCE_DUE_TO_L...",0,0,ambiguous,-1


In [None]:
# Summary statistics of the intent confidence score across all rows.
df_out["intent_confidence_score"].describe()

Unnamed: 0,intent_confidence_score
count,1000.0
mean,0.17725
std,0.280774
min,0.0
25%,0.0
50%,0.0
75%,0.2
max,0.8


In [None]:
import json
from collections import Counter

# Restrict to the queries whose vertical could not be determined.
unknown_vertical_df = df_out[df_out['vertical_primary'] == 'unknown'].copy()

# Flatten every row's decoded reason codes into one list.
all_reason_codes = []
for codes_json in unknown_vertical_df['reason_codes']:
    try:
        codes = json.loads(codes_json)
    except (TypeError, json.JSONDecodeError):
        # TypeError covers cells that are not text at all (e.g. NaN for a
        # missing value); report them like malformed JSON and keep going.
        print(f"Could not decode JSON: {codes_json}")
        continue
    if isinstance(codes, str):
        # A bare JSON string is a single code, not a sequence of characters.
        codes = [codes]
    if isinstance(codes, list):
        all_reason_codes.extend(codes)
    else:
        print(f"Could not decode JSON: {codes_json}")

reason_code_counts = Counter(all_reason_codes)

print("Reason code counts for 'unknown' vertical_primary queries:")
display(pd.Series(reason_code_counts).sort_values(ascending=False))

Reason code counts for 'unknown' vertical_primary queries:


Unnamed: 0,0
RC_NON_ENGLISH,613
RC_LOW_CONFIDENCE_DUE_TO_LANGUAGE,613
RC_VERTICAL_UNKNOWN,201
RC_NON_TRAVEL_AMBIGUITY,36


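## Recommendations

* Most ‘unknowns’ are not noise—they’re coverage gaps. In the reason-code counts above, 613 of the 814 unknown-vertical queries carry `RC_NON_ENGLISH` (including plainly English queries such as "must do in paris" and "camping yosemite"), and 201 carry `RC_VERTICAL_UNKNOWN`. The next step is to systematically close those gaps by: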
  * Expanding the stays ontology.
  * Introducing a destination-only vertical.
  * Resolving property names using internal catalogs.
  * Instrumenting unknown reasons.
* Once coverage improves, persona clustering becomes much cleaner and directly actionable for landing-page personalization.