In [3]:
# ===============================================================
# Ingredient Search API — Single Best Match (Cards & Multi-Vector)
# ===============================================================

import os, json, re, time, unicodedata, numpy as np, pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from rapidfuzz import fuzz
from flask import Flask, request, jsonify
from flask_cors import CORS

# Optional FAISS acceleration
try:
    import faiss
    FAISS_AVAILABLE = True
except ImportError:
    FAISS_AVAILABLE = False


# ===============================================================
# CONFIGURATION
# ===============================================================
# Index used by the /search endpoint when the request omits "mode" or sends
# a value other than the two listed here.
DEFAULT_MODE = "multivector"  # "cards" or "multivector"

# Input files
# CARDS_PATH is read by load_cards(), which requires the columns
# policy_item_id, canonical and entity_text.
CARDS_PATH = "novel_foods_cards.csv"
# MULTIV_PATH is read by load_multivectors(), which requires the columns
# policy_item_id, section, language and text.
MULTIV_PATH = "novel_foods_multivectors.csv"

# Model
# Name passed to SentenceTransformer(); it is also stored in the cache
# metadata so that cached embeddings are only reused for the same model.
MODEL_NAME = "sentence-transformers/distiluse-base-multilingual-cased-v2"

# Cache folders
# Relative path, so it resolves against the current working directory.
# The directory is created as an import-time side effect.
CACHE_DIR = "indices_v2"
os.makedirs(CACHE_DIR, exist_ok=True)

# Cards cache: embeddings, FAISS index, lookup table and metadata
# (model name + row count) written and read by build_index("cards").
EMB_CARDS = os.path.join(CACHE_DIR, "emb_cards.npy")
IDX_CARDS = os.path.join(CACHE_DIR, "index_cards.faiss")
LOOKUP_CARDS = os.path.join(CACHE_DIR, "lookup_cards.csv")
META_CARDS = os.path.join(CACHE_DIR, "meta_cards.json")

# Multivector cache: same four artefacts for build_index("multivector").
EMB_MULTI = os.path.join(CACHE_DIR, "emb_multi.npy")
IDX_MULTI = os.path.join(CACHE_DIR, "index_multi.faiss")
LOOKUP_MULTI = os.path.join(CACHE_DIR, "lookup_multi.csv")
META_MULTI = os.path.join(CACHE_DIR, "meta_multi.json")

# Search settings
# Maximum number of nearest neighbours retrieved before re-ranking.
RECALL_K = 200
# Weight of the (section-boosted) semantic similarity in the blended score;
# the lexical fuzzy-match score gets the remaining (1 - ALPHA_SEM).
ALPHA_SEM = 0.75
# Candidates whose blended score is below this value are discarded.
MIN_CONFIDENCE = 0.50

# Section weighting
# Multiplier applied to the semantic similarity of a row, keyed by the row's
# "section" value. Sections not listed here get a multiplier of 1.0
# (see section_boost()).
SECTION_BOOST = {
    "CANON_LAT": 1.05,
    "CANON_EN": 1.00,
    "SYN_LAT": 0.95,
}


# ===============================================================
# HELPERS
# ===============================================================
def normalize_query_lex(s: str) -> str:
    """Normalize text for lexical (fuzzy string) comparison.

    Processing steps, in order:

    1. Apply Unicode NFKC normalization, then case-fold.
    2. Map the minus sign (U+2212) and the dash variants U+2010..U+2015
       to an ASCII hyphen.
    3. Replace every character other than ``a-z``, ``0-9``, space, ``-``
       and the range U+00C0..U+017F with a space.
    4. Collapse whitespace runs to a single space and strip the ends.

    Args:
        s: Raw text. Any non-string value is treated as missing.

    Returns:
        The normalized string, or ``""`` when *s* is not a string.
    """
    if not isinstance(s, str):
        return ""
    # Normalize BEFORE case-folding: NFKC can expand compatibility characters
    # into uppercase ASCII (e.g. "™" -> "TM", "ℂ" -> "C"). Folding first
    # would leave those capitals in place and the a-z filter below would
    # then silently delete them.
    s = unicodedata.normalize("NFKC", s).casefold()
    s = re.sub(r"[\u2212\u2010-\u2015]", "-", s)
    s = re.sub(r"[^a-z0-9 \-\u00C0-\u017F]", " ", s)
    return re.sub(r"\s+", " ", s).strip()


def ensure_cols(df, cols):
    """Validate that ``df`` carries every column named in ``cols``.

    Returns ``None`` when nothing is missing; otherwise raises
    ``ValueError`` listing the absent names in the order given by ``cols``.
    """
    absent = [name for name in cols if name not in df.columns]
    if not absent:
        return
    raise ValueError(f"Missing columns: {absent}")


def section_boost(section: str) -> float:
    """Return the semantic-score multiplier for a section label.

    Sections listed in ``SECTION_BOOST`` use their configured weight. Every
    other label, including the ``COMMON_*`` sections, is neutral (1.0).

    Args:
        section: Section label of the matched row.

    Returns:
        The multiplier to apply to the row's semantic similarity. Values
        that are not strings (e.g. ``None`` or a NaN read from a CSV) are
        treated as unknown and also yield 1.0 instead of raising.
    """
    if not isinstance(section, str):
        return 1.00
    return SECTION_BOOST.get(section, 1.00)


# ===============================================================
# LOAD DATA & BUILD INDEX
# ===============================================================
def load_cards():
    """Read the cards CSV and derive its lookup table.

    Returns a ``(cards, lookup)`` pair: the full DataFrame read from
    ``CARDS_PATH`` (with ``policy_item_id`` kept as a string) and an
    independent copy restricted to the three required columns.
    Raises ``ValueError`` via ``ensure_cols`` if a required column is absent.
    """
    required = ["policy_item_id", "canonical", "entity_text"]
    cards = pd.read_csv(CARDS_PATH, dtype={"policy_item_id": str})
    ensure_cols(cards, required)
    return cards, cards[required].copy()


def load_multivectors():
    """Read the multivector CSV and attach canonical names from the cards CSV.

    Returns:
        A ``(mv, lookup)`` pair. ``mv`` is the DataFrame read from
        ``MULTIV_PATH``; ``lookup`` is ``mv`` left-joined with the
        ``canonical`` column of ``CARDS_PATH`` on ``policy_item_id``.
        When the cards file does not exist, ``canonical`` is all-missing.

    Raises:
        ValueError: If the multivector CSV lacks a required column.
    """
    mv = pd.read_csv(MULTIV_PATH, dtype={"policy_item_id": str})
    ensure_cols(mv, ["policy_item_id", "section", "language", "text"])
    if os.path.exists(CARDS_PATH):
        cards = pd.read_csv(CARDS_PATH, dtype={"policy_item_id": str})
        # Keep exactly one canonical name per id. De-duplicating on the
        # (id, canonical) pair would leave several rows for an id that has
        # more than one canonical spelling; the merge below would then
        # duplicate multivector rows and the lookup table would no longer be
        # row-aligned with the embeddings computed from `mv`.
        can_map = cards[["policy_item_id", "canonical"]].drop_duplicates(
            subset="policy_item_id"
        )
    else:
        can_map = pd.DataFrame(columns=["policy_item_id", "canonical"])
    # validate= makes pandas assert the right side is unique on the key, so
    # the result is guaranteed to have one row per row of `mv`.
    lookup = mv.merge(
        can_map, on="policy_item_id", how="left", validate="many_to_one"
    )
    return mv, lookup


def _build_faiss_index(emb, idx_path):
    """Build an inner-product FAISS index over *emb* and save it to *idx_path*."""
    index = faiss.IndexFlatIP(emb.shape[1])
    index.add(np.array(emb, dtype="float32"))
    faiss.write_index(index, idx_path)
    return index


def build_index(mode="multivector"):
    """Build or load embeddings & FAISS index for given mode.

    The cached artefacts (embeddings, lookup CSV, metadata JSON and,
    when FAISS is importable, the index file) are reused only if the metadata
    records the current ``MODEL_NAME`` and the current number of source rows,
    and the cached arrays really have that many rows. Otherwise the texts are
    re-encoded and all artefacts are rewritten.

    NOTE(review): the cache check compares row counts only; editing a text
    in the source CSV without changing the row count will reuse stale
    embeddings. Delete the cache directory after such edits.

    Args:
        mode: ``"cards"`` selects the cards data set; any other value
            selects the multivector data set.

    Returns:
        A tuple ``(model, emb, index, lookup)``: the loaded
        ``SentenceTransformer``, the embedding matrix, the FAISS index
        (``None`` when FAISS is not importable) and the lookup DataFrame
        whose rows correspond positionally to the rows of ``emb``.
    """
    print(f"‚öôÔ∏è Loading model: {MODEL_NAME}")
    model = SentenceTransformer(MODEL_NAME)

    if mode == "cards":
        df, lookup = load_cards()
        texts = df["entity_text"].astype(str).tolist()
        emb_path, idx_path, meta_path, lookup_path = EMB_CARDS, IDX_CARDS, META_CARDS, LOOKUP_CARDS
    else:
        df, lookup = load_multivectors()
        texts = df["text"].astype(str).tolist()
        emb_path, idx_path, meta_path, lookup_path = EMB_MULTI, IDX_MULTI, META_MULTI, LOOKUP_MULTI

    if os.path.exists(emb_path) and os.path.exists(meta_path):
        try:
            # Context manager so the metadata file handle is always closed.
            with open(meta_path, "r", encoding="utf-8") as fh:
                meta = json.load(fh)
            if meta.get("row_count") == len(texts) and meta.get("model") == MODEL_NAME:
                emb = np.load(emb_path)
                df_lookup = pd.read_csv(lookup_path, dtype={"policy_item_id": str})
                # The metadata can match while the arrays on disk do not
                # (e.g. an interrupted earlier run); a misaligned lookup
                # would return the wrong item, so force a rebuild instead.
                if len(emb) != len(texts) or len(df_lookup) != len(texts):
                    raise ValueError("cached artefacts are out of sync with the source data")
                index = None
                if FAISS_AVAILABLE and os.path.exists(idx_path):
                    index = faiss.read_index(idx_path)
                    print(f"‚úÖ FAISS ({mode}) loaded.")
                elif FAISS_AVAILABLE:
                    # FAISS is installed but the index file is missing:
                    # rebuild it from the cached embeddings rather than
                    # silently degrading to brute-force search.
                    index = _build_faiss_index(emb, idx_path)
                    print(f"‚úÖ FAISS ({mode}) built & saved.")
                else:
                    print(f"‚ö†Ô∏è FAISS not installed ‚Äî cosine fallback ({mode}).")
                return model, emb, index, df_lookup
        except Exception:
            # Best effort: any unreadable or inconsistent cache falls through
            # to a full rebuild below.
            print(f"‚ôªÔ∏è Cache mismatch ‚Äî rebuilding {mode} index.")

    print(f"‚öôÔ∏è Encoding {mode} texts‚Ä¶")
    emb = model.encode(texts, show_progress_bar=True, normalize_embeddings=True)
    np.save(emb_path, emb)
    lookup.to_csv(lookup_path, index=False)
    with open(meta_path, "w", encoding="utf-8") as fh:
        json.dump({"model": MODEL_NAME, "row_count": len(texts)}, fh)

    index = None
    if FAISS_AVAILABLE:
        index = _build_faiss_index(emb, idx_path)
        print(f"‚úÖ FAISS ({mode}) built & saved.")
    else:
        print(f"‚ö†Ô∏è FAISS not available ‚Äî using cosine similarity ({mode}).")

    return model, emb, index, lookup


# ===============================================================
# SEARCH
# ===============================================================
def _text_or_empty(value) -> str:
    """Return *value* as a string, mapping missing values (None/NaN) to ""."""
    if value is None:
        return ""
    if not isinstance(value, str) and pd.isna(value):
        return ""
    return str(value)


def search_best(query, model, emb, index, df_lookup, mode="multivector"):
    """Return single best match (highest blended score).

    Up to ``RECALL_K`` nearest rows are retrieved by embedding similarity
    (FAISS when an index is supplied, otherwise ``cosine_similarity`` against
    ``emb``). Each candidate is then scored as::

        ALPHA_SEM * (similarity * section_boost(section))
            + (1 - ALPHA_SEM) * lexical

    where ``lexical`` is the larger of the ``token_set_ratio`` of the
    normalized query against the row text and against the row's canonical
    name. Candidates scoring below ``MIN_CONFIDENCE`` are ignored.

    Args:
        query: Free-text search string.
        model: Encoder exposing ``encode(list, normalize_embeddings=True)``.
        emb: Embedding matrix, row-aligned with ``df_lookup``.
        index: FAISS index over ``emb``, or ``None`` to use the fallback.
        df_lookup: Lookup table returned by ``build_index``.
        mode: ``"cards"`` reads ``entity_text``; any other value reads
            ``text``, ``section`` and ``language``.

    Returns:
        A dict with keys ``policy_item_id``, ``canonical``, ``best_text``,
        ``section``, ``language``, ``semantic``, ``lexical`` and ``score``
        (the last three rounded to 3 decimals), or ``{}`` when no candidate
        reaches ``MIN_CONFIDENCE`` or the lookup table is empty.
    """
    n_rows = len(df_lookup)
    if n_rows == 0:
        # Nothing to search; also avoids asking the index for k=0 results.
        return {}
    k = min(RECALL_K, n_rows)

    q_emb = model.encode([query], normalize_embeddings=True)

    if FAISS_AVAILABLE and index is not None:
        scores, idx = index.search(np.array(q_emb, dtype="float32"), k)
        idx, scores = idx[0], scores[0]
    else:
        sims = cosine_similarity(q_emb, emb)[0]
        idx = np.argsort(sims)[::-1][:k]
        scores = sims[idx]

    q_norm = normalize_query_lex(query)
    is_cards = mode == "cards"
    best = None
    # Track the unrounded winner: comparing against the rounded "score"
    # field could reject a candidate that is better by less than 0.0005.
    best_final = float("-inf")

    for i, s in zip(idx, scores):
        # FAISS pads missing results with -1, which .iloc would silently
        # interpret as "last row"; also skip ids beyond the lookup table.
        if i < 0 or i >= n_rows:
            continue

        row = df_lookup.iloc[i]
        # Missing CSV cells arrive as NaN; str(NaN) would yield the literal
        # "nan", which would then be fuzzy-matched and returned to clients.
        if is_cards:
            text = _text_or_empty(row.get("entity_text", ""))
            section = "CARD"
            lang = ""
        else:
            text = _text_or_empty(row.get("text", ""))
            section = _text_or_empty(row.get("section", ""))
            lang = _text_or_empty(row.get("language", ""))

        canon = _text_or_empty(row.get("canonical", ""))
        lex1 = fuzz.token_set_ratio(q_norm, normalize_query_lex(text)) / 100
        lex2 = fuzz.token_set_ratio(q_norm, normalize_query_lex(canon)) / 100 if canon else 0
        lex = max(lex1, lex2)

        boosted = float(s) * section_boost(section)
        final = ALPHA_SEM * boosted + (1 - ALPHA_SEM) * lex
        if final < MIN_CONFIDENCE or final <= best_final:
            continue

        best_final = final
        best = {
            "policy_item_id": str(row["policy_item_id"]),
            "canonical": canon,
            "best_text": text,
            "section": section,
            "language": lang,
            "semantic": round(float(s), 3),
            "lexical": round(lex, 3),
            "score": round(final, 3),
        }

    return best or {}


# ===============================================================
# FLASK SERVER
# ===============================================================
# Flask application object; CORS is enabled with the flask-cors defaults
# (no origin restriction is configured here).
app = Flask(__name__)
CORS(app)

# Both indices are built (or loaded from the on-disk cache) at import time,
# before the server starts accepting requests.
# NOTE(review): build_index() constructs a new SentenceTransformer on every
# call, so the same model is loaded twice here (model_cards / model_multi).
print("üöÄ Starting Ingredient Search API...")
model_cards, emb_cards, index_cards, df_cards = build_index("cards")
model_multi, emb_multi, index_multi, df_multi = build_index("multivector")
print("‚úÖ Both indices ready.")


@app.route("/search", methods=["POST"])
def search_api():
    """
    POST {query: 'vitamin a', mode: 'cards'|'multivector'}
    Returns the single best match as JSON.

    Responses:
      200 with the match dict produced by search_best() ({} when nothing
          reaches the confidence threshold).
      400 {"error": ...} when the body is not a JSON object or when
          'query' is missing, empty, whitespace-only or not a string.

    A missing, non-string or unrecognised 'mode' falls back to DEFAULT_MODE.
    """
    # silent=True returns None on malformed JSON instead of raising, so all
    # bad-input cases can be answered with one consistent JSON error shape.
    data = request.get_json(force=True, silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object"}), 400

    query = data.get("query", "")
    if not isinstance(query, str) or not query.strip():
        return jsonify({"error": "Missing 'query'"}), 400

    # A JSON null or number for "mode" has no .lower(); treat it like an
    # unknown mode rather than failing with a 500.
    mode = data.get("mode", DEFAULT_MODE)
    mode = mode.lower() if isinstance(mode, str) else DEFAULT_MODE
    if mode not in ("cards", "multivector"):
        mode = DEFAULT_MODE

    if mode == "cards":
        best = search_best(query, model_cards, emb_cards, index_cards, df_cards, mode="cards")
    else:
        best = search_best(query, model_multi, emb_multi, index_multi, df_multi, mode="multivector")

    return jsonify(best)


# Start the built-in Flask server only when this file is executed directly.
# NOTE(review): host="0.0.0.0" listens on all network interfaces and CORS is
# enabled above without an origin restriction — confirm this exposure is
# intended before running outside a trusted network.
if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5001, debug=False)


üöÄ Starting Ingredient Search API...
‚öôÔ∏è Loading model: sentence-transformers/distiluse-base-multilingual-cased-v2




‚öôÔ∏è Encoding cards texts‚Ä¶


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 27/27 [01:17<00:00,  2.88s/it]


‚úÖ FAISS (cards) built & saved.
‚öôÔ∏è Loading model: sentence-transformers/distiluse-base-multilingual-cased-v2




‚úÖ FAISS (multivector) loaded.
‚úÖ Both indices ready.
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5001
 * Running on http://172.17.0.2:5001
[33mPress CTRL+C to quit[0m
127.0.0.1 - - [08/Oct/2025 13:09:31] "POST /search HTTP/1.1" 200 -
