In [None]:
import json
import os
from pathlib import Path
import pandas as pd
import faiss
from sentence_transformers import SentenceTransformer
import sys

    

# ----------------------------
# 1) Load data
# ----------------------------

# The repository root is taken to be the parent of the current working
# directory; its "src" folder is made importable.
repo_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(os.path.join(repo_root, "src"))

data_path = os.path.join(repo_root, "data", "hotel_toy_dataset_50_en_welcome_style_noisy.csv")

# Read the CSV, discard every row missing any field used to describe a
# hotel, and renumber the rows 0..n-1 so positional and label indices agree.
data = (
    pd.read_csv(data_path)
    .dropna(subset=["hotel_name", "address", "landmark", "language"], how="any")
    .reset_index(drop=True)
)


# ----------------------------
# 2) Build index (E5 + FAISS)
# ----------------------------
def hotel_record_text(row):
    """Return the text representation of one hotel record.

    ``row`` is any mapping-like object with ``hotel_name``, ``address`` and
    ``landmark`` keys; the three values are joined with ``" | "``.
    """
    parts = (row["hotel_name"], row["address"], row["landmark"])
    return "{} | {} | {}".format(*parts)

# One text string per hotel row, in row order.
hotel_texts = data.apply(hotel_record_text, axis=1).tolist()

model = SentenceTransformer("intfloat/multilingual-e5-base")

# E5 expects a "passage:" prefix on indexed items and a "query:" prefix on
# search queries. Embeddings are L2-normalized so that the inner-product
# index below yields cosine-like scores.
hotel_emb = model.encode(
    ["passage: " + t for t in hotel_texts],
    normalize_embeddings=True,
)
hotel_emb = hotel_emb.astype("float32")

# Flat inner-product index sized to the embedding width.
dim = hotel_emb.shape[-1]
index = faiss.IndexFlatIP(dim)
index.add(hotel_emb)


def retrieve_topk(review_text: str, k: int = 3):
    """Search the hotel index with one review text.

    The text is embedded with the E5 ``"query: "`` prefix and the same
    normalization used for the indexed passages, then looked up in the
    module-level ``index``.

    Returns:
        ``(scores, idxs)`` for the single query: the first row of the score
        matrix and the first row of the index matrix returned by the search.
    """
    query_emb = model.encode(
        [f"query: {review_text}"],
        normalize_embeddings=True,
    )
    query_emb = query_emb.astype("float32")
    all_scores, all_idxs = index.search(query_emb, k)
    # Only one query was submitted, so only row 0 is meaningful.
    return all_scores[0], all_idxs[0]


# ----------------------------
# 3) Span localization + output
# ----------------------------
def locate_span(text: str, mention: str):
    """Locate the first exact (case-sensitive) occurrence of ``mention`` in ``text``.

    Args:
        text: The document text to search in.
        mention: The substring to look for.

    Returns:
        ``(start, end)`` character offsets with ``end`` exclusive, or ``None``
        when ``mention`` is empty or does not occur in ``text``.
    """
    if not mention:
        # str.find("") matches at offset 0, which would produce a meaningless
        # zero-length span (0, 0); treat an empty mention as "not found".
        return None
    start = text.find(mention)
    if start == -1:  # str.find signals "not found" with -1
        return None
    return start, start + len(mention)


def build_outputs_for_doc(doc_id: str, text: str, scores, idxs, df: pd.DataFrame, k: int):
    """Assemble the output record for one document from its retrieval results.

    Args:
        doc_id: Identifier stored in the output record.
        text: Document text in which entity spans are searched.
        scores: Retrieval scores, aligned with ``idxs``.
        idxs: Positional row indices into ``df``; negative values are
            placeholders for "no result" and are skipped.
        df: Hotel table providing ``hotel_name``, ``address`` and ``landmark``.
        k: Maximum number of leading results to consider.

    Returns:
        A dict with ``doc_id``, ``text``, ``entities_pred`` (spans of the best
        candidate's fields found verbatim in ``text``, sorted by offsets) and
        ``candidates_topk`` (the retained candidates with rank and score).
        Both lists are empty when no valid candidate exists.
    """
    # 1) top-k candidates (records + score)
    cands = []
    for rank, (score, idx) in enumerate(zip(scores[:k], idxs[:k]), start=1):
        idx = int(idx)
        if idx < 0:  # faiss pads with -1 if not enough results
            continue
        record = df.iloc[idx]  # fetch the row once instead of once per field
        cands.append({
            "rank": rank,
            "row_idx": idx,
            "score": float(score),
            "hotel_name": record["hotel_name"],
            "address": record["address"],
            "landmark": record["landmark"],
        })

    # 2) top-1 -> NER span(s)
    entities_pred = []

    # Everything below needs a best candidate. Without this guard an empty
    # candidate list would reach an unbound `best` and crash.
    if cands:
        best = cands[0]
        # Try to locate each field of the best record verbatim in the text.
        span_fields = (
            ("HOTEL_NAME", "hotel_name"),
            ("ADDRESS", "address"),
            ("LANDMARK_POI", "landmark"),
        )
        for etype, field in span_fields:
            mention = best[field]
            span = locate_span(text, mention)
            if span is None:
                continue
            s, e = span
            entities_pred.append({
                "start": s,
                "end": e,
                "type": etype,
                "text": mention,
                "score": best["score"],
                "method": "faiss_e5_top1",
            })

    entities_pred.sort(key=lambda x: (x["start"], x["end"]))

    return {
        "doc_id": doc_id,
        "text": text,
        "entities_pred": entities_pred,   # span-based NER prediction (for eval)
        "candidates_topk": cands,         # top-k candidates (debug/annotation)
    }


# ----------------------------
# 4) Write JSONL
# ----------------------------
# Predictions are written as JSON Lines: one record per input row.
out_path = os.path.join(repo_root, "outputs", "retrieval_outputs.jsonl")
os.makedirs(os.path.dirname(out_path), exist_ok=True)

with open(out_path, mode="w", encoding="utf-8") as f:
    for i, row in data.iterrows():
        # Use the row's "id" when present, otherwise a zero-padded id built
        # from the row index.
        doc_id = row.get("id", "doc_{:06d}".format(i))
        text = str(row["description"])

        # Retrieve the 3 nearest hotels and turn them into an output record.
        scores, idxs = retrieve_topk(text, k=3)
        rec = build_outputs_for_doc(doc_id, text, scores, idxs, data, k=3)

        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        f.write(json.dumps(rec, ensure_ascii=False))
        f.write("\n")

print("Wrote: " + out_path)


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

XLMRobertaModel LOAD REPORT from: intfloat/multilingual-e5-base
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Wrote: /Users/ruddigarcia/Projects/ner/outputs/extracted_entities.jsonl
