In [1]:
import pandas as pd 
# Load the toy hotel dataset and show its size, schema and a short preview.
data = pd.read_csv("/Users/ruddigarcia/Projects/ner/data/hotel_toy_dataset_50_en_welcome_style_noisy.csv")
print(data.shape)
data.info()
# Only the last expression of a cell is rendered, so the preview has to come
# last; placed mid-cell its result was silently discarded.
data.head()

(50, 6)
<class 'pandas.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   description  50 non-null     str  
 1   language     50 non-null     str  
 2   hotel_name   50 non-null     str  
 3   location     50 non-null     str  
 4   landmark     50 non-null     str  
 5   address      50 non-null     str  
dtypes: str(6)
memory usage: 29.7 KB


In [2]:

# Keep only rows where every field needed downstream is present.
data = data.dropna(
    subset=["hotel_name", "address", "landmark", "language"],
    how="any",
)
data.info()


<class 'pandas.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   description  50 non-null     str  
 1   language     50 non-null     str  
 2   hotel_name   50 non-null     str  
 3   location     50 non-null     str  
 4   landmark     50 non-null     str  
 5   address      50 non-null     str  
dtypes: str(6)
memory usage: 29.7 KB


In [10]:
import pandas as pd
from rapidfuzz import process, fuzz

def normalize(s: str) -> str:
    """Return ``s`` with surrounding whitespace removed; missing values become "".

    Args:
        s: Text to clean. ``None``, other falsy values and float NaN (the
            marker pandas uses for an empty text cell) are treated as missing.

    Returns:
        The stripped string, or "" when the value is missing.
    """
    # NaN is truthy, so a plain `s or ""` lets it through and `.strip()` then
    # fails with AttributeError; `s != s` is true only for NaN.
    if not s or (isinstance(s, float) and s != s):
        return ""
    return s.strip()

data = (
    pd.read_csv("/Users/ruddigarcia/Projects/ner/data/hotel_toy_dataset_50_en_welcome_style_noisy.csv")
    .dropna(subset=["hotel_name", "address", "landmark", "language"], how="any")
    # Renumber rows before hotel_names is built so list positions line up
    # with the frame's row labels.
    .reset_index(drop=True)
)

# Plain Python list of names: this is the choice set for fuzzy matching.
hotel_names = data["hotel_name"].fillna("").tolist()
print(hotel_names)

def link_review_to_hotel(review_text: str, score_cutoff: int = 70, top_n: int = 3):
    """Fuzzy-match a review/description against the known hotel names.

    Args:
        review_text: Text to match against the module-level ``hotel_names``.
        score_cutoff: Passed to ``process.extract`` as ``score_cutoff``.
        top_n: Passed to ``process.extract`` as ``limit``.

    Returns:
        A list of matched hotel-name strings (empty when nothing matched).
    """
    hits = process.extract(
        review_text,
        hotel_names,
        scorer=fuzz.token_set_ratio,
        limit=top_n,
        score_cutoff=score_cutoff,
    )
    # Each hit is a (choice, score, index) triple; only the name is kept.
    return [name for name, _score, _position in hits]

# Clean every description, then look up its candidate hotel names.
links = data["description"].apply(normalize).apply(link_review_to_hotel)

# Work on a copy of the frame that carries the candidate list as an extra column.
out = data.assign(candidates=links)

print(out[["hotel_name", "candidates"]].head(20))

['Krasnapolsky Hotel Amsterdam', 'Rembrandt Square Hotel', 'Grand Plaza Midtown', 'Central Station City Hotel', 'Harbour Bridge Plaza Hotel', 'Rivoli Louvre Hotel', 'Park Plaza City', 'Grand Plaza Termini', 'Ginza Central Hotel', 'Shinjuku City Hotel', 'Bund Riverside Grand Hotel', 'Grand Plaza Taksim', 'Corniche Bay Hotel', 'Old Town Gate Inn', 'Catedral Plaza Hotel', 'Grand Plaza Paulista', 'Mercado Central Hotel', 'Ponte Vecchio Riverside Hotel', 'San Marco Boutique Hotel', 'Alexanderplatz Central Hotel', 'Brandenburger Tor Hotel Berlin', 'Grand Plaza Slussen', 'Gamla Stan Hotel', 'Drottninggatan Hotel', 'Hotel Rynek', 'Grand Plaza Centrum', 'Hotel Most', 'Connaught Plaza Hotel', 'MG Road Hotel', 'Haeundae Grand Plaza', 'Hangang Hotel', 'Tahrir Square Hotel', 'Khalid Airport Grand Plaza', 'Alameda Central Hotel', 'Obelisco Plaza Hotel', 'Jet d’Eau Central Hotel', 'Cathedral Square Suites', 'Union Square Grand Plaza', 'Grand Plaza Airport', 'Victoria Harbour Plaza', 'Millennium Bridg

In [11]:
# Preview the first rows of `out` (rendered because it is the cell's last expression).
out.head()

Unnamed: 0,description,language,hotel_name,location,landmark,address,candidates
0,"Welcome to Krasnapolsky Amsterdam, a comfortab...",en,Krasnapolsky Hotel Amsterdam,"Amsterdam, Netherlands",Dam Square,"Damrak 96, 1012 LP Amsterdam",[]
1,"Welcome to Rembrandt Square Hotel, a comfortab...",en,Rembrandt Square Hotel,"Amsterdam, Netherlands",Rembrandtplein,"Amstelstraat 20, 1017 DA Amsterdam",[Rembrandt Square Hotel]
2,"Welcome to Grand Plaza, a comfortable stay wit...",en,Grand Plaza Midtown,"New York, USA",Times Square,"350 W 42nd St, New York, NY 10036",[]
3,"Welcome to Central, a comfortable stay with lu...",en,Central Station City Hotel,"Edinburgh, United Kingdom",Edinburgh Waverley Station,"12 Station Rd, Edinburgh EH1 1BB",[]
4,"Welcome to Harbour Bridge Plaza Hotel, a comfo...",en,Harbour Bridge Plaza Hotel,"Sydney, Australia",Sydney Harbour Bridge,"98 Cumberland St, The Rocks NSW 2000","[Harbour Bridge Plaza Hotel, Victoria Harbour ..."


In [20]:
import json
from pathlib import Path

import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer


# ----------------------------
# 1) Load data
# ----------------------------
data = (
    pd.read_csv("/Users/ruddigarcia/Projects/ner/data/hotel_toy_dataset_50_en_welcome_style_noisy.csv")
    # Rows missing any of these fields cannot be turned into a hotel record.
    .dropna(subset=["hotel_name", "address", "landmark", "language"], how="any")
    # Renumber 0..n-1 so positions in the index built below match row labels.
    .reset_index(drop=True)
)


# ----------------------------
# 2) Build index (E5 + FAISS)
# ----------------------------
def hotel_record_text(row):
    """Build the text that represents one hotel record in the index.

    Args:
        row: Mapping-like object with ``hotel_name``, ``address`` and
            ``landmark`` entries.

    Returns:
        The three fields joined as ``"name | address | landmark"``.
    """
    return "{} | {} | {}".format(row["hotel_name"], row["address"], row["landmark"])

model = SentenceTransformer("intfloat/multilingual-e5-base")

# One text per hotel row, in row order.
hotel_texts = data.apply(hotel_record_text, axis=1).tolist()

# E5 convention: indexed items carry a "passage:" prefix, search queries a
# "query:" prefix. Embeddings are normalized, so the inner-product index
# below gives cosine-like scores.
hotel_emb = model.encode(
    [f"passage: {hotel_text}" for hotel_text in hotel_texts],
    normalize_embeddings=True,
).astype("float32")

dim = hotel_emb.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(hotel_emb)


def retrieve_topk(review_text: str, k: int = 3):
    """Search the module-level FAISS ``index`` for the ``k`` best matches.

    Args:
        review_text: Query text; it is encoded with the "query:" prefix.
        k: Number of neighbours requested from the index.

    Returns:
        ``(scores, idxs)`` for the single query, i.e. the first row of each
        array returned by ``index.search``.
    """
    # Same normalization as at indexing time so scores are comparable.
    query_vec = model.encode(
        [f"query: {review_text}"],
        normalize_embeddings=True,
    ).astype("float32")
    all_scores, all_idxs = index.search(query_vec, k)
    return all_scores[0], all_idxs[0]


# ----------------------------
# 3) Span localization + output
# ----------------------------
def locate_span(text: str, mention: str):
    """Locate the first exact (case-sensitive) occurrence of ``mention`` in ``text``.

    Args:
        text: Document text to search.
        mention: Substring to look for.

    Returns:
        ``(start, end)`` character offsets with ``end`` exclusive, or ``None``
        when ``mention`` is empty or does not occur in ``text``.
    """
    # An empty mention "matches" at offset 0 and would produce a zero-width
    # span, which is not a usable entity.
    if not mention:
        return None
    start = text.find(mention)  # -1 when absent
    if start == -1:
        return None
    return start, start + len(mention)


def build_outputs_for_doc(doc_id: str, text: str, scores, idxs, df: pd.DataFrame, k: int):
    """Assemble the output record for one document.

    Args:
        doc_id: Identifier stored in the record.
        text: Document text; the best candidate's hotel name is searched in it.
        scores: Retrieval scores, aligned with ``idxs``.
        idxs: Row positions into ``df``; negative values are skipped.
        df: Frame providing ``hotel_name``, ``address`` and ``landmark``.
        k: Maximum number of (score, idx) pairs to consider.

    Returns:
        Dict with ``doc_id``, ``text``, ``entities_pred`` (at most one
        HOTEL_NAME span, taken from the best candidate) and
        ``candidates_topk`` (the retained candidates with their scores).
    """
    # Candidate records, keeping the rank each one had in the raw result list.
    candidates = []
    for rank, (score, raw_idx) in enumerate(zip(scores[:k], idxs[:k]), start=1):
        row_idx = int(raw_idx)
        if row_idx >= 0:  # faiss pads with -1 if not enough results
            record = df.iloc[row_idx]
            candidates.append({
                "rank": rank,
                "row_idx": row_idx,
                "score": float(score),
                "hotel_name": record["hotel_name"],
                "address": record["address"],
                "landmark": record["landmark"],
            })

    # Minimal version: only the best candidate's hotel name is turned into a
    # span. ADDRESS / LANDMARK_POI spans from the same record are not emitted.
    entities_pred = []
    if candidates:
        top = candidates[0]
        span = locate_span(text, top["hotel_name"])
        if span is not None:
            span_start, span_end = span
            entities_pred.append({
                "start": span_start,
                "end": span_end,
                "type": "HOTEL_NAME",
                "text": top["hotel_name"],
                "score": top["score"],
                "method": "faiss_e5_top1",
            })

    return {
        "doc_id": doc_id,
        "text": text,
        "entities_pred": entities_pred,   # span-based NER prediction (for eval)
        "candidates_topk": candidates,    # top-k candidates (debug/annotation)
    }


# ----------------------------
# 4) Write JSONL
# ----------------------------
out_path = Path("/Users/ruddigarcia/Projects/ner/data/embed_preds.jsonl")
out_path.parent.mkdir(parents=True, exist_ok=True)

# One JSON object per line, one line per dataset row.
with out_path.open("w", encoding="utf-8") as f:
    for i, row in data.iterrows():
        # Use the row's own "id" when present, otherwise a zero-padded label.
        doc_id = row.get("id", f"doc_{i:06d}")
        text = str(row["description"])

        scores, idxs = retrieve_topk(text, k=3)
        rec = build_outputs_for_doc(doc_id, text, scores, idxs, data, k=3)

        line = json.dumps(rec, ensure_ascii=False)
        f.write(line + "\n")

print(f"Wrote: {out_path}")


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

XLMRobertaModel LOAD REPORT from: intfloat/multilingual-e5-base
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Wrote: /Users/ruddigarcia/Projects/ner/data/embed_preds.jsonl


In [21]:
# Inspect the fourth row of `out`.
# NOTE(review): `out` is rebound by more than one cell in this notebook, so
# what this prints depends on which cell ran last. Confirm after Restart & Run All.
print(out.iloc[3])

description    Welcome to Central, a comfortable stay with lu...
language                                                      en
hotel_name                            Central Station City Hotel
location                               Edinburgh, United Kingdom
landmark                              Edinburgh Waverley Station
address                         12 Station Rd, Edinburgh EH1 1BB
candidates     [Central Station City Hotel, Cornerstone Inn, ...
Name: 3, dtype: object


In [9]:


import textwrap
from typing import Any, Dict, List, Literal, TypedDict


# The three entity labels used by the extractor.
EntityType = Literal["HOTEL_NAME", "ADDRESS", "LANDMARK_POI"]

class Entity(TypedDict):
    """One extracted entity: character offsets, label, surface text and a candidate id."""
    start: int
    end: int
    type: EntityType
    text: str
    # Kept for debugging / provenance. Declared as a regular key here, so it
    # is a required field of this TypedDict rather than an optional one.
    candidate_id: str

class ExtractEntitiesArgs(TypedDict):
    """Top-level payload shape: a single ``entities`` list of ``Entity`` items."""
    entities: List[Entity]

class ExtractorPrompt:
    """Prompt templates and tool schema for the hotel entity extraction call.

    SYSTEM_MESSAGE contains a ``{language}`` placeholder and USER_MESSAGE a
    ``{text}`` placeholder, both written with single braces. TOOLS declares
    one function, ``extract_hotel_entities``, and TOOL_CHOICE names that
    same function.
    """
   
   
   
    SYSTEM_MESSAGE = textwrap.dedent(
        """
        You are an information extraction engine that performs Named Entity Recognition (NER)
        on {language} hotel descriptions/reviews.

        Entity types:
        - HOTEL_NAME: the property name / brand name used to identify the hotel. Exclude generic "the hotel", "this property".
        - ADDRESS: a findability string (street/canal/square + number, postal code, city/region/country when used as address data).
        - LANDMARK_POI: named attractions, stations, airports, neighborhoods, venues, parks, squares, etc. Exclude generic "the station".

        Boundary rules:
        - Extract the smallest exact substring that uniquely identifies the entity.
        - No overlapping entities (prefer the longest specific span if conflicts exist).
        - Character offsets: start inclusive, end exclusive (Python slicing).
        - Every entity.text MUST equal original_text[start:end].

        Gazetteer hints may be provided. Use them ONLY as hints:
        - Prefer them when they appear verbatim in the text.
        - Do NOT output a hinted candidate unless it appears verbatim in the text.
        - You may still extract entities not in the hints.

        Output MUST be a function call only, matching the JSON schema.

    """
    ).strip()
   
    USER_MESSAGE = textwrap.dedent(
        """
    # Text to process
    {text}
    
            
        
        """).strip()
   
    # Tool definitions passed to the API: a single strict function schema.
    TOOLS: List[Dict[str, Any]] = [
    {
        "type": "function",
        "name": "extract_hotel_entities",
        "description": "Return named entities (HOTEL_NAME, ADDRESS, LANDMARK_POI) with character offsets.",
        # Structured Outputs guarantees schema adherence when strict=true.
        "strict": True,
        "parameters": {
            "type": "object",
            "properties": {
                "entities": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "start": {"type": "integer"},
                            "end": {"type": "integer"},
                            "type": {"type": "string", "enum": ["HOTEL_NAME", "ADDRESS", "LANDMARK_POI"]},
                            "text": {"type": "string"},
                            "candidate_id": {"type": "string"},
                        },
                        "required": ["start", "end", "type", "text", "candidate_id"],
                        "additionalProperties": False,
                    },
                }
            },
            "required": ["entities"],
            "additionalProperties": False,  # required for Structured Outputs tool schemas
        },
    }
]

    # Names the one function declared in TOOLS.
    TOOL_CHOICE = {"type": "function", "name": "extract_hotel_entities"}







In [10]:
import json
import os
from typing import Any, Dict, List, Optional
from openai import OpenAI
from dotenv import load_dotenv

load_dotenv(override=True)

# Sanity-check the key and report every problem found. The checks that call
# string methods only run when a key exists (a missing key used to crash on
# `None.startswith`), and the success line is printed only when no check failed.
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    print(
        "No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!"
    )
else:
    key_looks_ok = True

    if not api_key.startswith("sk-proj-"):
        key_looks_ok = False
        print(
            "An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook"
        )

    if api_key.strip() != api_key:
        key_looks_ok = False
        print(
            "An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook"
        )

    if key_looks_ok:
        print("API key found and looks good so far!")








# Model identifier and shared client used by the extraction helpers below.
GPT_MODEL = "gpt-5.2-2025-12-11"
openai_client = OpenAI(api_key=api_key)


def _render(template: str, **kwargs: str) -> str:
    out = template
    for k, v in kwargs.items():
        out = out.replace("{{" + k + "}}", v)
    return out


def _parse_tool_args(resp, tool_name: str) -> Dict[str, Any]:
    # Responses API returns output items; tool calls appear as type=function_call. :contentReference[oaicite:5]{index=5}
    for item in resp.output:
        if item.type == "function_call" and item.name == tool_name:
            return json.loads(item.arguments)
    raise RuntimeError(f"No function_call '{tool_name}' found in response output.")


def _validate_offsets(text: str, entities: List[Dict[str, Any]]) -> None:
    for ent in entities:
        s, e = int(ent["start"]), int(ent["end"])
        if not (0 <= s <= e <= len(text)):
            raise ValueError(f"Invalid offsets: {ent}")
        if text[s:e] != ent["text"]:
            raise ValueError(
                f"Offset mismatch: expected text[{s}:{e}]='{text[s:e]}' got '{ent['text']}'"
            )


def llm_extract(
    text: str,
    model: str = "gpt-5.2-2025-12-11",
    client: Optional[OpenAI] = None,
    language: str = "English",
) -> Dict[str, Any]:
    """Run LLM-based entity extraction on ``text`` via a forced function call.

    Args:
        text: Document to extract entities from.
        model: Model identifier passed to the Responses API.
        client: OpenAI client to use; the module-level ``openai_client`` is
            used when omitted.
        language: Language name inserted into the system prompt's
            ``{language}`` placeholder.

    Returns:
        The decoded arguments of the ``extract_hotel_entities`` call.

    Raises:
        RuntimeError: If the response contains no such function call.
        ValueError: If a returned entity's offsets do not match ``text``.
    """
    client = client or openai_client

    # The system prompt used to be sent with a literal "{language}" in it.
    # A direct replace of exactly that token leaves the rest of the prompt alone.
    system = ExtractorPrompt.SYSTEM_MESSAGE.replace("{language}", language)
    user = _render(ExtractorPrompt.USER_MESSAGE, text=text)

    resp = client.responses.create(
        model=model,
        input=[
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ],
        tools=ExtractorPrompt.TOOLS,
        tool_choice=ExtractorPrompt.TOOL_CHOICE,  # force the function call
        parallel_tool_calls=False,
    )

    out = _parse_tool_args(resp, "extract_hotel_entities")
    _validate_offsets(text, out.get("entities", []))
    return out

# Smoke test of the extractor on a made-up sentence.
out = llm_extract(
    text=(
        "The Grand Budapest Hotel is located at 123 Fictional St, Zubrowka. "
        "It's near the famous Zubrowka Park."
    ),
    model=GPT_MODEL,
    client=openai_client,
)
print(json.dumps(out, indent=2))




API key found and looks good so far!
{
  "entities": []
}


In [9]:
from gliner import GLiNER

# NOTE(review): `model` is also the name of the SentenceTransformer that
# retrieve_topk() reads; after this cell runs that name refers to GLiNER.
model = GLiNER.from_pretrained("urchade/gliner_medium-v2.1")

text = ("Stay at The Grand Riverside Hotel on King Street, just 5 minutes from "
        "Central Station and near Hyde Park.")

labels = [
    "HOTEL_NAME",
    "ADDRESS",
    "LANDMARK_POI"
]

# Diagnostic dump: threshold=0.0 returns every scored span so the score
# distribution can be inspected (0.12 was the intended starting threshold
# for a high-recall setup; tune upward once spans look good). This is the
# only prediction run: an earlier call at threshold=0.12 was overwritten
# before use, so it was pure wasted inference.
# flat_ner=False enables nested spans, which helps recall.
ents = model.predict_entities(text, labels, threshold=0.0, flat_ner=False)
ents = sorted(ents, key=lambda x: x["score"], reverse=True)[:25]
for e in ents:
    print(f'{e["score"]:.3f}', e["label"], repr(e["text"]))




Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

0.185 ADDRESS 'Riverside'
0.175 ADDRESS '.'
0.172 ADDRESS 'at'
0.171 HOTEL_NAME 'Stay'
0.167 ADDRESS 'Hotel'
0.167 ADDRESS 'King Street'
0.165 ADDRESS 'Grand Riverside'
0.164 ADDRESS 'Park'
0.164 ADDRESS 'on King Street'
0.159 ADDRESS 'Grand'
0.153 ADDRESS 'The'
0.147 ADDRESS 'Street'
0.139 ADDRESS 'Hyde Park'
0.138 ADDRESS ', just'
0.138 ADDRESS ', just 5'
0.132 HOTEL_NAME 'The Grand Riverside'
0.124 ADDRESS 'Hyde'
0.123 ADDRESS ', just 5 minutes'
0.122 ADDRESS 'Station'
0.122 ADDRESS 'King'
0.121 HOTEL_NAME 'Stay at'
0.119 ADDRESS 'on'
0.118 HOTEL_NAME 'Hotel on King Street'
0.112 ADDRESS 'minutes'
0.109 ADDRESS ', just 5 minutes from Central Station and near Hyde Park'


In [12]:
# NOTE(review): `entities` is not assigned in any cell visible here, so this
# presumably relies on leftover kernel state. Confirm it survives Restart & Run All.
entities

[]