In [10]:
import pandas as pd
from flashtext import KeywordProcessor
import sys
import os

# The repository root is taken to be the parent of the current working
# directory; its src/ folder is made importable for project modules.
repo_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
src_dir = os.path.join(repo_root, "src")
# Guard the insertion so re-running this cell does not keep appending the
# same entry to sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)


data_path = os.path.join(repo_root, "data", "hotel_toy_dataset_50_en_welcome_style_noisy.csv")

# Load, drop rows missing any of the required columns, and renumber the index
# 0..n-1. Built as one chain (no inplace mutation) so the cell is idempotent.
data = (
    pd.read_csv(data_path)
    .dropna(subset=["hotel_name", "address", "landmark", "language"], how="any")
    .reset_index(drop=True)
)

def normalize(s: str) -> str:
    """Return ``s`` with leading/trailing whitespace removed.

    Anything that is not a string is treated as "no text" and yields ``""``.
    That covers ``None`` as well as pandas missing values: ``float('nan')`` is
    truthy, so the previous ``(s or "")`` guard let it through to ``.strip()``
    and raised ``AttributeError``.

    Args:
        s: The raw value, normally a string taken from a DataFrame cell.

    Returns:
        The stripped string, or ``""`` when ``s`` is not a string.
    """
    if not isinstance(s, str):
        return ""
    return s.strip()

# ---------------------------------------------------------------------
# Build one KeywordProcessor per field we want to NER-tag
# ---------------------------------------------------------------------
FIELDS = ["hotel_name", "location", "landmark", "address"]
kp_by_field: dict[str, KeywordProcessor] = {}

for field in FIELDS:
    # Case-insensitive exact-phrase matcher dedicated to this column.
    kp = KeywordProcessor(case_sensitive=False)
    distinct_values = data[field].dropna().unique()
    # Register every non-empty normalized value, in column order; the keyword
    # itself doubles as the clean name reported on a match.
    for val_norm in filter(None, map(normalize, distinct_values)):
        kp.add_keyword(val_norm)
    kp_by_field[field] = kp


# ---------------------------------------------------------------------
# Link description → exact spans for all fields
# ---------------------------------------------------------------------
def link_review_to_entities_flashtext(review_text: str):
    """Find every known field value that occurs in ``review_text``.

    Each processor in ``kp_by_field`` is run over the whitespace-stripped
    text. The reported offsets are shifted back by the amount of leading
    whitespace that was stripped, so ``start``/``end`` index into the
    ``review_text`` that was passed in (previously they indexed into the
    stripped copy and were off for any text with leading whitespace).

    Args:
        review_text: The description to scan. Non-string values (e.g. the NaN
            pandas uses for a missing cell) are treated as empty.

    Returns:
        A dict with one key per entry in ``FIELDS``. Each value is a list of
        ``{"text", "start", "end"}`` dicts, where ``text`` is the keyword as
        registered with the processor, ``start`` is inclusive and ``end`` is
        exclusive. Fields with no match map to an empty list.
    """
    # Always return one (possibly empty) list per field.
    result = {field: [] for field in FIELDS}
    if not isinstance(review_text, str):
        return result

    text = review_text.strip()
    if not text:
        return result

    # Number of characters removed from the front by strip(); added back to
    # every span so offsets refer to the caller's original string.
    offset = len(review_text) - len(review_text.lstrip())

    for field, kp in kp_by_field.items():
        # span_info=True -> list of (matched_text, start, end) tuples
        result[field].extend(
            {
                "text": matched_text,
                "start": start + offset,  # inclusive
                "end": end + offset,      # exclusive
            }
            for matched_text, start, end in kp.extract_keywords(text, span_info=True)
        )

    return result





In [11]:
# Work on a copy of `data` that carries the per-field matches found in each
# description; `data` itself is left untouched.
out = data.assign(
    candidates=data["description"].apply(link_review_to_entities_flashtext)
)

out.head(10)[["hotel_name", "description", "candidates"]]


Unnamed: 0,hotel_name,description,candidates
0,Krasnapolsky Hotel Amsterdam,"Welcome to Krasnapolsky Amsterdam, a comfortab...","{'hotel_name': [], 'location': [], 'landmark':..."
1,Rembrandt Square Hotel,"Welcome to Rembrandt Square Hotel, a comfortab...",{'hotel_name': [{'text': 'Rembrandt Square Hot...
2,Grand Plaza Midtown,"Welcome to Grand Plaza, a comfortable stay wit...","{'hotel_name': [], 'location': [], 'landmark':..."
3,Central Station City Hotel,"Welcome to Central, a comfortable stay with lu...","{'hotel_name': [], 'location': [], 'landmark':..."
4,Harbour Bridge Plaza Hotel,"Welcome to Harbour Bridge Plaza Hotel, a comfo...",{'hotel_name': [{'text': 'Harbour Bridge Plaza...
5,Rivoli Louvre Hotel,"Welcome to Rivoli Louvre, a comfortable stay w...","{'hotel_name': [], 'location': [], 'landmark':..."
6,Park Plaza City,"Welcome to Plaza, a comfortable stay with park...","{'hotel_name': [], 'location': [], 'landmark':..."
7,Grand Plaza Termini,"Welcome to Grand Plaza Termini, a comfortable ...",{'hotel_name': [{'text': 'Grand Plaza Termini'...
8,Ginza Central Hotel,"Welcome to Ginza Central Hotel, a comfortable ...",{'hotel_name': [{'text': 'Ginza Central Hotel'...
9,Shinjuku City Hotel,"Welcome to Shinjuku City Hotel, a comfortable ...",{'hotel_name': [{'text': 'Shinjuku City Hotel'...


In [13]:
# Show the matches for the row at position 7 followed by its full description.
print(out.iloc[7].candidates, out.iloc[7].description, sep="\n")

{'hotel_name': [{'text': 'Grand Plaza Termini', 'start': 11, 'end': 30}], 'location': [], 'landmark': [{'text': 'Roma Termini', 'start': 168, 'end': 180}], 'address': [{'text': 'Via Marsala 25, 00185 Rome', 'start': 116, 'end': 142}]}
Welcome to Grand Plaza Termini, a comfortable stay with fast check-in, baggage hold, and late breakfast. Find us at Via Marsala 25, 00185 Rome, Rome. Just minutes from Roma Termini and popular local cafés, shops, and evening spots. If you’re arriving early, luggage storage is available until your room is ready. A small lounge makes a good spot to unwind after sightseeing. Transport tips: the front desk can arrange a taxi, and the closest train station is walkable.
