In [17]:
import re
import hashlib
import pandas as pd
import sys
import os

# -----------------------
# Load
# -----------------------

repo_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(os.path.join(repo_root, "src"))


data_path = os.path.join(repo_root, "data", "hotel_toy_dataset_50_en_welcome_style_noisy.csv")
df = pd.read_csv(data_path) 
df["text"] = df["description"].astype(str)

# -----------------------
# 1) Length & shape
# -----------------------
_word_re = re.compile(r"\b\w+\b")

df["n_chars"] = df["text"].str.len()
df["n_words"] = df["text"].apply(lambda t: len(_word_re.findall(t)))
df["avg_word_len"] = df.apply(lambda r: (r["n_chars"] / r["n_words"]) if r["n_words"] else 0.0, axis=1)

# -----------------------
# 2) Exact duplicates (normalized hash)
# -----------------------
def norm_text(t: str) -> str:
    t = t.lower().strip()
    t = re.sub(r"\s+", " ", t)
    return t


df["text_norm"] = df["text"].apply(norm_text)
df["is_exact_dup"] = df.duplicated(subset=["text_norm"], keep="first")

# -----------------------
# 3) Entity density proxies (cheap)
# -----------------------
df["has_digit"] = df["text"].str.contains(r"\d", regex=True)
df["comma_count"] = df["text"].str.count(",")
df["punct_density"] = df["text"].str.count(r"[,:;.\-()]") / df["n_chars"].clip(lower=1)

# crude English-only proxy: ratio of tokens that start with uppercase
def cap_ratio(t: str) -> float:
    toks = _word_re.findall(t)
    if not toks:
        return 0.0
    caps = sum(1 for w in toks if w[:1].isupper())
    return caps / len(toks)

df["cap_word_ratio"] = df["text"].apply(cap_ratio)

# -----------------------
# 4) OPTIONAL: list-coverage proxy with Aho–Corasick
# (fast multi-pattern exact matching)  :contentReference[oaicite:2]{index=2}
# -----------------------
USE_LISTS = True  # set True if you have hotel_names list etc.

if USE_LISTS:
    from flashtext import KeywordProcessor  # pip install flashtext

    def build_processor(strings):
        processor = KeywordProcessor()
        for s in strings:
            s = (s or "").strip()
            if not s:
                continue
            processor.add_keyword(s)
        return processor

    # load your lists (example)
    hotel_names = df.hotel_name.dropna().unique().tolist()
    poi_names = df.landmark.dropna().unique().tolist()
    addr_strings = df.address.dropna().unique().tolist()

    proc_hotel = build_processor(hotel_names)
    proc_poi = build_processor(poi_names)
    proc_addr = build_processor(addr_strings)

    def has_any(processor, text: str) -> bool:
        return len(processor.extract_keywords(text)) > 0

    df["has_exact_hotel"] = df["text"].apply(lambda t: has_any(proc_hotel, t))
    df["has_exact_poi"] = df["text"].apply(lambda t: has_any(proc_poi, t))
    df["has_exact_addr"] = df["text"].apply(lambda t: has_any(proc_addr, t))

# -----------------------
# Summary stats you actually use to sample
# -----------------------
summary = {
    "rows": len(df),
    "dup_rate": float(df["is_exact_dup"].mean()),
    "n_words_p50": float(df["n_words"].quantile(0.50)),
    "n_words_p90": float(df["n_words"].quantile(0.90)),
    "has_digit_rate": float(df["has_digit"].mean()),
}
print(summary)

# Save profile output for sampling / plotting
out_path = os.path.join(repo_root, "outputs", "hotel_toy_dataset_50_en_profile.parquet")
df.drop(columns=["text_norm"], errors="ignore").to_parquet(out_path, index=False)
print("Wrote:", out_path)


{'rows': 50, 'dup_rate': 0.0, 'n_words_p50': 78.5, 'n_words_p90': 85.1, 'has_digit_rate': 1.0}
Wrote: /Users/ruddigarcia/Projects/ner/outputs/hotel_toy_dataset_50_en_profile.parquet


In [15]:
df.head()

Unnamed: 0,description,language,hotel_name,location,landmark,address,text,n_chars,n_words,avg_word_len,text_norm,is_exact_dup,has_digit,comma_count,punct_density,cap_word_ratio,has_exact_hotel,has_exact_poi,has_exact_addr
0,"Welcome to Krasnapolsky Amsterdam, a comfortab...",en,Krasnapolsky Hotel Amsterdam,"Amsterdam, Netherlands",Dam Square,"Damrak 96, 1012 LP Amsterdam","Welcome to Krasnapolsky Amsterdam, a comfortab...",502,81,6.197531,"welcome to krasnapolsky amsterdam, a comfortab...",False,True,9,0.035857,0.17284,False,True,True
1,"Welcome to Rembrandt Square Hotel, a comfortab...",en,Rembrandt Square Hotel,"Amsterdam, Netherlands",Rembrandtplein,"Amstelstraat 20, 1017 DA Amsterdam","Welcome to Rembrandt Square Hotel, a comfortab...",504,81,6.222222,"welcome to rembrandt square hotel, a comfortab...",False,True,9,0.031746,0.17284,True,True,True
2,"Welcome to Grand Plaza, a comfortable stay wit...",en,Grand Plaza Midtown,"New York, USA",Times Square,"350 W 42nd St, New York, NY 10036","Welcome to Grand Plaza, a comfortable stay wit...",420,74,5.675676,"welcome to grand plaza, a comfortable stay wit...",False,True,9,0.035714,0.216216,False,True,True
3,"Welcome to Central, a comfortable stay with lu...",en,Central Station City Hotel,"Edinburgh, United Kingdom",Edinburgh Waverley Station,"12 Station Rd, Edinburgh EH1 1BB","Welcome to Central, a comfortable stay with lu...",487,80,6.0875,"welcome to central, a comfortable stay with lu...",False,True,9,0.034908,0.1875,False,True,True
4,"Welcome to Harbour Bridge Plaza Hotel, a comfo...",en,Harbour Bridge Plaza Hotel,"Sydney, Australia",Sydney Harbour Bridge,"98 Cumberland St, The Rocks NSW 2000","Welcome to Harbour Bridge Plaza Hotel, a comfo...",505,82,6.158537,"welcome to harbour bridge plaza hotel, a comfo...",False,True,9,0.033663,0.231707,True,True,True


In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split



# 1) bucket text length (quantile bins)
df["len_bin"] = pd.qcut(df["n_words"], q=5, labels=False, duplicates="drop")

# 2) build a single stratification label (string key)
df["stratum"] = (
    df["len_bin"].astype(str) + "|" +
    df["has_digit"].astype(int).astype(str) + "|" +
    df["has_exact_hotel"].astype(int).astype(str)
)



# 3) stratified split
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=df["stratum"],  
)

print(len(train_df), len(test_df))



40 10


In [19]:
train_df.head()

Unnamed: 0,description,language,hotel_name,location,landmark,address,text,n_chars,n_words,avg_word_len,...,is_exact_dup,has_digit,comma_count,punct_density,cap_word_ratio,has_exact_hotel,has_exact_poi,has_exact_addr,len_bin,stratum
40,"Welcome to Millennium Bridge Hotel, a comforta...",en,Millennium Bridge Hotel,"London, United Kingdom",Millennium Bridge,"2 River Walk, London SE1 9PP","Welcome to Millennium Bridge Hotel, a comforta...",443,75,5.906667,...,False,True,9,0.036117,0.2,True,True,True,1,1|1|1
44,"Welcome to Central Harbor Hotel, a comfortable...",en,Central Harbor Hotel,"Auckland, New Zealand",Viaduct Harbour,"15 Quay St, Auckland 1010","Welcome to Central Harbor Hotel, a comfortable...",474,78,6.076923,...,False,True,9,0.035865,0.192308,True,True,True,2,2|1|1
2,"Welcome to Grand Plaza, a comfortable stay wit...",en,Grand Plaza Midtown,"New York, USA",Times Square,"350 W 42nd St, New York, NY 10036","Welcome to Grand Plaza, a comfortable stay wit...",420,74,5.675676,...,False,True,9,0.035714,0.216216,False,True,True,1,1|1|0
38,"Welcome to Grand Plaza, a comfortable stay wit...",en,Grand Plaza Airport,"Miami, USA",Miami International Airport,"1 Airport Blvd, Miami, FL 33126","Welcome to Grand Plaza, a comfortable stay wit...",498,82,6.073171,...,False,True,9,0.032129,0.195122,False,True,True,3,3|1|0
49,"Welcome to Rembrandtplein Suites, a comfortabl...",en,Rembrandtplein Suites Hotel,"Amsterdam, Netherlands",Rembrandtplein,"1 Amstelstraat, 1017 DA Amsterdam","Welcome to Rembrandtplein Suites, a comfortabl...",437,66,6.621212,...,False,True,8,0.032037,0.181818,False,True,True,0,0|1|0
