# 02_xd_vector_generation.ipynb

Build a dataset-grounded **lexicon** (Taste/Aroma/Texture) from
`dish_name_to_be_processed` and generate the **XD flavor vectors**.

- **Input:** `project_x/data_cleaned/user_orders_clean.csv`
- **Outputs** (written to `project_x/data_cleaned/`): `lexicon.yaml`, `xd_vectors_32bit.csv`, `xd_report.txt`


In [1]:
from pathlib import Path
import re, sys, unicodedata, yaml
import pandas as pd
import numpy as np
from collections import Counter

# Locate the project root, i.e. the directory that contains `data_cleaned/`.
cwd = Path.cwd()
if cwd.name == 'notebooks' and (cwd.parent / 'data_cleaned').exists():
    # Kernel was started inside <root>/notebooks: the root is one level up.
    ROOT = cwd.parent
else:
    # Otherwise probe the working directory and up to four of its ancestors,
    # nearest first; fall back to the working directory if none qualifies.
    candidates = [cwd, *cwd.parents][:5]
    ROOT = next(
        (candidate for candidate in candidates if (candidate / 'data_cleaned').exists()),
        cwd,
    )

# Every input/output of this notebook lives under <root>/data_cleaned.
DATA_CLEANED = ROOT / 'data_cleaned'
INPUT_CSV = DATA_CLEANED / 'user_orders_clean.csv'
LEXICON_YAML = DATA_CLEANED / 'lexicon.yaml'
XD_CSV = DATA_CLEANED / 'xd_vectors_32bit.csv'
REPORT = DATA_CLEANED / 'xd_report.txt'

print('ROOT:', ROOT)
print('INPUT_CSV:', INPUT_CSV)
# Fail fast with a readable message if the cleaned orders file is absent.
assert INPUT_CSV.exists(), f"Missing {INPUT_CSV}."


ModuleNotFoundError: No module named 'yaml'

In [None]:
# Load the cleaned orders table and make sure the text column this notebook
# works from is present before doing anything else.
df = pd.read_csv(INPUT_CSV)
assert 'dish_name_to_be_processed' in df.columns, "Expected 'dish_name_to_be_processed' column."
print('Rows:', df.shape[0])
# Preview only the column of interest.
df.loc[:, ['dish_name_to_be_processed']].head(5)


In [None]:
def strip_accents(s: str) -> str:
    """Return ``s`` with combining marks (accents, diacritics) removed.

    The string is first decomposed with NFKD, which splits an accented letter
    into its base character followed by combining marks; every character whose
    combining class is non-zero is then dropped.
    """
    decomposed = unicodedata.normalize('NFKD', s)
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return ''.join(base_chars)

def tokenize(text: str):
    """Split a dish name into lowercase ASCII word tokens.

    The text is lowercased, accents are removed with ``strip_accents``, every
    character other than ``a-z`` and whitespace is replaced by a space (so
    digits and punctuation act as separators), and the result is split on
    whitespace.

    Args:
        text: The dish name. Non-string values are converted with ``str``.
            ``None`` and float NaN are treated as missing.

    Returns:
        list[str]: The tokens in order of appearance; empty for missing or
        token-free input.
    """
    # A missing value would otherwise be stringified and produce the bogus
    # tokens 'none' / 'nan'. (NaN is the only float that differs from itself.)
    if text is None or (isinstance(text, float) and text != text):
        return []
    t = strip_accents(str(text).lower())
    t = re.sub(r"[^a-z\s]", " ", t)
    # str.split() with no argument never yields empty strings.
    return t.split()

# Tokenize every dish name once. `rows_tokens` keeps one token list per row
# (aligned with `df`); `all_tokens` is the same data flattened in row order.
rows_tokens = [tokenize(name) for name in df['dish_name_to_be_processed'].fillna('')]
all_tokens = [token for row in rows_tokens for token in row]

# Corpus-wide token frequencies.
token_freq = Counter(all_tokens)
print('Unique tokens:', len(token_freq))

# Show the ten most frequent tokens.
top_tokens = pd.DataFrame(token_freq.most_common(30), columns=['token','count'])
top_tokens.head(10)


In [None]:
# Seed vocabularies -> later restricted to tokens that occur in the dataset.
# Each mapping goes from a sub-dimension name to its set of cue words; the key
# order is significant because it is carried into the lexicon.
seed_taste = {
    'sweet': {
        'brownie', 'cake', 'caramel', 'chocolate', 'cookie', 'frosting',
        'honey', 'icing', 'lemonade', 'milkshake', 'sugar', 'sweet',
    },
    'spicy': {'chili', 'hot', 'hotshot', 'jalapeno', 'pepper', 'spicy'},
    'sour': {'lemon', 'lime', 'sour', 'tamarind', 'tangy', 'vinegar'},
    'salty': {'chips', 'fries', 'salt', 'salty'},
    'umami': {
        'beef', 'biryani', 'burger', 'cheese', 'cheesesteak', 'cheesy',
        'chicken', 'fish', 'gravy', 'mushroom', 'nuggets', 'soy', 'umami',
    },
    'bitter': {'bitter', 'coffee', 'dark'},
}

seed_aroma = {
    'garlic': {'garlic', 'onion'},
    'buttery': {'butter', 'buttery', 'creamy', 'milk'},
    'smoky': {'bbq', 'grilled', 'smoked', 'smoky'},
    'spiced': {'curry', 'manchurian', 'masala', 'pepper'},
    'sweet_aroma': {'chocolate', 'vanilla'},
    'citrus': {'lemon', 'lime', 'orange', 'peach', 'wildberry'},
}

seed_texture = {
    'crispy': {'chips', 'crispy', 'crunchy', 'fried', 'fries', 'nuggets', 'rings'},
    'creamy': {'cheese', 'creamy', 'gravy', 'milk', 'sauce'},
    'soft': {'bread', 'bun', 'mashed', 'soft'},
    'chewy': {'chewy'},
}

# Distinct tokens observed in the dataset (the keys of the frequency table).
token_set = {token for token in token_freq}
def restrict(seed, vocab=None):
    """Keep only the seed words that occur in a vocabulary.

    Args:
        seed: Mapping of sub-dimension name -> iterable of candidate words.
        vocab: Optional collection of tokens to restrict against. When omitted,
            the notebook-level ``token_set`` is used, so existing
            ``restrict(seed)`` calls behave exactly as before.

    Returns:
        dict: For each sub-dimension with at least one surviving word, the
        sorted list of those words. Sub-dimensions with no surviving word are
        omitted; key order follows ``seed``.
    """
    # The global is only looked up when no explicit vocabulary is given, which
    # lets the function be used without relying on notebook state.
    allowed = token_set if vocab is None else set(vocab)
    out = {}
    for name, words in seed.items():
        keep = sorted(set(words) & allowed)
        if keep:
            out[name] = keep
    return out

# Dataset-grounded lexicon: each seed vocabulary reduced to the words that
# actually occur in the dish names.
lexicon = {
    'taste': restrict(seed_taste),
    'aroma': restrict(seed_aroma),
    'texture': restrict(seed_texture),
}

# Write through the LEXICON_YAML constant (same location as before) so the
# output path is defined in exactly one place; pin the encoding so the file
# does not depend on the platform default. sort_keys=False keeps the
# category / sub-dimension order as built above.
with open(LEXICON_YAML, 'w', encoding='utf-8') as f:
    yaml.safe_dump(lexicon, f, sort_keys=False)
lexicon


In [None]:
# Build binary XD vector
# One dimension per (category, sub-dimension) pair in the lexicon, named
# "<category>_<sub-dimension>", in lexicon order.
xd_dims = [f"{cat}_{sub}" for cat, subs in lexicon.items() for sub in subs]

def score(tokens):
    """Map a token list to its binary XD flags.

    Returns a dict with one entry per name in ``xd_dims`` (initialised to 0);
    the entry for ``"<category>_<sub>"`` is set to 1 when at least one of that
    lexicon sub-dimension's words appears among ``tokens``.
    """
    present = set(tokens)
    flags = dict.fromkeys(xd_dims, 0)
    for cat, subs in lexicon.items():
        for sub, words in subs.items():
            # Any shared word switches the dimension on.
            if not present.isdisjoint(words):
                flags[f"{cat}_{sub}"] = 1
    return flags

# Score every row's tokens into a dict of 0/1 flags.
rows = [score(t) for t in rows_tokens]

# Passing columns=xd_dims pins the column set and order to the XD dimensions,
# so the frame (and the CSV header) still carries every dimension when the
# input has no rows; for non-empty input the result is unchanged.
xd_df = pd.DataFrame(rows, columns=xd_dims, dtype=np.float32)
# Share df's index so the column-wise concat below lines rows up one-to-one.
xd_df.index = df.index

# Dish name followed by its XD flags, written without the index column.
out = pd.concat([df[['dish_name_to_be_processed']], xd_df], axis=1)
out.to_csv(XD_CSV, index=False)
out.head(10)


In [None]:
# Report: row count, number of XD dimensions, and for the first ten rows the
# dimensions that are switched on.
lines = [
    f"Rows: {len(df)}",
    f"XD dims: {len(xd_dims)}",
    "\nFirst 10 rows with positive dims:",
]
for _, r in out.head(10).iterrows():
    pos = [d for d in xd_dims if int(r.get(d, 0)) == 1]
    lines.append(f" - {r['dish_name_to_be_processed']} => {pos}")

# Dish names are written verbatim and may contain non-ASCII characters; pin
# UTF-8 so the write does not depend on (or fail under) the platform default.
Path(REPORT).write_text("\n".join(lines), encoding='utf-8')
print('Wrote', REPORT)
