# 02b_llm_descriptions_only_v2.ipynb

Generate **dish_description** (no scoring) for each row using OpenAI, based on
`dish_name`, `dish_name_to_be_processed`, `restaurant_name`, and `city_name`.

### Inputs
- `project_x/data_cleaned/user_orders_clean.csv`

### Outputs
- `project_x/data_cleaned/user_orders_clean_with_description.csv`
- Cache (versioned): `project_x/data_cleaned/llm_run_cache__{PROMPT_VERSION}.jsonl`
- Coverage summary printed at the end

**Note:** Set the `OPENAI_API_KEY` in your environment (or use a `.env`).

In [2]:
# Optional: one-time installs (uncomment if needed)
# %pip install openai==1.51.0 pandas numpy tqdm tenacity python-dotenv
from pathlib import Path
import os, json, time, hashlib
from typing import Dict, Any
import pandas as pd
import numpy as np
from tqdm import tqdm
from tenacity import retry, stop_after_attempt, wait_exponential
from openai import OpenAI
from dotenv import load_dotenv

# Load .env if present
load_dotenv()

# Project root autodetect
cwd = Path.cwd()
if cwd.name == 'notebooks' and (cwd.parent / 'data_cleaned').exists():
    ROOT = cwd.parent
else:
    search = cwd
    ROOT = None
    for _ in range(5):
        if (search / 'data_cleaned').exists():
            ROOT = search
            break
        search = search.parent
    if ROOT is None:
        ROOT = cwd

DATA_CLEANED = ROOT / 'data_cleaned'
INPUT_CSV = DATA_CLEANED / 'user_orders_clean.csv'
OUTPUT_CSV = DATA_CLEANED / 'user_orders_clean_with_description.csv'

print('ROOT:', ROOT)
print('INPUT_CSV:', INPUT_CSV)
print('OUTPUT_CSV:', OUTPUT_CSV)
assert INPUT_CSV.exists(), f"Missing input CSV at {INPUT_CSV}"

# OpenAI client
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
assert OPENAI_API_KEY, 'OPENAI_API_KEY not set. Use a .env or export the variable.'
MODEL = os.getenv('OPENAI_MODEL', 'gpt-4o-mini')
TEMPERATURE = float(os.getenv('OPENAI_TEMPERATURE', '0.2'))
BATCH_SIZE = 10
ROW_LIMIT = int(os.getenv('ROW_LIMIT', '0'))  # 0 means no explicit limit
CHECKPOINT_INTERVAL = 50

client = OpenAI(api_key=OPENAI_API_KEY)

ROOT: /Users/sarveshaks/Documents/Documents - Sarvesha’s MacBook Pro (2)/Project Stea/project_x
INPUT_CSV: /Users/sarveshaks/Documents/Documents - Sarvesha’s MacBook Pro (2)/Project Stea/project_x/data_cleaned/user_orders_clean.csv
OUTPUT_CSV: /Users/sarveshaks/Documents/Documents - Sarvesha’s MacBook Pro (2)/Project Stea/project_x/data_cleaned/user_orders_clean_with_description.csv


In [3]:
# === Prompt versioning & helpers ===
PROMPT_VERSION = 'flavor-v2.3'  # bump this when you change rules

def sha256(text: str) -> str:
    return hashlib.sha256(text.encode('utf-8')).hexdigest()

# Versioned cache file per prompt version
CACHE_PATH = DATA_CLEANED / f"llm_run_cache__{PROMPT_VERSION}.jsonl"
print('CACHE_PATH:', CACHE_PATH)

CACHE_PATH: /Users/sarveshaks/Documents/Documents - Sarvesha’s MacBook Pro (2)/Project Stea/project_x/data_cleaned/llm_run_cache__flavor-v2.3.jsonl


In [4]:
# === System prompt (v2) with salt/version header ===
SYSTEM_PROMPT = (
    "You are a culinary analyst. Given basic fields about a menu item, produce a single JSON object with EXACTLY one key: 'description'. "
    "The 'description' must be 40–65 words and ALWAYS include all three sensory dimensions — taste, aroma, and texture. "
    "Use clear, natural language to create one compact sentence. \n\n"
    "• Taste cues: sweet, salty, sour, bitter, umami/savory, spicy/heat, peppery, tangy, bittersweet, smoky-sweet, honeyed, caramelized. \n"
    "• Aroma cues: go beyond 'garlicky/buttery/smoky/citrusy'; also use oniony, nutty, toasty, roasted, charred, woody, earthy, floral, herby/herbal "
    "(basil, oregano, thyme, rosemary, dill, mint), warm-spice (cumin, coriander, fennel, anise, clove, cardamom, cinnamon), sesame, peanutty, coconutty, lemongrass, kaffir-lime, vinegar-sharp. \n"
    "• Texture cues: crispy, crunchy, crackly, airy, flaky, tender, succulent/juicy, springy, bouncy, toothsome/al dente, silky/smooth/velvety, thick, coarse, crumbly, sticky, gooey, saucy, glaze-coated. \n\n"
    "Evenness requirement: Each description MUST contain at least one cue from EACH of taste, aroma, and texture. "
    "Prefer 1–3 cues per dimension, and vary vocabulary across items so the same aroma term isn’t reused excessively. "
    "If exact properties are uncertain, infer typical ones from cuisine, ingredients, or cooking method using hedges like 'likely', 'typically', or 'commonly'. \n\n"
    "Rules: Use only generally known culinary knowledge; do not invent brand-specific or proprietary details. "
    "Keep tone neutral, concise, and cuisine-aware. Mention sides or serving style only if space allows after all three sensory dimensions are covered. \n\n"
    "Output rules: Return ONLY valid JSON with a single key 'description' and a single string value. No markdown, no extra keys, no comments. "
    "The string must be one grammatical sentence that explicitly includes at least one taste cue, one aroma cue, and one texture cue as defined above. "
    "Avoid overusing 'garlicky', 'buttery', 'smoky', or 'citrusy' unless most natural; use broader, varied vocabulary."
)

# prepend version tag to ensure upstream caches miss
SYSTEM_PROMPT = f"[{PROMPT_VERSION}]\n" + SYSTEM_PROMPT.strip()
print('SYSTEM_PROMPT hash:', sha256(SYSTEM_PROMPT)[:12])

SYSTEM_PROMPT hash: 6c3a13ed591a


In [5]:
# === Validator: every description must have taste + aroma + texture ===
TASTE  = {"sweet","salty","sour","bitter","umami","savory","spicy","heat","peppery","tangy","bittersweet","smoky","smoky-sweet","caramelized","honeyed"}
AROMA  = {"garlic","garlicky","onion","oniony","butter","buttery","nutty","toasty","roasted","charred","woody","earthy","floral",
          "herb","herbal","basil","oregano","thyme","rosemary","dill","mint","cumin","coriander","fennel","anise","clove","cardamom",
          "cinnamon","sesame","peanut","peanutty","coconut","coconutty","lemongrass","kaffir","lime","vinegar","fragrant","aroma","paprika","ginger","turmeric","curry"}
TEXTURE= {"crispy","crunchy","crackly","airy","flaky","tender","juicy","succulent","springy","bouncy","toothsome","al dente",
          "silky","smooth","velvety","thick","coarse","crumbly","sticky","gooey","saucy","glaze","glaze-coated","creamy","chewy"}

def _has_any(text: str, vocab: set) -> bool:
    t = (text or '').lower()
    return any(v in t for v in vocab)

def valid_description(desc: str) -> bool:
    return _has_any(desc, TASTE) and _has_any(desc, AROMA) and _has_any(desc, TEXTURE)

In [6]:
# === Data load ===
df = pd.read_csv(INPUT_CSV)
needed = ['dish_name','dish_name_to_be_processed','restaurant_name','city_name']
for c in needed:
    assert c in df.columns, f"Missing column: {c}"

if 'dish_description' not in df.columns:
    df['dish_description'] = np.nan

print('Rows in CSV:', len(df))
display(df[needed].head(3))

Rows in CSV: 382


Unnamed: 0,dish_name,dish_name_to_be_processed,restaurant_name,city_name
0,Chicken shawarma gyro,chicken shawarma spicy sauce,Gyro Kingdom (NE Davis),Portland
1,Crispy Fries,crispy fries,Gyro Kingdom (NE Davis),Portland
2,Chicken Shawarma Plate,chicken shawarma plate,Gyro Kingdom (NE Davis),Portland


In [7]:
# === Cache loader (versioned) ===
cache = {}
if CACHE_PATH.exists():
    with open(CACHE_PATH, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                obj = json.loads(line)
                cache[obj['cache_key']] = obj
            except Exception:
                pass
print('Loaded cache items:', len(cache))

def make_cache_key(row: pd.Series, system_prompt: str) -> str:
    base = {
        'v': PROMPT_VERSION,
        'model': MODEL,
        'system_hash': sha256(system_prompt),
        'dish_name': str(row['dish_name']),
        'dish_name_to_be_processed': str(row['dish_name_to_be_processed']),
        'restaurant_name': str(row['restaurant_name']),
        'city_name': str(row['city_name'])
    }
    return json.dumps(base, sort_keys=True, ensure_ascii=False)

Loaded cache items: 0


In [8]:
# === Prompt builder & parsing ===
def build_user_prompt(row: pd.Series) -> str:
    payload = {
        'dish_name': str(row['dish_name']),
        'dish_name_to_be_processed': str(row['dish_name_to_be_processed']),
        'restaurant_name': str(row['restaurant_name']),
        'city_name': str(row['city_name'])
    }
    return (
        "Use the following fields to infer a realistic culinary description.\n" +
        json.dumps(payload, ensure_ascii=False)
    )

def safe_parse_json(text: str) -> Dict[str, Any]:
    try:
        obj = json.loads(text)
        if isinstance(obj, dict) and 'description' in obj and isinstance(obj['description'], str):
            return obj
    except Exception:
        pass
    return {'description': ''}

In [9]:
# === OpenAI call with retry ===
@retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=1, max=30), reraise=True)
def call_llm(system_prompt: str, user_prompt: str) -> Dict[str, Any]:
    resp = client.chat.completions.create(
        model=MODEL,
        temperature=TEMPERATURE,
        messages=[{"role":"system","content":system_prompt}, {"role":"user","content":user_prompt}],
    )
    text = resp.choices[0].message.content
    return safe_parse_json(text)

In [10]:
# === Worklist: only rows with missing/empty description (or limit via ROW_LIMIT) ===
needs = df[(df['dish_description'].isna()) | (df['dish_description'].astype(str).str.len() == 0)].copy()
if ROW_LIMIT > 0:
    needs = needs.head(ROW_LIMIT)
rows = needs.index.tolist()
print(f"Planned rows this run: {len(rows)} (of {len(df)})")

Planned rows this run: 382 (of 382)


In [11]:
# === Main loop: cache-aware + validator + salted retries ===
processed = 0
DATA_CLEANED.mkdir(parents=True, exist_ok=True)
with open(CACHE_PATH, 'a', encoding='utf-8') as cache_f:
    for i in tqdm(range(0, len(rows), BATCH_SIZE)):
        batch_idx = rows[i:i+BATCH_SIZE]
        for idx in batch_idx:
            row = df.loc[idx]
            key = make_cache_key(row, SYSTEM_PROMPT)
            result = None

            # 1) Try cache first, but re-validate
            cached = cache.get(key)
            if cached:
                cand = (cached.get('result') or {}).get('description', '')
                if cand and valid_description(cand):
                    result = cached['result']

            # 2) If no valid cache, generate with up to 3 salted retries
            if result is None:
                up = build_user_prompt(row)
                result = {'description': ''}
                for attempt in range(1, 4):
                    try:
                        salted = SYSTEM_PROMPT + f"\n[retry_hint={attempt}:{int(time.time())%100000}]"
                        r = call_llm(salted, up)
                        desc_try = (r.get('description') or '').strip()
                        if valid_description(desc_try):
                            result = r
                            break
                    except Exception:
                        pass

                cache_line = {'cache_key': key, 'result': result}
                cache_f.write(json.dumps(cache_line, ensure_ascii=False) + "\n")
                cache[key] = cache_line

            # 3) Assign if present
            desc = (result.get('description') or '').strip()
            if desc:
                df.at[idx, 'dish_description'] = desc

        processed += len(batch_idx)
        # periodic checkpoint
        if processed % CHECKPOINT_INTERVAL == 0:
            df.to_csv(OUTPUT_CSV, index=False)
            print('Checkpoint saved at', processed)

print('Processed rows:', processed)
df.to_csv(OUTPUT_CSV, index=False)
print('Final CSV written to:', OUTPUT_CSV)

  df.at[idx, 'dish_description'] = desc
 13%|█▎        | 5/39 [02:00<13:30, 23.83s/it]

Checkpoint saved at 50


 26%|██▌       | 10/39 [03:25<08:49, 18.25s/it]

Checkpoint saved at 100


 38%|███▊      | 15/39 [04:53<07:02, 17.60s/it]

Checkpoint saved at 150


 51%|█████▏    | 20/39 [06:28<05:53, 18.58s/it]

Checkpoint saved at 200


 64%|██████▍   | 25/39 [08:02<04:20, 18.59s/it]

Checkpoint saved at 250


 77%|███████▋  | 30/39 [09:37<02:46, 18.46s/it]

Checkpoint saved at 300


 90%|████████▉ | 35/39 [11:24<01:23, 20.75s/it]

Checkpoint saved at 350


100%|██████████| 39/39 [12:40<00:00, 19.50s/it]

Processed rows: 382
Final CSV written to: /Users/sarveshaks/Documents/Documents - Sarvesha’s MacBook Pro (2)/Project Stea/project_x/data_cleaned/user_orders_clean_with_description.csv





In [12]:
# === Coverage summary (great for slides) ===
def _has_any_series(series: pd.Series, vocab: set) -> int:
    return series.fillna('').str.lower().apply(lambda s: any(v in s for v in vocab)).sum()

total = len(df)
taste_n   = _has_any_series(df['dish_description'], TASTE)
aroma_n   = _has_any_series(df['dish_description'], AROMA)
texture_n = _has_any_series(df['dish_description'], TEXTURE)
all_three = df['dish_description'].fillna('').str.lower().apply(valid_description).sum()

summary = {
    'rows': total,
    'taste≥1': int(taste_n),
    'aroma≥1': int(aroma_n),
    'texture≥1': int(texture_n),
    'all_three': int(all_three),
    'all_three_pct': round(100*all_three/total, 1) if total else 0.0,
}
print('Coverage:', summary)
summary

Coverage: {'rows': 382, 'taste≥1': 363, 'aroma≥1': 363, 'texture≥1': 363, 'all_three': 363, 'all_three_pct': 95.0}


{'rows': 382,
 'taste≥1': 363,
 'aroma≥1': 363,
 'texture≥1': 363,
 'all_three': 363,
 'all_three_pct': 95.0}

## Optional: Clean `dish_description` to `dish_description_clean`
This section mirrors your cleaning helpers so you can create a cleaned column and write `final.csv`.

In [13]:
import regex, unicodedata
from typing import Set, Final, Pattern
from tqdm.auto import tqdm as tqdm_auto

tqdm_auto.pandas(desc="Cleaning progress")

def strip_accents(text: str) -> str:
    try:
        nfkd_form = unicodedata.normalize("NFKD", str(text))
        return "".join([c for c in nfkd_form if not unicodedata.combining(c)])
    except TypeError:
        return ""

RE_PAREN: Final[Pattern] = regex.compile(r"[\(\[].*?[\)\]]")
NORMALIZE_MAP: Final[dict[str, str]] = {"w/": " with ", "&": " and "}
RE_QUANTITY: Final[Pattern] = regex.compile(
    r"(\b(serves?|feeds?)\s*\d+\b|\b\d+\s*for\s*\$?\d+(\.\d+)?\b|\b(x\d+|\d+x)\b|\b\d+(\.\d+)?\s*(pc|pcs|ct|oz|lb|lbs|kg|g|ml|l|inch|in|'|\")s?\b|\b\d+(\.\d+)?s?\b)",
    flags=regex.IGNORECASE,
)
RE_INSTRUCTION: Final[Pattern] = regex.compile(
    r"(\b(no|without|w/o|extra|add|added|light|less|more|sub|swap)\s+[\p{L}]+\b|\bon\s+the\s+side\b)",
    flags=regex.IGNORECASE,
)
MARKETING_BLOCKLIST: Final[Set[str]] = {
    "appetizer","add","added","box","bucket","combo","ct","deal","deluxe","dessert","double","entree","family",
    "feeds","for","g","in","inch","jumbo","junior","kids","kg","l","large","lb","lbs","meal","medium","mini",
    "ml","oz","pack","party","pc","pcs","platter","regular","serves","single","small","special","starter","tray",
    "triple","value","xl","xxl","side"
}
RE_MARKETING: Final[Pattern] = regex.compile(r"\b(" + r"|".join(MARKETING_BLOCKLIST) + r")\b", flags=regex.IGNORECASE)
RE_FINAL_SYMBOLS: Final[Pattern] = regex.compile(r"[^a-z\s]")
DOMAIN_1CHAR_EXCEPTIONS: Final[Set[str]] = {"bbq"}
RE_CAPS_INSTRUCTION: Final[Pattern] = regex.compile(r"\b([A-Z]{2,}\s+){1,}[A-Z]{2,}\b")

def _final_token_cleanup(text: str) -> str:
    if not text:
        return ""
    tokens = text.split()
    final_tokens = []
    last_token = None
    for token in tokens:
        if token == last_token:
            continue
        if len(token) > 1 or token in DOMAIN_1CHAR_EXCEPTIONS:
            final_tokens.append(token)
        last_token = token
    return " ".join(final_tokens)

def build_restaurant_blocklist(name: str) -> Set[str]:
    if not name or not isinstance(name, str):
        return set()
    clean_name = strip_accents(name.lower())
    clean_name = RE_FINAL_SYMBOLS.sub(" ", clean_name)
    tokens = clean_name.split()
    return {token for token in tokens if len(token) >= 3}

def clean_text_column(text_str: str, restaurant_tokens: Set[str]) -> str:
    if not text_str or not isinstance(text_str, str):
        return ""
    clean_name = strip_accents(text_str.lower())
    clean_name = RE_CAPS_INSTRUCTION.sub(" ", clean_name)
    clean_name = RE_PAREN.sub(" ", clean_name)
    for sym, replacement in NORMALIZE_MAP.items():
        clean_name = clean_name.replace(sym, replacement)
    clean_name = RE_QUANTITY.sub(" ", clean_name)
    clean_name = RE_INSTRUCTION.sub(" ", clean_name)
    clean_name = RE_MARKETING.sub(" ", clean_name)
    if restaurant_tokens:
        try:
            restaurant_re = regex.compile(r"\b(" + r"|".join(regex.escape(token) for token in restaurant_tokens) + r")\b", flags=regex.IGNORECASE)
            clean_name = restaurant_re.sub(" ", clean_name)
        except (regex.error, TypeError):
            pass
    clean_name = RE_FINAL_SYMBOLS.sub(" ", clean_name)
    clean_name = regex.sub(r"\s+", " ", clean_name).strip()
    clean_name = _final_token_cleanup(clean_name)
    return clean_name

# Load produced file and create dish_description_clean
file_path = OUTPUT_CSV
df2 = pd.read_csv(file_path)
print(f"Loaded '{file_path}'. Shape: {df2.shape}")
if 'restaurant_tokens' not in df2.columns:
    df2['restaurant_tokens'] = df2['restaurant_name'].fillna("").progress_apply(build_restaurant_blocklist)
else:
    import ast
    df2['restaurant_tokens'] = df2['restaurant_tokens'].fillna("{}").progress_apply(ast.literal_eval)

print("Cleaning 'dish_description' → 'dish_description_clean'…")
df2['dish_description'] = df2['dish_description'].fillna("")
df2['dish_description_clean'] = df2.progress_apply(
    lambda row: clean_text_column(row['dish_description'], row['restaurant_tokens']), axis=1
)
OUTPUT_CSV2 = DATA_CLEANED / 'final.csv'
df2.to_csv(OUTPUT_CSV2, index=False)
print('Wrote final CSV to:', OUTPUT_CSV2)

  from .autonotebook import tqdm as notebook_tqdm


Loaded '/Users/sarveshaks/Documents/Documents - Sarvesha’s MacBook Pro (2)/Project Stea/project_x/data_cleaned/user_orders_clean_with_description.csv'. Shape: (382, 20)


Cleaning progress: 100%|██████████| 382/382 [00:00<00:00, 114102.27it/s]


Cleaning 'dish_description' → 'dish_description_clean'…


Cleaning progress: 100%|██████████| 382/382 [00:00<00:00, 8562.14it/s]

Wrote final CSV to: /Users/sarveshaks/Documents/Documents - Sarvesha’s MacBook Pro (2)/Project Stea/project_x/data_cleaned/final.csv



