# 02b_llm_descriptions_only.ipynb

Generate **dish_description** (no scoring) for each row using OpenAI, based on
`dish_name`, `dish_name_to_be_processed`, `restaurant_name`, and `city_name`.

### Inputs
- `project_x/data_cleaned/user_orders_clean.csv`

### Outputs
- `project_x/data_cleaned/user_orders_clean_with_description.csv`
- `project_x/data_cleaned/final.csv` (same rows plus the cleaned `dish_description_clean` column)
- Cache: `project_x/data_cleaned/llm_run_cache.jsonl`

**Note:** Set the `OPENAI_API_KEY` in your environment (or use a `.env`).

In [1]:
# Optional: one-time installs (uncomment if needed)
# %pip install openai==1.51.0 pandas numpy tqdm tenacity python-dotenv
from pathlib import Path
import os, json
from typing import Dict, Any
import pandas as pd
import numpy as np
from tqdm import tqdm
from tenacity import retry, stop_after_attempt, wait_exponential
from openai import OpenAI
from dotenv import load_dotenv

# Pick up variables from a local .env file when one exists
load_dotenv()

# Project root autodetect: prefer the parent of a `notebooks/` working directory;
# otherwise take the first of the working directory and its nearest ancestors
# (five candidates at most) that contains `data_cleaned/`, falling back to the
# working directory itself when none does.
cwd = Path.cwd()
if cwd.name == 'notebooks' and (cwd.parent / 'data_cleaned').exists():
    ROOT = cwd.parent
else:
    nearby_dirs = [cwd, *cwd.parents][:5]
    ROOT = next((d for d in nearby_dirs if (d / 'data_cleaned').exists()), cwd)

# Input, output and cache locations, all under <ROOT>/data_cleaned
DATA_CLEANED = ROOT / 'data_cleaned'
INPUT_CSV = DATA_CLEANED / 'user_orders_clean.csv'
OUTPUT_CSV = DATA_CLEANED / 'user_orders_clean_with_description.csv'
CACHE_PATH = DATA_CLEANED / 'llm_run_cache.jsonl'

print('ROOT:', ROOT)
print('INPUT_CSV:', INPUT_CSV)
print('OUTPUT_CSV:', OUTPUT_CSV)
assert INPUT_CSV.exists(), f"Missing input CSV at {INPUT_CSV}"

# OpenAI client and run settings; the os.getenv-backed values can be overridden
# through environment variables of the same name.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
assert OPENAI_API_KEY, 'OPENAI_API_KEY not set. Use a .env or export the variable.'
MODEL = os.getenv('OPENAI_MODEL', 'gpt-4o-mini')
TEMPERATURE = float(os.getenv('OPENAI_TEMPERATURE', '0.2'))
BATCH_SIZE = 10            # rows handled per progress-bar step in the main loop
ROW_LIMIT = int(os.getenv('ROW_LIMIT', '0'))  # 0 means no explicit limit
CHECKPOINT_INTERVAL = 50   # rows between intermediate CSV saves

client = OpenAI(api_key=OPENAI_API_KEY)


ROOT: /Users/sarveshaks/Documents/Documents - Sarvesha’s MacBook Pro (2)/Project Stea/project_x
INPUT_CSV: /Users/sarveshaks/Documents/Documents - Sarvesha’s MacBook Pro (2)/Project Stea/project_x/data_cleaned/user_orders_clean.csv
OUTPUT_CSV: /Users/sarveshaks/Documents/Documents - Sarvesha’s MacBook Pro (2)/Project Stea/project_x/data_cleaned/user_orders_clean_with_description.csv


In [2]:
# Load data
df = pd.read_csv(INPUT_CSV)
needed = ['dish_name','dish_name_to_be_processed','restaurant_name','city_name']
for c in needed:
    assert c in df.columns, f"Missing column: {c}"

# Keep `dish_description` as an object (string-capable) column. A column that is
# created from NaN, or read from a CSV where it is entirely empty, is float64,
# and writing strings into a float64 column makes pandas emit an
# incompatible-dtype FutureWarning on assignment.
if 'dish_description' not in df.columns:
    df['dish_description'] = pd.Series(np.nan, index=df.index, dtype='object')
else:
    df['dish_description'] = df['dish_description'].astype('object')

print('Rows in CSV:', len(df))
df[needed].head(3)

Rows in CSV: 382


Unnamed: 0,dish_name,dish_name_to_be_processed,restaurant_name,city_name
0,Chicken shawarma gyro,chicken shawarma spicy sauce,Gyro Kingdom (NE Davis),Portland
1,Crispy Fries,crispy fries,Gyro Kingdom (NE Davis),Portland
2,Chicken Shawarma Plate,chicken shawarma plate,Gyro Kingdom (NE Davis),Portland


In [3]:
# Cache loader: rebuild the in-memory cache from the JSONL file written by
# earlier runs. Each line is an object with a 'cache_key' entry; when a key
# occurs more than once, the last line read wins.
cache = {}
skipped_lines = 0
if CACHE_PATH.exists():
    with open(CACHE_PATH, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue  # blank lines carry no entry and are not an error
            try:
                obj = json.loads(line)
                cache[obj['cache_key']] = obj
            except (json.JSONDecodeError, KeyError, TypeError):
                # Best effort: one damaged line must not block the run, but it
                # is counted so the damage is visible instead of silent.
                skipped_lines += 1
print('Loaded cache items:', len(cache))
if skipped_lines:
    print('Skipped malformed cache lines:', skipped_lines)

def make_cache_key(row: pd.Series) -> str:
    """Return the cache key for a row.

    The key is a JSON object (keys sorted, non-ASCII characters kept as-is)
    holding the ``str()`` of the four fields that are sent to the model.
    """
    key_fields = ('dish_name', 'dish_name_to_be_processed', 'restaurant_name', 'city_name')
    identity = {field: str(row[field]) for field in key_fields}
    return json.dumps(identity, sort_keys=True, ensure_ascii=False)


Loaded cache items: 382


In [4]:
# Prompts (descriptions only)
# SYSTEM_PROMPT is passed to call_llm as the system message for every row.
# It asks the model for a JSON object with a single 'description' key, which is
# the shape safe_parse_json accepts; the sensory-cue lists below are guidance
# for the model, not something this notebook validates.
SYSTEM_PROMPT = (
    "You are a culinary analyst. Given basic fields about a menu item, produce a single JSON object with EXACTLY one key: 'description'. "
    "The 'description' must be 40–65 words and ALWAYS include all three sensory dimensions — taste, aroma, and texture. "
    "Use clear, natural language to create one compact sentence. \n\n"
    "• Taste cues: sweet, salty, sour, bitter, umami/savory, spicy/heat, peppery, tangy, bittersweet, smoky-sweet, honeyed, caramelized. \n"
    "• Aroma cues: go beyond 'garlicky/buttery/smoky/citrusy'; also use oniony, nutty, toasty, roasted, charred, woody, earthy, floral, herby/herbal "
    "(basil, oregano, thyme, rosemary, dill, mint), warm-spice (cumin, coriander, fennel, anise, clove, cardamom, cinnamon), sesame, peanutty, coconutty, lemongrass, kaffir-lime, vinegar-sharp. \n"
    "• Texture cues: crispy, crunchy, crackly, airy, flaky, tender, succulent/juicy, springy, bouncy, toothsome/al dente, silky/smooth/velvety, thick, coarse, crumbly, sticky, gooey, saucy, glaze-coated. \n\n"
    "Evenness requirement: Each description MUST contain at least one cue from EACH of taste, aroma, and texture. "
    "Prefer 1–3 cues per dimension, and vary vocabulary across items so the same aroma term isn’t reused excessively. "
    "If exact properties are uncertain, infer typical ones from cuisine, ingredients, or cooking method using hedges like 'likely', 'typically', or 'commonly'. \n\n"
    "Rules: Use only generally known culinary knowledge; do not invent brand-specific or proprietary details. "
    "Keep tone neutral, concise, and cuisine-aware. Mention sides or serving style only if space allows after all three sensory dimensions are covered. \n\n"
    "Output rules: Return ONLY valid JSON with a single key 'description' and a single string value. No markdown, no extra keys, no comments. "
    "The string must be one grammatical sentence that explicitly includes at least one taste cue, one aroma cue, and one texture cue as defined above. "
    "Avoid overusing 'garlicky', 'buttery', 'smoky', or 'citrusy' unless most natural; use broader, varied vocabulary."
)

def build_user_prompt(row: pd.Series) -> str:
    """Build the user message for one row.

    The message is a fixed instruction line followed by a JSON object with the
    ``str()`` of the four input fields, in the order listed below.
    """
    field_order = ('dish_name', 'dish_name_to_be_processed', 'restaurant_name', 'city_name')
    fields_json = json.dumps({name: str(row[name]) for name in field_order}, ensure_ascii=False)
    instruction = "Use the following fields to infer a realistic culinary description.\n"
    return instruction + fields_json

def safe_parse_json(text: str) -> Dict[str, Any]:
    """Extract the ``{'description': <str>}`` object from a model reply.

    The reply is parsed as JSON as-is first. If that does not yield a dict with
    a string 'description', the span from the first '{' to the last '}' is
    tried, which recovers replies wrapped in markdown code fences or
    surrounded by prose.

    Args:
        text: Raw reply content; may be None or a non-string.

    Returns:
        The parsed dict (extra keys, if any, are kept) when a string
        'description' is found, otherwise ``{'description': ''}``. Never raises
        for malformed input.
    """
    if not isinstance(text, str):
        return {'description': ''}

    candidates = [text]
    start, end = text.find('{'), text.rfind('}')
    if 0 <= start < end:
        inner = text[start:end + 1]
        if inner != text:
            candidates.append(inner)

    for candidate in candidates:
        try:
            obj = json.loads(candidate)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        if isinstance(obj, dict) and isinstance(obj.get('description'), str):
            return obj
    return {'description': ''}

@retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=1, max=30), reraise=True)
def call_llm(system_prompt: str, user_prompt: str) -> Dict[str, Any]:
    """Send one system+user exchange to the chat model and parse the reply.

    Exceptions raised by the request are retried by the decorator (up to 5
    attempts, exponential wait capped between 1 and 30 seconds); the last
    exception is re-raised. The reply text goes through safe_parse_json, so
    the caller receives a dict with a 'description' entry.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    completion = client.chat.completions.create(
        model=MODEL,
        temperature=TEMPERATURE,
        messages=conversation,
    )
    return safe_parse_json(completion.choices[0].message.content)


In [5]:
# Worklist: only rows with missing/empty description.
# Whitespace-only values count as empty too, matching the main loop, which
# strips a description before deciding whether to store it.
is_missing = df['dish_description'].isna() | df['dish_description'].astype(str).str.strip().eq('')
needs = df[is_missing].copy()
# Re-read the limit so changing the environment variable and re-running this
# cell is enough to resize the run.
ROW_LIMIT = int(os.getenv('ROW_LIMIT', '0'))
if ROW_LIMIT > 0:
    needs = needs.head(ROW_LIMIT)
rows = needs.index.tolist()
print(f"Planned rows this run: {len(rows)} (of {len(df)})")


Planned rows this run: 382 (of 382)


In [6]:
# Main loop with caching and checkpoints
# For every planned row: reuse a cached description when one exists, otherwise
# ask the model. Successful results are appended to the cache file right away
# so an interrupted run can resume without repeating paid calls.
processed = 0
last_checkpoint = 0
failed_calls = 0
with open(CACHE_PATH, 'a', encoding='utf-8') as cache_f:
    for i in tqdm(range(0, len(rows), BATCH_SIZE)):
        batch_idx = rows[i:i+BATCH_SIZE]
        for idx in batch_idx:
            row = df.loc[idx]
            key = make_cache_key(row)
            if key in cache and cache[key].get('result', {}).get('description'):
                result = cache[key]['result']
            else:
                up = build_user_prompt(row)
                try:
                    result = call_llm(SYSTEM_PROMPT, up)
                except Exception as exc:
                    # Best effort: keep going, but make the failure visible.
                    failed_calls += 1
                    print(f"LLM call failed for row {idx}: {exc!r}")
                    result = {'description': ''}
                # Only non-empty results are cached: an empty one is never
                # accepted as a cache hit above, so storing it would only grow
                # the file on every run.
                if result.get('description', '').strip():
                    cache_line = {'cache_key': key, 'result': result}
                    cache_f.write(json.dumps(cache_line, ensure_ascii=False) + "\n")
                    cache_f.flush()  # keep the on-disk cache current if the kernel dies
                    cache[key] = cache_line

            desc = result.get('description', '').strip()
            if desc:
                df.at[idx, 'dish_description'] = desc
        processed += len(batch_idx)

        # periodic checkpoint: compare against the last save instead of using a
        # modulo test, which only fires when BATCH_SIZE divides the interval
        if processed - last_checkpoint >= CHECKPOINT_INTERVAL:
            df.to_csv(OUTPUT_CSV, index=False)
            last_checkpoint = processed
            print('Checkpoint saved at', processed)

print('Processed rows:', processed)
if failed_calls:
    print('Rows whose LLM call failed (left without description):', failed_calls)
df.to_csv(OUTPUT_CSV, index=False)
print('Final CSV written to:', OUTPUT_CSV)


  df.at[idx, 'dish_description'] = desc
100%|██████████| 39/39 [00:00<00:00, 1084.12it/s]

Checkpoint saved at 50
Checkpoint saved at 100
Checkpoint saved at 150
Checkpoint saved at 200
Checkpoint saved at 250
Checkpoint saved at 300
Checkpoint saved at 350
Processed rows: 382
Final CSV written to: /Users/sarveshaks/Documents/Documents - Sarvesha’s MacBook Pro (2)/Project Stea/project_x/data_cleaned/user_orders_clean_with_description.csv





In [7]:
# ─────────────────────────────────────────────────────────────
# Re-define all cleaning helpers and regex (for a standalone cell)
# ─────────────────────────────────────────────────────────────
import pandas as pd
import regex
import unicodedata
from typing import Set, Final, Pattern
from tqdm.auto import tqdm

# Initialize tqdm for pandas: this makes `progress_apply` available on the
# Series/DataFrame objects used further down, with "Cleaning progress" as the
# label shown on each progress bar.
tqdm.pandas(desc="Cleaning progress")

# --- 1. Helper Functions ---

def strip_accents(text: str) -> str:
    """Return ``str(text)`` with accents (combining marks) removed.

    The value is NFKD-normalized and every combining character is dropped, so
    "é" becomes "e". Characters that have no decomposition are kept unchanged,
    which means the result is not guaranteed to be pure ASCII. An empty string
    is returned if a TypeError is raised along the way.
    """
    try:
        decomposed = unicodedata.normalize("NFKD", str(text))
        base_chars = (ch for ch in decomposed if not unicodedata.combining(ch))
        return "".join(base_chars)
    except TypeError:
        return ""

# --- 2. Compiled Regex Patterns ---

# Remove parentheticals: the shortest span that opens with "(" or "[" and
# closes with ")" or "]" (the bracket kinds are not required to match).
RE_PAREN: Final[Pattern] = regex.compile(r"[\(\[].*?[\)\]]")

# Normalize symbols: plain substring replacements applied by clean_text_column.
NORMALIZE_MAP: Final[dict[str, str]] = {
    "w/": " with ",
    "&": " and ",
}

# Remove quantities, units, and serving sizes. Alternatives, in order:
#   "serves 4" / "feeds 2", "2 for $5", "x2" / "2x", a number followed by a
#   unit (pc, oz, lb, inch, ...), and finally any remaining standalone number.
RE_QUANTITY: Final[Pattern] = regex.compile(
    r"(\b(serves?|feeds?)\s*\d+\b"
    r"|\b\d+\s*for\s*\$?\d+(\.\d+)?\b"
    r"|\b(x\d+|\d+x)\b"
    r"|\b\d+(\.\d+)?\s*(pc|pcs|ct|oz|lb|lbs|kg|g|ml|l|inch|in|'|\")s?\b"
    r"|\b\d+(\.\d+)?s?\b)",
    flags=regex.IGNORECASE,
)

# Remove instruction phrases: a modifier word (no, without, extra, add, ...)
# together with the single word of letters that follows it, or the literal
# phrase "on the side".
RE_INSTRUCTION: Final[Pattern] = regex.compile(
    r"(\b(no|without|w/o|extra|add|added|light|less|more|sub|swap)\s+[\p{L}]+\b"
    r"|\bon\s+the\s+side\b)",
    flags=regex.IGNORECASE,
)

# Marketing, packaging, and other noise tokens. Each entry is removed as a
# whole word, case-insensitively; note the list includes everyday words such as
# "for", "in" and "side".
MARKETING_BLOCKLIST: Final[Set[str]] = {
    "appetizer", "add", "added", "box", "bucket", "combo", "ct",
    "deal", "deluxe", "dessert", "double", "entree", "family",
    "feeds", "for", "g", "in", "inch", "jumbo", "junior", "kids",
    "kg", "l", "large", "lb", "lbs", "meal", "medium", "mini",
    "ml", "oz", "pack", "party", "pc", "pcs", "platter", "regular",
    "serves", "single", "small", "special", "starter", "tray",
    "triple", "value", "xl", "xxl", "side"
}
# The entries are joined into the pattern without escaping, so they must stay
# free of regex metacharacters.
RE_MARKETING: Final[Pattern] = regex.compile(
    r"\b(" + r"|".join(MARKETING_BLOCKLIST) + r")\b", flags=regex.IGNORECASE
)

# Final cleanup: matches every character that is not a lowercase a-z letter or
# whitespace, so uppercase letters are matched as well (input is expected to be
# lowercased first).
RE_FINAL_SYMBOLS: Final[Pattern] = regex.compile(r"[^a-z\s]")

# Domain-specific 1-char tokens to keep
# NOTE(review): _final_token_cleanup only consults this set for tokens of
# length <= 1, and "bbq" has three characters, so this entry currently has no
# effect - confirm what was intended.
DOMAIN_1CHAR_EXCEPTIONS: Final[Set[str]] = {"bbq"}

# Customization-specific noise (also useful for descriptions): a run of two or
# more whitespace-separated words written entirely in capitals A-Z. The pattern
# is case-sensitive, so it can only match text that has not been lowercased.
RE_CAPS_INSTRUCTION: Final[Pattern] = regex.compile(
    r"\b([A-Z]{2,}\s+){1,}[A-Z]{2,}\b"
)

# --- 3. Token Cleanup Functions ---

def _final_token_cleanup(text: str) -> str:
    """De-duplicates tokens and removes 1-char tokens."""
    if not text:
        return ""
    tokens = text.split()
    final_tokens = []
    last_token = None
    for token in tokens:
        if token == last_token:
            continue
        if len(token) > 1 or token in DOMAIN_1CHAR_EXCEPTIONS:
            final_tokens.append(token)
        last_token = token
    return " ".join(final_tokens)

def build_restaurant_blocklist(name: str) -> Set[str]:
    """Derive the set of noise tokens contributed by a restaurant name.

    The name is lowercased, stripped of accents and reduced to letters and
    whitespace; every resulting word of three or more characters is returned.
    An empty or non-string name gives an empty set.
    """
    if not (name and isinstance(name, str)):
        return set()
    letters_only = RE_FINAL_SYMBOLS.sub(" ", strip_accents(name.lower()))
    blocklist = set()
    for word in letters_only.split():
        if len(word) >= 3:
            blocklist.add(word)
    return blocklist

# --- 4. Generic Cleaning Function ---

def clean_text_column(text_str: str, restaurant_tokens: Set[str]) -> str:
    """
    Applies the full cleaning pipeline to a generic text string.

    Steps, in order: accent folding, removal of all-caps runs, lowercasing,
    removal of bracketed text, symbol normalization, removal of quantities,
    instruction phrases, marketing words and restaurant-name tokens, then a
    final symbol strip and token-level cleanup.

    Args:
        text_str: Raw text; anything empty or not a ``str`` yields "".
        restaurant_tokens: Words taken from the restaurant name that should be
            removed from the text; an empty collection skips that step.

    Returns:
        The cleaned, lowercase, space-separated text.
    """
    import warnings  # local import: only used on the failure path of step 9

    if not text_str or not isinstance(text_str, str):
        return ""

    # 1. Fold accents but keep the case: the all-caps pattern of step 2 is
    #    case-sensitive and can never match once the text is lowercased.
    clean_name = strip_accents(text_str)

    # 2. Remove all-caps instructional noise
    clean_name = RE_CAPS_INSTRUCTION.sub(" ", clean_name)

    # 3. Lowercase everything that is left
    clean_name = clean_name.lower()

    # 4. Remove parentheticals
    clean_name = RE_PAREN.sub(" ", clean_name)

    # 5. Normalize common symbols. "w/o" has to be expanded first: otherwise
    #    the "w/" rule rewrites "w/o onions" to " with o onions", which inverts
    #    the meaning and hides it from the instruction pattern in step 7.
    #    The word boundary keeps "w/onions" (with onions) out of this rule.
    clean_name = regex.sub(r"\bw/o\b", " without ", clean_name)
    for sym, replacement in NORMALIZE_MAP.items():
        clean_name = clean_name.replace(sym, replacement)

    # 6. Remove quantities and units
    clean_name = RE_QUANTITY.sub(" ", clean_name)

    # 7. Remove instruction phrases
    clean_name = RE_INSTRUCTION.sub(" ", clean_name)

    # 8. Remove marketing/packaging words
    clean_name = RE_MARKETING.sub(" ", clean_name)

    # 9. Remove restaurant-specific tokens (if any)
    if restaurant_tokens:
        try:
            # Sorted so equal token sets always produce the same pattern text.
            restaurant_re = regex.compile(
                r"\b(" + r"|".join(regex.escape(token) for token in sorted(restaurant_tokens)) + r")\b",
                flags=regex.IGNORECASE
            )
        except (regex.error, TypeError) as exc:
            # Best effort: keep cleaning without this step, but say so.
            warnings.warn(f"Skipping restaurant-token removal for {restaurant_tokens!r}: {exc}")
        else:
            clean_name = restaurant_re.sub(" ", clean_name)

    # 10. Final symbol strip and whitespace collapse
    clean_name = RE_FINAL_SYMBOLS.sub(" ", clean_name)
    clean_name = regex.sub(r"\s+", " ", clean_name).strip()

    # 11. Final token-level cleanup (dedup, 1-char removal)
    clean_name = _final_token_cleanup(clean_name)

    return clean_name

# ─────────────────────────────────────────────────────────────
# Load and Clean "dish_description"
# ─────────────────────────────────────────────────────────────

# --- 1. Load Data ---
import ast  # only needed here, to parse the serialized token sets

file_path = DATA_CLEANED /'user_orders_clean_with_description.csv'
if not file_path.exists():
    raise FileNotFoundError(f"File not found at {file_path}")
df = pd.read_csv(file_path)
print(f"Successfully loaded '{file_path}'. Shape: {df.shape}")


def _parse_token_set(raw) -> set:
    """Turn a CSV cell holding the text form of a set back into a set.

    Missing or blank cells give an empty set. Raises ValueError when the cell
    does not hold a literal collection.
    """
    if not isinstance(raw, str) or not raw.strip():
        return set()
    if raw.strip() == "set()":
        return set()  # text form of an empty set
    parsed = ast.literal_eval(raw)
    if isinstance(parsed, (set, frozenset, list, tuple, dict)):
        return set(parsed)  # an empty "{}" literal is a dict and becomes set()
    raise ValueError(f"Unexpected restaurant_tokens value: {raw!r}")


# --- 2. Check for Required Columns ---
if "dish_description" not in df.columns:
    raise KeyError("Column 'dish_description' not found in the CSV.")
if "restaurant_tokens" not in df.columns:
    print("Warning: 'restaurant_tokens' column not found. Will create it.")
    df['restaurant_tokens'] = df['restaurant_name'].fillna("").progress_apply(build_restaurant_blocklist)
else:
    # The 'restaurant_tokens' column comes back from the CSV as text, so each
    # cell is converted to a real set object for the cleaner to use.
    print("Converting 'restaurant_tokens' column from string to set...")
    df['restaurant_tokens'] = df['restaurant_tokens'].progress_apply(_parse_token_set)

# --- 3. Apply Cleaning to "dish_description" ---
print("Cleaning 'dish_description' column...")
df['dish_description'] = df['dish_description'].fillna("")

df['dish_description_clean'] = df.progress_apply(
    lambda row: clean_text_column(row['dish_description'], row['restaurant_tokens']),
    axis=1
)

# --- 4. Show Results ---
print("\n--- 'dish_description' Cleaning Examples ---")
# Only rows that actually have a description show the effect of the cleaning
described = df[df['dish_description'] != ""]
sample_df = described[[
    'dish_description',
    'dish_description_clean'
]].sample(n=min(10, len(described)), random_state=42)

display(sample_df)

# Show the full DataFrame head to see all columns
print("\n--- DataFrame Head with New Clean Column ---")
display(df.head())

  from .autonotebook import tqdm as notebook_tqdm


Successfully loaded '/Users/sarveshaks/Documents/Documents - Sarvesha’s MacBook Pro (2)/Project Stea/project_x/data_cleaned/user_orders_clean_with_description.csv'. Shape: (382, 20)
Converting 'restaurant_tokens' column from string to set...


Cleaning progress: 100%|██████████| 382/382 [00:00<00:00, 125076.04it/s]


Cleaning 'dish_description' column...


Cleaning progress: 100%|██████████| 382/382 [00:00<00:00, 6800.29it/s]


--- 'dish_description' Cleaning Examples ---





Unnamed: 0,dish_description,dish_description_clean
280,Lamb over Rice is a popular dish in Middle Eas...,lamb over rice is popular dish middle eastern ...
248,The Spicy Ghost Chicken Curry is likely a vibr...,the spicy ghost chicken is likely vibrant dish...
218,The Chipotle Cheddar Chalupa with black beans ...,the chipotle cheddar chalupa with black beans ...
327,The 'Create Your Own Mine' dessert likely feat...,the create your own mine likely features cream...
56,Sri Lankan Iced Coffee is likely a refreshing ...,sri lankan iced coffee is likely refreshing be...
370,These hotcakes are likely made from a sweet ba...,these hotcakes are likely made from sweet batt...
33,The chocolate chip cookie is a classic America...,the chocolate chip cookie is classic american ...
114,"The Large FUZE® Iced Tea, particularly the ras...",fuze iced tea particularly raspberry peach fla...
373,Chicken Alfredo is a creamy pasta dish likely ...,chicken alfredo is creamy pasta dish likely fe...
237,The ketchup packet commonly contains a sweet a...,the ketchup packet commonly contains sweet and...



--- DataFrame Head with New Clean Column ---


Unnamed: 0,city_name,restaurant_name,request_time_local,final_delivery_time_local,order_status,dish_name,item_quantity,customizations,customization_cost_local,special_instructions,...,order_price,currency,restaurant_tokens,dish_name_clean,customization_clean,dish_name_to_be_processed,dish_description,sugar_level,sodium_level,dish_description_clean
0,Portland,Gyro Kingdom (NE Davis),2024-12-01T02:37:08.000Z,2024-12-01T02:56:34.000Z,completed,Chicken shawarma gyro,1.0,Spicy Sauce,0.0,,...,70.8,USD,"{kingdom, davis, gyro}",chicken shawarma,spicy sauce,chicken shawarma spicy sauce,The Chicken Shawarma Gyro is a popular Middle ...,,,the chicken shawarma is popular middle eastern...
1,Portland,Gyro Kingdom (NE Davis),2024-12-01T02:37:08.000Z,2024-12-01T02:56:34.000Z,completed,Crispy Fries,1.0,,,,...,70.8,USD,"{kingdom, davis, gyro}",crispy fries,,crispy fries,"Crispy fries, commonly found in American cuisi...",,,crispy fries commonly found american cuisine a...
2,Portland,Gyro Kingdom (NE Davis),2024-12-01T02:37:08.000Z,2024-12-01T02:56:34.000Z,completed,Chicken Shawarma Plate,1.0,,,,...,70.8,USD,"{kingdom, davis, gyro}",chicken shawarma plate,,chicken shawarma plate,The Chicken Shawarma Plate likely features mar...,,,the chicken shawarma plate likely features mar...
3,Portland,Gyro Kingdom (NE Davis),2024-12-01T02:37:08.000Z,2024-12-01T02:56:34.000Z,completed,Falafel Over Rice Plate,2.0,,,,...,70.8,USD,"{kingdom, davis, gyro}",falafel over rice plate,,falafel over rice plate,The Falafel Over Rice Plate likely features cr...,,,the falafel over rice plate likely features cr...
4,Portland,Jack in the Box (4242 Se 82Nd Ave),2024-11-28T01:09:36.000Z,2024-11-28T01:18:20.000Z,completed,Jack's Spicy Chicken® w/ Cheese,1.0,Extra Swiss Cheese,0.75,,...,25.05,USD,"{box, the, jack, ave}",spicy chicken with cheese,cheese,spicy chicken with cheese cheese,Jack's Spicy Chicken® w/ Cheese likely feature...,,,spicy chicken with cheese likely features cris...


In [15]:

# Final export, including the new `dish_description_clean` column.
# index=False matches the other CSV writes in this notebook and keeps the row
# index from turning into an extra unnamed column when the file is read back.
OUTPUT_CSV2 = DATA_CLEANED/'final.csv'
df.to_csv(OUTPUT_CSV2, index=False)