# 02a_llm_descriptions_and_xd.ipynb

LLM-powered pipeline to:
1. Fill `dish_description` using **OpenAI API** from `dish_name`, `dish_name_to_be_processed`, `restaurant_name`, and `city_name`.
2. Produce **structured flavor scores** (taste/aroma/texture) per row.
3. Save updated CSV + XD vectors + a YAML lexicon derived from dimensions used.

**Inputs:** `project_x/data_cleaned/user_orders_clean.csv`

**Outputs:**
- Updated `user_orders_clean.csv` (overwritten in place) with `dish_description` and one score column per flavor dimension
- `xd_vectors_from_llm.csv` (float scores in [0, 1] for each taste/aroma/texture dimension)
- `lexicon_llm.yaml` (dimensions and explanation)
- `llm_run_cache.jsonl` (idempotent cache)

> Set environment variable `OPENAI_API_KEY` before running.

In [1]:
from pathlib import Path
import os, json, time
from typing import Dict, Any
import pandas as pd
import numpy as np
from tqdm import tqdm
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
from openai import OpenAI
import yaml

# --- Project root -----------------------------------------------------------
# Start from the working directory, or its parent when the notebook is run from
# a `notebooks/` folder. If that directory has no `data_cleaned/`, probe that
# directory and up to four of its ancestors and take the first one that does.
_cwd = Path.cwd()
ROOT = _cwd.parent if _cwd.name == 'notebooks' else _cwd
if not (ROOT / 'data_cleaned').exists():
    _candidates = [ROOT, *ROOT.parents][:5]
    ROOT = next((p for p in _candidates if (p / 'data_cleaned').exists()), ROOT)

# --- Input / output locations (all under data_cleaned/) ----------------------
_data_dir = ROOT / 'data_cleaned'
INPUT_CSV = _data_dir / 'user_orders_clean.csv'
XD_OUT = _data_dir / 'xd_vectors_from_llm.csv'
LEXICON_YAML = _data_dir / 'lexicon_llm.yaml'
CACHE_PATH = _data_dir / 'llm_run_cache.jsonl'

# Stop early when the input table is not where we expect it.
assert INPUT_CSV.exists(), f"Missing input CSV at {INPUT_CSV}"
print('ROOT:', ROOT)
print('CSV :', INPUT_CSV)

# --- LLM settings -------------------------------------------------------------
# The API key is read from the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
MODEL = 'gpt-4o-mini'
TEMPERATURE = 0.2
BATCH_SIZE = 10  # gentle batching


ROOT: /Users/sarveshaks/Documents/Documents - Sarvesha’s MacBook Pro (2)/Project Stea/project_x
CSV : /Users/sarveshaks/Documents/Documents - Sarvesha’s MacBook Pro (2)/Project Stea/project_x/data_cleaned/user_orders_clean.csv


In [2]:
# Load the cleaned orders table and verify every field used to build prompts
# and cache keys is present.
df = pd.read_csv(INPUT_CSV)
needed = ['dish_name','dish_name_to_be_processed','restaurant_name','city_name']
for c in needed:
    assert c in df.columns, f"Missing column: {c}"

# Initialize column if not present
if 'dish_description' not in df.columns:
    df['dish_description'] = np.nan
# A column created from np.nan (or read from a CSV where it is entirely empty)
# is float64. Cast it to object so string descriptions can be assigned later
# without pandas' incompatible-dtype warning/error.
df['dish_description'] = df['dish_description'].astype(object)

# Load cache: one JSON object per line, keyed by its 'cache_key' field.
cache = {}
skipped_lines = 0
if CACHE_PATH.exists():
    # The cache is written with ensure_ascii=False, so decode it explicitly as
    # UTF-8. errors='replace' turns undecodable bytes into cache misses rather
    # than aborting the load.
    with open(CACHE_PATH, 'r', encoding='utf-8', errors='replace') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
                cache[obj['cache_key']] = obj
            except (json.JSONDecodeError, KeyError, TypeError):
                # e.g. a line truncated by an interrupted run; skip it but keep count
                skipped_lines += 1
print('Loaded cache items:', len(cache))
if skipped_lines:
    print(f'Skipped {skipped_lines} malformed cache line(s) in {CACHE_PATH}')


Loaded cache items: 0


In [3]:
# Preview the first rows to sanity-check the loaded columns before any LLM calls.
df.head()

Unnamed: 0,city_name,restaurant_name,request_time_local,final_delivery_time_local,order_status,dish_name,item_quantity,customizations,customization_cost_local,special_instructions,item_price,order_price,currency,restaurant_tokens,dish_name_clean,customization_clean,dish_name_to_be_processed,dish_description,sugar_level,sodium_level
0,Portland,Gyro Kingdom (NE Davis),2024-12-01T02:37:08.000Z,2024-12-01T02:56:34.000Z,completed,Chicken shawarma gyro,1.0,Spicy Sauce,0.0,,13.95,70.8,USD,"{'davis', 'kingdom', 'gyro'}",chicken shawarma,spicy sauce,chicken shawarma spicy sauce,,,
1,Portland,Gyro Kingdom (NE Davis),2024-12-01T02:37:08.000Z,2024-12-01T02:56:34.000Z,completed,Crispy Fries,1.0,,,,9.95,70.8,USD,"{'davis', 'kingdom', 'gyro'}",crispy fries,,crispy fries,,,
2,Portland,Gyro Kingdom (NE Davis),2024-12-01T02:37:08.000Z,2024-12-01T02:56:34.000Z,completed,Chicken Shawarma Plate,1.0,,,,17.95,70.8,USD,"{'davis', 'kingdom', 'gyro'}",chicken shawarma plate,,chicken shawarma plate,,,
3,Portland,Gyro Kingdom (NE Davis),2024-12-01T02:37:08.000Z,2024-12-01T02:56:34.000Z,completed,Falafel Over Rice Plate,2.0,,,,35.9,70.8,USD,"{'davis', 'kingdom', 'gyro'}",falafel over rice plate,,falafel over rice plate,,,
4,Portland,Jack in the Box (4242 Se 82Nd Ave),2024-11-28T01:09:36.000Z,2024-11-28T01:18:20.000Z,completed,Jack's Spicy Chicken® w/ Cheese,1.0,Extra Swiss Cheese,0.75,,10.61,25.05,USD,"{'box', 'the', 'ave', 'jack'}",spicy chicken with cheese,cheese,spicy chicken with cheese cheese,,,


In [None]:
def make_cache_key(row: pd.Series) -> str:
    """Build the cache key for one order row.

    The key is a JSON string holding the ``str`` form of the row's dish name,
    processed dish name, restaurant name and city name, with keys emitted in
    sorted order so equal field values always produce the same string.

    Args:
        row: A row containing 'dish_name', 'dish_name_to_be_processed',
            'restaurant_name' and 'city_name'.

    Returns:
        The JSON-encoded key.
    """
    key_fields = ('dish_name', 'dish_name_to_be_processed', 'restaurant_name', 'city_name')
    identity = {field: str(row[field]) for field in key_fields}
    return json.dumps(identity, sort_keys=True)

# System message passed to call_llm for every row. It asks the model for a
# single JSON object with a free-text 'description' plus three score groups
# ('taste', 'aroma', 'texture'), each value in [0,1].
# The dimension names listed here must stay in sync with the key lists in
# extract_scores, which reads exactly these names from the reply.
SYSTEM_PROMPT = (
    "You are a culinary analyst. Given fields about a menu item, produce a JSON object with: "
    "(1) 'description' — 1-2 sentences that summarize likely ingredients and preparation, and mention taste, aroma, texture; "
    "(2) 'taste' — scores in [0,1] for: sweet, spicy, sour, salty, umami, bitter; "
    "(3) 'aroma' — scores in [0,1] for: garlic, buttery, smoky, citrus, sweet_aroma, spiced; "
    "(4) 'texture' — scores in [0,1] for: crispy, creamy, soft, chewy, fried; "
    "Rules: Be concise and cautious; when uncertain, use 'likely' or 'commonly'. Do not invent brand-specific claims. "
    "Stay general and cuisine-aware. Return ONLY valid JSON, no markdown."
)

def build_user_prompt(row: pd.Series) -> str:
    """Compose the user message for one order row.

    The message is a fixed instruction line followed by a JSON object holding
    the ``str`` form of the row's dish name, processed dish name, restaurant
    name and city name. Non-ASCII characters are kept as-is in the JSON.

    Args:
        row: A row containing 'dish_name', 'dish_name_to_be_processed',
            'restaurant_name' and 'city_name'.

    Returns:
        The prompt text.
    """
    prompt_fields = ('dish_name', 'dish_name_to_be_processed', 'restaurant_name', 'city_name')
    payload = json.dumps({name: str(row[name]) for name in prompt_fields}, ensure_ascii=False)
    instruction = "Use the following fields to infer a realistic culinary description and flavor scores.\n"
    return f"{instruction}{payload}"

def _parse_llm_json(text):
    """Parse the model reply into a dict, tolerating a Markdown code fence around the JSON.

    Raises:
        ValueError: if the reply is empty / not a string, is not valid JSON
            (json.JSONDecodeError is a ValueError), or is not a JSON object.
    """
    if not isinstance(text, str) or not text.strip():
        raise ValueError("LLM returned an empty response")
    cleaned = text.strip()
    if cleaned.startswith("```"):
        # Drop the opening fence line (``` or ```json) and the closing fence, if any.
        first_newline = cleaned.find("\n")
        cleaned = cleaned[first_newline + 1:] if first_newline != -1 else cleaned[3:]
        cleaned = cleaned.rstrip()
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3]
    parsed = json.loads(cleaned)
    if not isinstance(parsed, dict):
        raise ValueError(f"LLM returned JSON of type {type(parsed).__name__}, expected an object")
    return parsed


@retry(reraise=True, stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=1, max=30))
def call_llm(system_prompt: str, user_prompt: str) -> Dict[str, Any]:
    """Send one system + user message pair to the chat model and return the parsed JSON object.

    The call uses the module-level ``client``, ``MODEL`` and ``TEMPERATURE``.
    The decorator makes up to 5 attempts with exponential backoff between them
    and re-raises the last error. An API error or an unusable reply (empty,
    invalid JSON, or JSON that is not an object) counts as a failed attempt.

    Args:
        system_prompt: Content of the system message.
        user_prompt: Content of the user message.

    Returns:
        The reply decoded into a dict.

    Raises:
        ValueError: if the final attempt's reply cannot be parsed into a JSON object.
        Exception: whatever the client raised on the final attempt.
    """
    resp = client.chat.completions.create(
        model=MODEL,
        temperature=TEMPERATURE,
        messages=[
            {"role":"system","content":system_prompt},
            {"role":"user","content":user_prompt}
        ]
    )
    # message.content can be None or wrapped in a code fence despite the prompt;
    # parse defensively so callers always receive a dict.
    return _parse_llm_json(resp.choices[0].message.content)

def clamp01(x):
    """Convert `x` to a float and clamp it into the closed interval [0.0, 1.0].

    Args:
        x: Any value accepted by ``float()`` (number or numeric string).

    Returns:
        The clamped float, or 0.0 when `x` cannot be converted or is NaN.
    """
    try:
        v = float(x)
    except (TypeError, ValueError, OverflowError):
        return 0.0
    # NaN compares false against everything, so min(1.0, nan) would yield 1.0;
    # treat it like any other unusable value instead.
    if v != v:
        return 0.0
    return max(0.0, min(1.0, v))

def extract_scores(obj: Dict[str, Any]):
    """Pull the taste, aroma and texture scores out of a parsed LLM reply.

    Every expected dimension is always present in the output. A dimension that
    is missing from the reply is passed to ``clamp01`` as 0, and every value is
    passed through ``clamp01``. A section that is missing or is not a dict
    (e.g. ``null`` or a list) is treated as empty rather than raising.

    Args:
        obj: Parsed reply, expected to hold 'taste', 'aroma' and 'texture' dicts.

    Returns:
        A ``(taste, aroma, texture)`` tuple of dicts mapping dimension name to score.
    """
    taste_keys = ['sweet','spicy','sour','salty','umami','bitter']
    aroma_keys = ['garlic','buttery','smoky','citrus','sweet_aroma','spiced']
    texture_keys = ['crispy','creamy','soft','chewy','fried']

    def _section(name):
        # The model may emit null / a list / a string for a section; fall back to {}.
        section = obj.get(name) if isinstance(obj, dict) else None
        return section if isinstance(section, dict) else {}

    taste_raw, aroma_raw, texture_raw = _section('taste'), _section('aroma'), _section('texture')
    taste = {k: clamp01(taste_raw.get(k, 0)) for k in taste_keys}
    aroma = {k: clamp01(aroma_raw.get(k, 0)) for k in aroma_keys}
    texture = {k: clamp01(texture_raw.get(k, 0)) for k in texture_keys}
    return taste, aroma, texture


In [None]:
# Enrich every row with an LLM description and flavor scores.
# Results are appended to the JSONL cache as they arrive, so an interrupted run
# can resume without repeating successful calls. Failed calls are NOT cached,
# so they are retried on the next run.

def _save_csv_atomically(frame: pd.DataFrame, target: Path) -> None:
    """Write `frame` to a sibling temp file, then swap it over `target`.

    The input CSV is overwritten in place, so a plain ``to_csv`` interrupted
    mid-write would leave the only copy of the table truncated.
    """
    tmp_path = target.with_name(target.name + '.tmp')
    frame.to_csv(tmp_path, index=False)
    tmp_path.replace(target)


# Placeholder used for a row whose LLM call failed after all retries.
FALLBACK_RESULT = {"description":"","taste":{},"aroma":{},"texture":{}}
CHECKPOINT_EVERY_N_BATCHES = 5

# Descriptions are strings; make sure the column can hold them even when it is
# currently an all-NaN float column.
df['dish_description'] = df['dish_description'].astype(object)

rows = df.index.tolist()
processed = 0
failures = []        # (row index, error repr) for rows that fell back to the placeholder
failed_keys = set()  # keys that already failed in this run; avoids re-calling for duplicate rows

with open(CACHE_PATH, 'a', encoding='utf-8') as cache_f:
    for i in tqdm(range(0, len(rows), BATCH_SIZE)):
        batch_idx = rows[i:i+BATCH_SIZE]
        for idx in batch_idx:
            row = df.loc[idx]
            cache_key = make_cache_key(row)
            cached = cache.get(cache_key)
            result = cached.get('result') if isinstance(cached, dict) else None

            # Malformed entries and failure placeholders cached by earlier runs
            # count as misses so those rows get another attempt.
            if not isinstance(result, dict) or result == FALLBACK_RESULT:
                if cache_key in failed_keys:
                    failures.append((idx, 'same request already failed in this run'))
                    result = dict(FALLBACK_RESULT)
                else:
                    user_prompt = build_user_prompt(row)
                    try:
                        result = call_llm(SYSTEM_PROMPT, user_prompt)
                        if not isinstance(result, dict):
                            raise TypeError(f"expected a JSON object, got {type(result).__name__}")
                    except Exception as e:
                        # fall back to empty structure if the call keeps failing,
                        # but do not persist it: a cached failure would never be retried
                        failures.append((idx, repr(e)))
                        failed_keys.add(cache_key)
                        result = dict(FALLBACK_RESULT)
                    else:
                        cache_line = {"cache_key": cache_key, "result": result}
                        cache_f.write(json.dumps(cache_line, ensure_ascii=False) + "\n")
                        cache_f.flush()  # keep paid-for results if the kernel dies
                        cache[cache_key] = cache_line

            # Update dataframe
            description = result.get('description', '')
            if isinstance(description, str) and len(description) > 0:
                df.at[idx, 'dish_description'] = description
            taste, aroma, texture = extract_scores(result)
            for k, v in {**taste, **aroma, **texture}.items():
                df.at[idx, k] = v

        # periodic checkpoint
        if (i // BATCH_SIZE) % CHECKPOINT_EVERY_N_BATCHES == 0:
            _save_csv_atomically(df, INPUT_CSV)

        processed += len(batch_idx)

print('Processed rows:', processed)
if failures:
    print(f'LLM call failed for {len(failures)} row(s); they received empty scores and will be retried on the next run.')
    for failed_idx, err in failures[:5]:
        print(f'  row {failed_idx}: {err}')
_save_csv_atomically(df, INPUT_CSV)
print('Updated CSV with descriptions and scores written to:', INPUT_CSV)


In [None]:
# Assemble the XD table: the processed dish name and description plus one
# float32 column per flavor dimension, with missing scores written as 0.0.
taste_cols = ['sweet','spicy','sour','salty','umami','bitter']
aroma_cols = ['garlic','buttery','smoky','citrus','sweet_aroma','spiced']
texture_cols = ['crispy','creamy','soft','chewy','fried']
xd_cols = [*taste_cols, *aroma_cols, *texture_cols]

_id_cols = ['dish_name_to_be_processed','dish_description']
xd_df = df.loc[:, _id_cols + xd_cols].copy()
_scores_f32 = xd_df[xd_cols].astype('float32')
xd_df[xd_cols] = _scores_f32.fillna(0.0)
xd_df.to_csv(XD_OUT, index=False)
print('XD vectors saved to:', XD_OUT)


In [None]:
# Write a small YAML lexicon listing the score dimensions in each group
# (taste / aroma / texture) together with a note on the score range, for
# downstream use.
_score_note = '0..1 scores estimated per item'
_dimensions_by_group = {
    'taste': ['sweet','spicy','sour','salty','umami','bitter'],
    'aroma': ['garlic','buttery','smoky','citrus','sweet_aroma','spiced'],
    'texture': ['crispy','creamy','soft','chewy','fried'],
}
lexicon = {
    group: {'dimensions': dims, 'note': _score_note}
    for group, dims in _dimensions_by_group.items()
}
# sort_keys=False keeps the taste / aroma / texture order in the file.
with open(LEXICON_YAML, 'w') as lexicon_file:
    yaml.safe_dump(lexicon, lexicon_file, sort_keys=False)
print('Lexicon YAML saved to:', LEXICON_YAML)
