# Meal Planner Recipe Preprocessing

This notebook cleans and enriches the raw recipe dataset so it can be reused across projects. It performs the following steps:

- load the raw CSV hosted on Hugging Face (Edamam-based recipe dataset)
- extract convenient nutrient totals (fat, carbs, protein)
- normalize label and ingredient fields for easier downstream filtering
- drop obviously invalid rows and save a processed CSV ready for analysis or app ingestion

Update the configuration section below if your raw file lives elsewhere or you want a different output path.


In [7]:
# Preprocessing config and imports
import re
import json
import pandas as pd

# Input/Output paths
INPUT_JSON = "full_format_recipes.json"
OUTPUT_JSONL = "recipes_for_embeddings.jsonl"
OUTPUT_CSV = "recipes_processed_for_embeddings.csv"
MAX_EMBED_TEXT_CHARS = 2000

print('Preprocessing cell added: config loaded')

Preprocessing cell added: config loaded


In [8]:
# Helper functions: normalization, canonical key, synthesis
def _normalize_text(s: str) -> str:
    if s is None:
        return ""
    s = str(s)
    s = s.strip().lower()
    s = re.sub(r"[^a-z0-9\s\-]", " ", s)
    s = re.sub(r"\s+", " ", s)
    return s


def normalize_ingredient_token(tok: str) -> str:
    tok = str(tok)
    tok = tok.strip().lower()
    tok = re.sub(r"\([^)]*\)", "", tok)  # remove parentheses
    tok = re.sub(r"\d+\/\d+|\d+\.\d+|\d+", "", tok)  # remove numbers/fractions
    tok = re.sub(r"\b(cups?|cup|tablespoons?|tbsp|teaspoons?|tsp|grams?|g|kg|ounces?|oz|pounds?|lb|pinch|slice|slices)\b", "", tok)
    tok = re.sub(r"[^a-z0-9\s\-]", "", tok)
    tok = re.sub(r"\s+", " ", tok)
    return tok.strip()


def normalize_ingredients(val) -> list:
    if val is None or (isinstance(val, float) and pd.isna(val)):
        return []
    if isinstance(val, list):
        tokens = [normalize_ingredient_token(x) for x in val if x and str(x).strip()]
    else:
        # conservative split on comma when a list isn't available
        tokens = [normalize_ingredient_token(x) for x in str(val).split(',') if x and x.strip()]
    tokens = [t for t in tokens if t]
    return tokens


def canonical_key(title, ingredients_list):
    norm_title = _normalize_text(title)
    norm_ing_sorted = sorted(set([_normalize_text(x) for x in ingredients_list if x]))
    return norm_title + ' || ' + ' | '.join(norm_ing_sorted)


def synthesize_title_from_ingredients(ings):
    if not ings:
        return ""
    # use first meaningful ingredient tokens as fallback title
    return (ings[0].title()) if isinstance(ings[0], str) else ''

In [None]:
# Run preprocessing: apply normalization, handle missing, dedupe, save outputs

with open(INPUT_JSON, "r") as f:
    epicurious_raw = json.load(f)
    
df = pd.DataFrame(epicurious_raw)
before_count = len(df)

# Ensure ingredients column is normalized to lists
if 'ingredients' not in df.columns:
    df['ingredients'] = [[] for _ in range(len(df))]
else:
    df['ingredients'] = df['ingredients'].apply(normalize_ingredients)

# Normalize titles and synthesize when missing
if 'title' not in df.columns:
    df['title'] = ['' for _ in range(len(df))]

# Fill NaNs
df['title'] = df['title'].fillna('')

# Synthesize title when missing but ingredients present
missing_title_mask = df['title'].str.strip() == ''
has_ings_mask = df['ingredients'].map(len) > 0
synth_count = missing_title_mask & has_ings_mask
if synth_count.any():
    df.loc[synth_count, 'title'] = df.loc[synth_count, 'ingredients'].apply(synthesize_title_from_ingredients)

# Drop rows missing both title and ingredients
keep_mask = ~((df['title'].str.strip() == '') & (df['ingredients'].map(len) == 0))
df = df[keep_mask].copy()

# Add normalized fields for deduplication
df['norm_title'] = df['title'].apply(_normalize_text)
df['norm_ingredients'] = df['ingredients'].apply(lambda lst: sorted(set([_normalize_text(x) for x in lst if x])))

df['dedupe_key'] = df.apply(lambda r: r['norm_title'] + ' || ' + ' | '.join(r['norm_ingredients']), axis=1)

before_dedupe = len(df)
# prefer rows with more ingredients and longer titles
df['_ing_count'] = df['norm_ingredients'].apply(len)
df['_title_len'] = df['title'].apply(lambda s: len(str(s)))

df = df.sort_values(['dedupe_key', '_ing_count', '_title_len'], ascending=[True, False, False])
df = df.drop_duplicates('dedupe_key', keep='first')
after_dedupe = len(df)

# Build text_for_embedding
def build_embedding_text(row):
    title = str(row['title']).strip()
    ings = ', '.join(row['norm_ingredients'])
    text = f"{title}. Ingredients: {ings}" if ings else title
    if len(text) > MAX_EMBED_TEXT_CHARS:
        return text[:MAX_EMBED_TEXT_CHARS]
    return text

df['text_for_embedding'] = df.apply(build_embedding_text, axis=1)

# Final filter: ensure non-empty embedding text
df = df[df['text_for_embedding'].str.strip() != '']

# Save outputs
df.to_csv(OUTPUT_CSV, index=False)

with open(OUTPUT_JSONL, 'w', encoding='utf-8') as f:
    for _, r in df[['title', 'text_for_embedding']].iterrows():
        json.dump({'title': r['title'], 'text_for_embedding': r['text_for_embedding']}, f, ensure_ascii=False)
        f.write('\n')

print(f"Rows before: {before_count}; after drop: {before_dedupe}; after dedupe: {after_dedupe}; final saved: {len(df)}")

# Quick sample check
print('Sample entries:')
print(df[['title','text_for_embedding']].head(5).to_dict(orient='records'))

Rows before: 20130; after drop: 20111; after dedupe: 18222; final saved: 18222
Sample entries:
[{'title': '"Adult" Pimiento Cheese ', 'text_for_embedding': '"Adult" Pimiento Cheese. Ingredients: a - jar diced pimientos, coarsely grated sharp cheddar, crackers, crudits, or large garlic cloves, to mayonnaise, toasted baguette'}, {'title': '"Blanketed" Eggplant ', 'text_for_embedding': '"Blanketed" Eggplant. Ingredients: drained capers, dried oregano, extra-virgin olive oil, fresh basil leaves, large fresh mint leaves, large garlic cloves slivered flattened, medium onion chopped, olive oil, small japanese eggplants peeled, tomatoes'}, {'title': '"Bloody Mary" Tomato Toast with Celery and Horseradish ', 'text_for_embedding': '"Bloody Mary" Tomato Toast with Celery and Horseradish. Ingredients: cayenne pepper, celery stalks thinly sliced, coarsely chopped celery leaves, extra-virgin olive oil, finely grated fresh horseradish divided, freshly ground black pepper, grape tomatoes halved, koshe