In [153]:
import pandas as pd
import ast
import re
from rapidfuzz import process as rapidfuzz_process  # Faster fuzzy matching
import swifter  # Parallelized .apply
from functools import lru_cache

# --- Step 0: Load data ---
df = pd.read_csv("datasets/foodstruct_nutritional_facts.csv", encoding="utf-8", encoding_errors="replace") 
df_recipe = pd.read_csv("datasets/recipes_with_images.csv", encoding="utf-8", encoding_errors="replace") 

# --- Step 1: Parse ingredient lists safely ---
def parse_ingredient_list(ingredient_str):
    if isinstance(ingredient_str, str):
        try:
            return ast.literal_eval(ingredient_str)
        except Exception as e:
            print(f"Failed to parse ingredient list: {ingredient_str} — {e}")
            return []
    return []

df_recipe['Cleaned_Ingredients'] = df_recipe['Cleaned_Ingredients'].apply(parse_ingredient_list)


# --- Step 2: Clean ingredient strings ---
common_patterns = [
    r"\b(?:peeled|chopped|smashed|halved|drained|optional|divided|reserved|seeded|grated|discarded|cut(?: into)?(?:.*)?|coarsely|finely|room temperature|thinly(?:.*)?|loosely packed|plus extra(?:.*)?)\b",
    r"\([^)]*\)",  # remove anything in parentheses
    r"\d+[\/\d]*",  # remove digits and fractions
    r"\b(?:teaspoons?|tablespoons?|cups?|pounds?|ounces?|cloves?|slices?|cans?|sticks?|pieces?|bags?|packages?)\b",
    r"\b(?:fresh|small|large|medium|extra-virgin|unsalted|dried|sweet|hot|cold|frozen|bottled|baby|thin|coarse|fine|creamy|whole|firm|boneless|skinless|crusty|chilled|shredded|meat|with|from|of|and|in|to|for|not|very|cut|or|into|on|at|the|a|an)\b",
]

def clean_ingredient(ingredient):
    ingredient = ingredient.lower()
    for pattern in common_patterns:
        ingredient = re.sub(pattern, '', ingredient)
    ingredient = re.sub(r'[^a-z\s]', '', ingredient)  # Remove punctuation/numbers
    ingredient = ' '.join(ingredient.split())         # Normalize whitespace
    return ingredient.strip()

# --- Step 3: Prepare fuzzy matching ---
nutrient_foods = df['Food Name'].str.lower().tolist()
skip_ingredients = {'salt', 'pepper', 'water', 'oil'}

@lru_cache(maxsize=None)
def fuzzy_match(ingredient):
    if ingredient in skip_ingredients or len(ingredient) < 4:
        return None
    match = rapidfuzz_process.extractOne(ingredient, nutrient_foods)
    if match:
        name, score, _ = match
        return name if score > 85 else None
    return None

unique_ingredients = {
    clean_ingredient(raw)
    for ingredient_list in df_recipe['Cleaned_Ingredients']
    for raw in ingredient_list
    if clean_ingredient(raw) not in skip_ingredients
}

ingredient_to_food = {ing: fuzzy_match(ing) for ing in unique_ingredients}

# --- Step 4: Calculate nutrients ---
nutrient_columns = df.columns.difference(['Food Name', 'Category Name'])
def average_duplicate_foods(df):
    df['food_name_lower'] = df['Food Name'].str.lower()
    grouped = df.groupby('food_name_lower', as_index=False)

    # Average nutrient values across duplicates, keep first for non-nutrient fields
    df_avg = grouped[nutrient_columns].mean()
    df_meta = grouped[['Food Name', 'Category Name']].first()

    # Merge averaged nutrients with metadata
    df_cleaned = pd.merge(df_meta, df_avg, on='food_name_lower')
    return df_cleaned.set_index('food_name_lower')

# Create cleaned and indexed food dataframe
df_indexed = average_duplicate_foods(df)

# Optional: print how many duplicates were averaged
num_duplicates = df['food_name_lower'].duplicated().sum()
if num_duplicates > 0:
    print(f"✅ Averaged {num_duplicates} duplicate food entries.")

unique_nutrient_errors = set()

def calculate_nutrients(ingredient_list):
    if not isinstance(ingredient_list, list):
        return pd.Series(0.0, index=nutrient_columns)

    total = pd.Series(0.0, index=nutrient_columns)

    for raw in ingredient_list:
        if not isinstance(raw, str):
            continue
        cleaned = clean_ingredient(raw)
        food_name = ingredient_to_food.get(cleaned)
        if isinstance(food_name, str) and food_name in df_indexed.index:
            match_row = df_indexed.loc[food_name]
            if isinstance(match_row, pd.DataFrame):
                match_row = match_row.iloc[0]  # Take the first row if duplicate
            try:
                total += match_row[nutrient_columns]
            except Exception as e:
                error_msg = f"Error adding nutrients for '{food_name}': {e}"
                unique_nutrient_errors.add(error_msg)
                continue

    return total

# --- Step 5: Calculate nutrients per recipe ---
df_nutrients = df_recipe['Cleaned_Ingredients'].apply(calculate_nutrients)

# --- Step 6: Print any errors ---
if unique_nutrient_errors:
    print("\nUnique errors during nutrient addition:")
    for err in unique_nutrient_errors:
        print(err)


# --- Step 7: Combine and save ---
df_recipe_total = pd.concat([df_recipe, df_nutrients], axis=1)

# --- Step 8: Split into Macro and Micro Nutrients ---
macro_cols = [
    'Carbs', 'Fats', 'Fiber', 'Net carbs', 'Protein',
    'Saturated Fat', 'Monounsaturated Fat', 'Polyunsaturated fat',
    'Trans Fat', 'Sugar', 'Starch'
]

micro_cols = [col for col in df_nutrients.columns if col not in macro_cols and col != 'Calories']

# Create new columns as arrays of (name, value) tuples
df_recipe_total['Macro_Nutrients'] = df_recipe_total.apply(
    lambda row: [(col, row[col]) for col in macro_cols if col in row], axis=1
)

df_recipe_total['Micro_Nutrients'] = df_recipe_total.apply(
    lambda row: [(col, row[col]) for col in micro_cols if col in row], axis=1
)

# Drop individual nutrient columns (except Calories, if you want to keep it)
cols_to_keep = [
    'Title', 'Ingredients', 'Instructions', 'Image_Name', 'Cleaned_Ingredients',
    'Calories', 'Macro_Nutrients', 'Micro_Nutrients'
]
df_recipe_total = df_recipe_total[cols_to_keep]

df_recipe_total.to_csv("datasets/recipes_with_images_and_nutrients.csv", index=False)


✅ Averaged 2 duplicate food entries.


In [154]:
df_recipe_total

Unnamed: 0,Title,Ingredients,Instructions,Image_Name,Cleaned_Ingredients,Calories,Macro_Nutrients,Micro_Nutrients
0,Miso-Butter Roast Chicken With Acorn Squash Pa...,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher...","Pat chicken dry with paper towels, season all ...",miso-butter-roast-chicken-acorn-squash-panzanella,"[1 (3½–4-lb.) whole chicken, 2¾ tsp. kosher sa...",4837.0,"[(Carbs, 409.95000000000005), (Fats, 326.28000...","[(Calcium, 2.071999999999999), (Cholesterol, 0..."
1,Crispy Salt and Pepper Potatoes,"['2 large egg whites', '1 pound new potatoes (...",Preheat oven to 400°F and line a rimmed baking...,crispy-salt-and-pepper-potatoes-dan-kluger,"[2 large egg whites, 1 pound new potatoes (abo...",664.0,"[(Carbs, 137.03000000000003), (Fats, 11.989999...","[(Calcium, 1.3250000000000002), (Cholesterol, ..."
2,Thanksgiving Mac and Cheese,"['1 cup evaporated milk', '1 cup whole milk', ...",Place a rack in middle of oven; preheat to 400...,thanksgiving-mac-and-cheese-erick-williams,"[1 cup evaporated milk, 1 cup whole milk, 1 ts...",1693.0,"[(Carbs, 283.1), (Fats, 52.97), (Fiber, 84.0),...","[(Calcium, 2.1189999999999998), (Cholesterol, ..."
3,Italian Sausage and Bread Stuffing,"['1 (¾- to 1-pound) round Italian loaf, cut in...",Preheat oven to 350°F with rack in middle. Gen...,italian-sausage-and-bread-stuffing-240559,"[1 (¾- to 1-pound) round Italian loaf, cut int...",3120.0,"[(Carbs, 138.66), (Fats, 261.18), (Fiber, 9.0)...","[(Calcium, 7.649000000000001), (Cholesterol, 1..."
4,Newton's Law,"['1 teaspoon dark brown sugar', '1 teaspoon ho...",Stir together brown sugar and hot water in a c...,newtons-law-apple-bourbon-cocktail,"[1 teaspoon dark brown sugar, 1 teaspoon hot w...",508.0,"[(Carbs, 133.3), (Fats, 0.59), (Fiber, 7.6), (...","[(Calcium, 0.155), (Cholesterol, 0.0), (Cholin..."
...,...,...,...,...,...,...,...,...
13496,Brownie Pudding Cake,"['1 cup all-purpose flour', '2/3 cup unsweeten...",Preheat the oven to 350°F. Into a bowl sift to...,brownie-pudding-cake-14408,"[1 cup all-purpose flour, 2/3 cup unsweetened ...",3572.0,"[(Carbs, 415.16), (Fats, 184.01), (Fiber, 63.3...","[(Calcium, 6.939000000000001), (Cholesterol, n..."
13497,Israeli Couscous with Roasted Butternut Squash...,"['1 preserved lemon', '1 1/2 pound butternut s...",Preheat oven to 475°F.\nHalve lemons and scoop...,israeli-couscous-with-roasted-butternut-squash...,"[1 preserved lemon, 1 1/2 pound butternut squa...",3284.0,"[(Carbs, 301.96000000000004), (Fats, 253.20999...","[(Calcium, 2.29), (Cholesterol, 0.215), (Choli..."
13498,Rice with Soy-Glazed Bonito Flakes and Sesame ...,['Leftover katsuo bushi (dried bonito flakes) ...,"If using katsuo bushi flakes from package, moi...",rice-with-soy-glazed-bonito-flakes-and-sesame-...,[Leftover katsuo bushi (dried bonito flakes) f...,1277.0,"[(Carbs, 160.9), (Fats, 50.85), (Fiber, 13.200...","[(Calcium, 1.024), (Cholesterol, 0.0), (Cholin..."
13499,Spanakopita,['1 stick (1/2 cup) plus 1 tablespoon unsalted...,Melt 1 tablespoon butter in a 12-inch heavy sk...,spanakopita-107344,[1 stick (1/2 cup) plus 1 tablespoon unsalted ...,1529.0,"[(Carbs, 56.96), (Fats, 138.26), (Fiber, 23.4)...","[(Calcium, 0.837), (Cholesterol, 0.304), (Chol..."
