In [2]:
import pandas as pd
import ast
import re
from rapidfuzz import process as rapidfuzz_process  # Faster fuzzy matching
import swifter  # Parallelized .apply
from functools import lru_cache

# --- Step 0: Load data ---
# Both CSVs are read as UTF-8; undecodable bytes are replaced instead of raising.
csv_read_options = {"encoding": "utf-8", "encoding_errors": "replace"}
df = pd.read_csv("datasets/foodstruct_nutritional_facts.csv", **csv_read_options)
df_recipe = pd.read_csv("datasets/recipes_with_images.csv", **csv_read_options)

# --- Step 1: Parse ingredient lists safely ---
def parse_ingredient_list(ingredient_str):
    """Parse the textual form of a Python list of ingredients.

    Args:
        ingredient_str: A string such as "['1 cup flour', '2 eggs']". Any
            non-string value (for example NaN from an empty CSV cell) is
            accepted and treated as "no ingredients".

    Returns:
        A list with the parsed elements. An empty list is returned when the
        input is not a string, cannot be parsed as a Python literal, or parses
        to something other than a list/tuple.
    """
    if not isinstance(ingredient_str, str):
        return []

    try:
        parsed = ast.literal_eval(ingredient_str)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
        # These are the errors ast.literal_eval is documented to raise on bad input.
        print(f"Failed to parse ingredient list: {ingredient_str} — {e}")
        return []

    # A valid literal is not necessarily a list: a bare string would later be
    # iterated character by character, a dict key by key. Reject those.
    if not isinstance(parsed, (list, tuple)):
        print(
            f"Failed to parse ingredient list: {ingredient_str} — "
            f"expected a list, got {type(parsed).__name__}"
        )
        return []

    return list(parsed)

# Swap the stringified lists for real Python lists, one row at a time.
ingredient_strings = df_recipe['Cleaned_Ingredients']
df_recipe['Cleaned_Ingredients'] = ingredient_strings.apply(parse_ingredient_list)

# --- Step 2: Clean ingredient strings ---
# Preparation notes; some alternatives also swallow everything that follows them.
_preparation_notes = r"\b(?:peeled|chopped|smashed|halved|drained|optional|divided|reserved|seeded|grated|discarded|cut(?: into)?(?:.*)?|coarsely|finely|room temperature|thinly(?:.*)?|loosely packed|plus extra(?:.*)?)\b"
# Anything wrapped in parentheses.
_parentheticals = r"\([^)]*\)"
# Digits and simple fractions such as "1/2".
_quantities = r"\d+[\/\d]*"
# Units of measure and container words.
_units = r"\b(?:teaspoons?|tablespoons?|cups?|pounds?|ounces?|cloves?|slices?|cans?|sticks?|pieces?|bags?|packages?)\b"
# Descriptive adjectives and filler words.
_descriptors = r"\b(?:fresh|small|large|medium|extra-virgin|unsalted|dried|sweet|hot|cold|frozen|bottled|baby|thin|coarse|fine|creamy|whole|firm|boneless|skinless|crusty|chilled|shredded|meat|with|from|of|and|in|to|for|not|very|cut|or|into|on|at|the|a|an)\b"

# Applied in this order; every match is deleted from the ingredient text.
common_patterns = [
    _preparation_notes,
    _parentheticals,
    _quantities,
    _units,
    _descriptors,
]


def clean_ingredient(ingredient):
    """Strip a raw ingredient line down to a bare, lower-case name.

    The text is lower-cased, every match of each regex in ``common_patterns``
    is removed (in list order), all characters other than a-z and whitespace
    are dropped, and runs of whitespace are collapsed to single spaces.

    Args:
        ingredient: Raw ingredient text, e.g. "2 cups chopped fresh parsley".

    Returns:
        The cleaned text with no leading or trailing whitespace; it may be
        an empty string.
    """
    text = ingredient.lower()
    for regex in common_patterns:
        text = re.sub(regex, '', text)
    letters_only = re.sub(r'[^a-z\s]', '', text)
    # split() discards leading/trailing whitespace, so no extra strip is needed.
    return ' '.join(letters_only.split())

# --- Step 3: Prepare fuzzy matching ---
# Lower-cased food names offered to the fuzzy matcher as candidates.
# Missing names are dropped and repeated names collapsed (first occurrence
# kept, original order preserved) so the matcher only sees distinct strings.
nutrient_foods = df['Food Name'].dropna().str.lower().unique().tolist()
# Cleaned ingredient names that are never matched against the food table.
skip_ingredients = {'salt', 'pepper', 'water', 'oil'}

@lru_cache(maxsize=None)
def fuzzy_match(ingredient, *, min_score=85, min_length=4):
    """Return the best-matching food name for a cleaned ingredient, or None.

    Results are memoized, so repeated lookups of the same ingredient are free.

    Args:
        ingredient: Cleaned, lower-case ingredient text.
        min_score: A match is accepted only when its score is strictly
            greater than this value.
        min_length: Ingredients shorter than this many characters are not
            matched at all.

    Returns:
        The matching entry from ``nutrient_foods``, or None when the
        ingredient is in ``skip_ingredients``, is too short, or has no
        candidate scoring above ``min_score``.
    """
    if ingredient in skip_ingredients or len(ingredient) < min_length:
        return None

    # score_cutoff lets rapidfuzz reject weak candidates early. The cutoff is
    # inclusive, so the strict comparison below is still required.
    match = rapidfuzz_process.extractOne(
        ingredient, nutrient_foods, score_cutoff=min_score
    )
    if match is None:
        return None

    name, score, _ = match
    return name if score > min_score else None

# Clean every raw ingredient exactly once. Non-string entries are ignored,
# mirroring the guard in calculate_nutrients, so one malformed list element
# cannot abort the whole run.
cleaned_ingredients = {
    clean_ingredient(raw)
    for ingredient_list in df_recipe['Cleaned_Ingredients']
    for raw in ingredient_list
    if isinstance(raw, str)
}
unique_ingredients = cleaned_ingredients - skip_ingredients

# Cleaned ingredient -> matched food name (None when nothing matched well enough).
ingredient_to_food = {ing: fuzzy_match(ing) for ing in unique_ingredients}

# --- Step 4: Calculate nutrients ---
# Every column other than the two descriptive ones is treated as a nutrient value.
metadata_columns = ['Food Name', 'Category Name']
nutrient_columns = df.columns.difference(metadata_columns)
def average_duplicate_foods(df):
    """Collapse foods whose names differ only by case into one averaged row.

    The value columns are derived from the frame that is passed in (every
    column except 'Food Name', 'Category Name' and the helper key), so the
    function does not depend on module-level state.

    Side effect: a 'food_name_lower' column is added to ``df`` in place.

    Args:
        df: Frame with 'Food Name' and 'Category Name' columns plus the
            value columns to average.

    Returns:
        A new frame indexed by 'food_name_lower'. 'Food Name' and
        'Category Name' hold the first value seen in each group; all other
        columns hold the group mean.
    """
    meta_columns = ['Food Name', 'Category Name']
    key = 'food_name_lower'

    df[key] = df['Food Name'].str.lower()
    value_columns = list(df.columns.difference(meta_columns + [key]))

    grouped = df.groupby(key)
    # Average the value columns across duplicates, keep the first metadata values.
    averaged = grouped[value_columns].mean()
    metadata = grouped[meta_columns].first()

    # Both results are indexed by the group key, so an index join lines them up.
    return metadata.join(averaged)

# Create cleaned and indexed food dataframe
df_indexed = average_duplicate_foods(df)

# Optional: print how many duplicates were averaged.
# The count is taken straight from 'Food Name' rather than from a helper
# column added as a side effect elsewhere. Rows without a name are excluded:
# groupby drops them, so they are never averaged into another entry.
num_duplicates = df['Food Name'].dropna().str.lower().duplicated().sum()
if num_duplicates > 0:
    print(f"✅ Averaged {num_duplicates} duplicate food entries.")

# Distinct error messages collected while summing nutrients; reported later.
unique_nutrient_errors = set()


def calculate_nutrients(ingredient_list):
    """Sum the nutrient values of every matched ingredient in a recipe.

    Each string entry is cleaned, mapped to a food name through
    ``ingredient_to_food`` and looked up in ``df_indexed``. Unmatched or
    non-string entries contribute nothing.

    Args:
        ingredient_list: List of raw ingredient strings. Any other type
            yields an all-zero result.

    Returns:
        A float Series indexed by ``nutrient_columns`` holding the totals.
        Missing or non-numeric nutrient values count as 0 so that a single
        gap does not turn the whole total into NaN.
    """
    total = pd.Series(0.0, index=nutrient_columns)
    if not isinstance(ingredient_list, list):
        return total

    for raw in ingredient_list:
        if not isinstance(raw, str):
            continue

        food_name = ingredient_to_food.get(clean_ingredient(raw))
        if not isinstance(food_name, str) or food_name not in df_indexed.index:
            continue

        match_row = df_indexed.loc[food_name]
        if isinstance(match_row, pd.DataFrame):
            match_row = match_row.iloc[0]  # Take the first row if duplicate

        try:
            # The row also carries text metadata, so it is object-typed;
            # convert the nutrient slice to real numbers before adding.
            values = pd.to_numeric(match_row[nutrient_columns], errors='coerce')
        except (KeyError, TypeError, ValueError) as e:
            unique_nutrient_errors.add(
                f"Error adding nutrients for '{food_name}': {e}"
            )
            continue

        total += values.fillna(0.0)

    return total

# --- Step 5: Calculate nutrients per recipe ---
ingredient_lists = df_recipe['Cleaned_Ingredients']
df_nutrients = ingredient_lists.apply(calculate_nutrients)

# --- Step 6: Print any errors ---
if unique_nutrient_errors:
    print("\nUnique errors during nutrient addition:")
    # One message per line, same as printing them one by one.
    print(*unique_nutrient_errors, sep="\n")

# --- Step 7: Combine and save ---
# Place the nutrient totals next to the recipe columns, aligned on the row index.
df_recipe_total = pd.concat([df_recipe, df_nutrients], axis="columns")
output_path = "datasets/recipe_nutrition_output.csv"
df_recipe_total.to_csv(output_path, index=False)


✅ Averaged 2 duplicate food entries.
