### *Suggested modules to be installed before starting anything*

In [None]:
# Note: some of these cells need a GPU with >= 12 GB VRAM to run Mistral smoothly; on RAM-constrained machines, try a quantized version instead.

%pip install -r requirements.txt

### Loading train.json file for training

In [2]:
import json

# Read the recipe dataset. The file is decoded explicitly as UTF-8 so that any
# non-ASCII text in it is read the same way on every platform, instead of
# depending on the locale's default encoding.
with open("datasets/recipe-ingredients/train.json", "r", encoding="utf-8") as f:
    recipes = json.load(f)

print(f"Loaded {len(recipes)} recipes.")


Loaded 39774 recipes.


### Define Target dishes

In [3]:
# Read one class name per line from the Food-101 class list.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact location exists.
with open("/home/classes/ee7722/ee772210/Downloads/food-101/meta/classes.txt", "r", encoding="utf-8") as f:
    # Drop blank lines (e.g. a trailing newline at the end of the file). An
    # empty string is a substring of every string, so an empty class name
    # would match every recipe in a substring-based search.
    food101_dishes = [name for name in (line.strip() for line in f) if name]

print("Total classes:", len(food101_dishes))


Total classes: 101


### Search for matches in train.json

In [4]:
from collections import defaultdict, Counter

# Number of most frequent ingredients kept per dish.
TOP_N_INGREDIENTS = 10

# The searchable text of a recipe does not depend on the dish being looked up,
# so build it once per recipe instead of once per (dish, recipe) pair.
recipe_texts = [" ".join(recipe["ingredients"]).lower() for recipe in recipes]

# dish -> flat list of every ingredient from every recipe that matched it.
dish_to_ingredients = defaultdict(list)

for dish in food101_dishes:
    # Class names use underscores; the recipe text uses spaces.
    word_match = dish.replace("_", " ").lower()

    for recipe, joined_ingredients in zip(recipes, recipe_texts):
        # Plain substring test on the space-joined ingredient list.
        # NOTE(review): this can match inside a longer word or across two
        # adjacent ingredients; confirm that is acceptable.
        if word_match in joined_ingredients:
            dish_to_ingredients[dish].extend(recipe["ingredients"])

# Now keep the most common ingredients for each dish.
# Dishes with no matching recipe never get a key and are therefore absent.
dish2ingredients_final = {
    dish: [ing for ing, _ in Counter(all_ingredients).most_common(TOP_N_INGREDIENTS)]
    for dish, all_ingredients in dish_to_ingredients.items()
}


### Save as dish2ingredients.json

In [5]:
# Write the dish -> top-ingredients mapping to disk as indented JSON.
with open("dish2ingredients.json", "w") as out_file:
    out_file.write(json.dumps(dish2ingredients_final, indent=2))

print("✅ Saved dish2ingredients.json")


✅ Saved dish2ingredients.json


### Testing

In [6]:
# Reload the saved mapping from disk and look up a single dish as a smoke test.
with open("dish2ingredients.json") as mapping_file:
    dish_map = json.loads(mapping_file.read())

detected_dish = "pizza"

# Fall back to a placeholder list when the dish has no entry in the mapping.
if detected_dish in dish_map:
    ingredients = dish_map[detected_dish]
else:
    ingredients = ["Unknown dish"]

print("Inferred Ingredients:", ingredients)


Inferred Ingredients: ['olive oil', 'pizza doughs', 'pizza sauce', 'salt', 'shredded mozzarella cheese', 'mozzarella cheese', 'extra-virgin olive oil', 'grated parmesan cheese', 'pizza crust', 'garlic cloves']


### More explicit generation that then contains classes of food not available in FOOD-101 and can do ingredient matching

In [8]:
import json
from collections import defaultdict, Counter
from fuzzywuzzy import fuzz

# === CONFIGURATION ===
# NOTE(review): CLASSES_FILE is an absolute, machine-specific path; the cell
# only runs where this exact location exists.
CLASSES_FILE = "/home/classes/ee7722/ee772210/Downloads/food-101/meta/classes.txt"
RECIPES_FILE = "datasets/recipe-ingredients/train.json"
OUTPUT_FILE = "dish2ingredients.json"

FUZZY_THRESHOLD = 80  # Match quality (0–100); lower = more aggressive
TOP_N_INGREDIENTS = 10  # Most frequent ingredients kept per dish

# === STEP 1: Load Food-101 class names ===
with open(CLASSES_FILE, "r", encoding="utf-8") as f:
    # One class name per line; blank lines are dropped so they never become
    # an (empty) dish name.
    food101_dishes = [name for name in (line.strip() for line in f) if name]

# === STEP 2: Load Kaggle recipe dataset ===
# Decode explicitly as UTF-8 rather than relying on the platform default.
with open(RECIPES_FILE, "r", encoding="utf-8") as f:
    recipes = json.load(f)

# === STEP 3: Match dishes to recipes by fuzzy keyword matching ===
# The lower-cased ingredient list and its space-joined text depend only on the
# recipe, so build both once up front instead of once per (dish, recipe) pair.
recipe_ingredients = [
    [ing.lower() for ing in recipe["ingredients"]] for recipe in recipes
]
recipe_texts = [" ".join(ingredients) for ingredients in recipe_ingredients]

# dish -> flat list of every lower-cased ingredient from every matching recipe.
dish_to_ingredients = defaultdict(list)

for dish in food101_dishes:
    # Class names use underscores; the recipe text uses spaces.
    dish_name = dish.replace("_", " ").lower()

    for ingredients, combined in zip(recipe_ingredients, recipe_texts):
        # Fuzzy match the dish name to the combined ingredients string
        score = fuzz.partial_ratio(dish_name, combined)
        if score >= FUZZY_THRESHOLD:
            dish_to_ingredients[dish].extend(ingredients)

print(f"✅ Matched recipes for {len(dish_to_ingredients)} out of {len(food101_dishes)} classes.")

# === STEP 4: Count and save top ingredients for each dish ===
# Dishes with no matching recipe never get a key and are absent from the output.
dish2ingredients_final = {}
for dish, all_ingredients in dish_to_ingredients.items():
    ingredient_counts = Counter(all_ingredients)
    top_ingredients = [ing for ing, _ in ingredient_counts.most_common(TOP_N_INGREDIENTS)]
    dish2ingredients_final[dish] = top_ingredients

# Save to JSON
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(dish2ingredients_final, f, indent=2)

print(f"✅ Saved dish2ingredients.json to: {OUTPUT_FILE}")


✅ Matched recipes for 68 out of 101 classes.
✅ Saved dish2ingredients.json to: dish2ingredients.json
