# Data cleaning for non-vegetarian recipes

In [1]:
import pandas as pd
import requests
from pathlib import Path
import json
from pathlib import Path
import sys

# Use the parent of the current working directory as the project root
# (assumes the notebook is started from a direct subfolder of the project — TODO confirm).
project_root = Path.cwd().parent
# Put the root on sys.path so the `src.utils` import below can be resolved.
sys.path.append(str(project_root))

from src.utils import *

In [2]:
# File locations, all resolved inside the project's data directory.
DATA_DIR = project_root / "data"
RECIPES_FILE = DATA_DIR.joinpath("recipes_non_vege.json")
ECV_FILE = DATA_DIR.joinpath("impact_co2.json")
OUTPUT_FILE = DATA_DIR.joinpath("recipes_non_vege_with_ecv.json")

# Impact CO2 food endpoint used as the ECV reference (query: category=group, language=fr).
AGRIBALYSE_API_URL = "https://impactco2.fr/api/v1/alimentation?category=group&language=fr"

#### Get the raw recipes dataset

In [3]:
# Read the raw recipe file in one go; the cell displays how many recipes were loaded.
recipes = json.loads(RECIPES_FILE.read_text(encoding="utf-8"))

len(recipes)

2037

In [4]:
# Run the raw recipes through convert_recipe_numbers from src.utils
# (presumably turns numeric fields stored as text into numbers — TODO confirm in src/utils).
# NOTE: `recipes` is rebound, so re-running this cell feeds already-converted data back in.
recipes = convert_recipe_numbers(recipes)

In [5]:
# Checkpoint: write the number-converted recipes to OUTPUT_FILE
# (the same file is overwritten by every later checkpoint).
with OUTPUT_FILE.open("w", encoding="utf-8") as checkpoint:
    checkpoint.write(json.dumps(recipes, ensure_ascii=False, indent=4))

#### Get the Agribalyse-based ECV data from the Impact CO2 API

In [6]:
# Load the ECV reference data, using ECV_FILE as a local cache of the API response.
if ECV_FILE.exists():
    with open(ECV_FILE, "r", encoding="utf-8") as f:
        ecv_data = json.load(f)
else:
    # The timeout keeps the cell from hanging indefinitely, and raise_for_status()
    # fails loudly here instead of leaving `ecv_data` undefined (which previously
    # surfaced as a NameError in the next cell when the API did not answer 200).
    response = requests.get(AGRIBALYSE_API_URL, timeout=30)
    response.raise_for_status()
    ecv_data = response.json()  # converts the response to a Python dictionary
    # Write to ECV_FILE itself (the path tested above) so the next run hits the cache.
    with open(ECV_FILE, "w", encoding="utf-8") as f:
        json.dump(ecv_data, f, ensure_ascii=False, indent=4)

In [7]:
# Build a flat lookup {lower-cased slug: ecv} from the nested category -> items payload.
# When two items share a slug, the later one wins.
ecv_dict = {
    entry["slug"].lower(): entry["ecv"]
    for group in ecv_data["data"]
    for entry in group["items"]
}


## 1. Filter out all ingredients that do not exist in the Agribalyse database

In [8]:
# Filter using the slug -> ECV lookup. The recorded run goes from 2037 to 1782 recipes,
# so whole recipes are dropped — presumably those with ingredients missing from
# ecv_dict (verify in src/utils).
filtered_recipes = filter_recipes_based_on_ecv(recipes, ecv_dict)
print(len(filtered_recipes))

1782


In [9]:
# Checkpoint: overwrite OUTPUT_FILE with the ECV-filtered recipes.
with OUTPUT_FILE.open("w", encoding="utf-8") as checkpoint:
    checkpoint.write(json.dumps(filtered_recipes, ensure_ascii=False, indent=4))

## 2. Filter out all vegetarian recipes

In [10]:
# Call filter_recipes with is_vege=False — presumably keeps only non-vegetarian recipes
# (TODO confirm in src/utils). Recorded run: 1782 -> 1185 recipes.
filtered_recipes = filter_recipes(filtered_recipes, ecv_data, is_vege=False)
print(len(filtered_recipes))

1185


In [11]:
# Checkpoint: overwrite OUTPUT_FILE with the recipes left after the is_vege=False filter.
with OUTPUT_FILE.open("w", encoding="utf-8") as checkpoint:
    checkpoint.write(json.dumps(filtered_recipes, ensure_ascii=False, indent=4))

In [12]:
# Count the distinct ingredients across the remaining recipes (168 in the recorded run).
unique_ing = get_unique_ingredients(filtered_recipes)
len(unique_ing)

168

In [13]:
# Fill in missing quantities in three steps: list the ingredients reported by
# get_empty_quantity, derive a (quantity, unit) pair for each with
# handle_empty_quantity, then write those pairs back into the recipes.
# The fractional values in the recorded output (e.g. 291.67 g) suggest averages over
# other recipes — TODO confirm in src/utils.
# NOTE(review): the recorded output contains 'bouillon de boeuf': (2.0, '60'); '60' is
# not a unit, so the unit derivation in handle_empty_quantity deserves a check.
ing_empty_quantity = get_empty_quantity(filtered_recipes)
ing_with_quantity = handle_empty_quantity(filtered_recipes, ing_empty_quantity)
filtered_recipes = update_recipes_with_quantities(filtered_recipes, ing_with_quantity)
ing_with_quantity

{'riz thaï': (800.0, 'g'),
 'pulpe de tomate': (291.6666666666667, 'g'),
 'fond de veau': (3.0, 'l'),
 'courgette': (1.0, 'kg'),
 'coulis de tomate': (250.0, 'g'),
 'aile de poulet': (500.0, 'g'),
 'poulet': (458.92857142857144, 'g'),
 'carotte': (372.11538461538464, 'g'),
 'riz blanc': (173.33333333333334, 'g'),
 'tomates concassées': (297.14285714285717, 'g'),
 'tomate': (497.6470588235294, 'g'),
 'lapin': (600.0, 'g'),
 'rôti de porc': (633.3333333333334, 'g'),
 'blanc de poulet': (498.14814814814815, 'g'),
 'bouillon de boeuf': (2.0, '60'),
 'oignon nouveau': (300.0, 'g'),
 'beurre': (47.80769230769231, 'g'),
 'crevettes': (340.0, 'g'),
 'boeuf': (617.3076923076923, 'g'),
 'beurre demi-sel': (65.0, 'g'),
 'canard': (200.0, 'g'),
 'riz': (246.66666666666666, 'g'),
 'porc': (372.72727272727275, 'g'),
 'rôti de veau': (1.2333333333333334, 'kg'),
 'saumon': (675.0, 'g'),
 'moules': (150.0, 'g'),
 'veau': (600.0, 'g'),
 'sauté de porc': (575.0, 'g'),
 'riz basmati': (350.0, 'g'),
 'pomm

In [14]:
# Re-run the check after the automatic fill: anything still listed here got no
# quantity from the previous step and needs a manual value.
ing_empty_quantity = get_empty_quantity(filtered_recipes)
ing_empty_quantity

['Beurre spécial cuisson Elle & Vire',
 'Echine de porc 750 g U',
 'graisse de canard',
 'coeur de canard']

In [15]:
# Hand-picked (quantity, unit) fallbacks for the four ingredients that were still
# without a quantity in the previous cell. Keys must match the recipe ingredient
# names exactly as listed there (including capitalisation).
spec_ing_with_quantity = {
    'Beurre spécial cuisson Elle & Vire': (200.0, 'g'),   # typical cooking butter block
    'graisse de canard': (250.0, 'g'),                    # a common jar or tub amount
    'coeur de canard': (400.0, 'g'),                      # average weight of several duck hearts
    'Echine de porc 750 g U': (750.0, 'g')                # weight already given in the product name
}

In [16]:
# Write the manual fallback quantities back into the recipes.
filtered_recipes = update_recipes_with_quantities(filtered_recipes, spec_ing_with_quantity)

In [17]:
# Checkpoint: overwrite OUTPUT_FILE with the recipes after the quantity fill-in.
with OUTPUT_FILE.open("w", encoding="utf-8") as checkpoint:
    checkpoint.write(json.dumps(filtered_recipes, ensure_ascii=False, indent=4))

## 3. Standardize the unit and quantity

In [18]:
# Run normalize_ingredients from src.utils (presumably canonicalises ingredient
# names/units — TODO confirm). The recipe count is unchanged at 1185 in the recorded run.
filtered_recipes = normalize_ingredients(filtered_recipes)
len(filtered_recipes)

1185

In [19]:
# Re-apply the is_vege=False filter after normalization. The recorded run drops from
# 1185 to 1099 recipes, presumably because normalized names change how recipes are
# classified — TODO confirm in src/utils.
filtered_recipes = filter_recipes(filtered_recipes, ecv_data, is_vege=False)
len(filtered_recipes)

1099

In [20]:
# Checkpoint: overwrite OUTPUT_FILE with the normalized, re-filtered recipes.
with OUTPUT_FILE.open("w", encoding="utf-8") as checkpoint:
    checkpoint.write(json.dumps(filtered_recipes, ensure_ascii=False, indent=4))

In [21]:
# Count the distinct ingredients again after normalization (152 in the recorded run,
# down from 168 before).
unique_ing = get_unique_ingredients(filtered_recipes)
len(unique_ing)

152

In [22]:
# Collect the "spec" ingredients via extract_spec_ingredients — presumably those whose
# quantity is a piece count rather than a weight/volume, hence the manual weights
# defined next (TODO confirm in src/utils). 101 entries in the recorded run.
spec_ing = extract_spec_ingredients(filtered_recipes)
print(f"Total spec ingredients: {len(spec_ing)}")

Total spec ingredients: 101


In [23]:
# Hand-estimated weights, keyed by ingredient name, consumed by add_weights in the
# next cell. Going by the inline notes the values are grams per single item
# (assumption — confirm against add_weights / standardize_recipes in src/utils).
# Singular and plural spellings are separate keys and must be kept in sync by hand.
spec_ing_weights_non_vege = {
    "poulet": 150,  # generic portion
    "tomates pelées": 400,  # per can
    "moules": 20,  # per mussel
    "crevettes roses": 20,  # per shrimp
    "avocat": 200,  # per avocado
    "travers de porc": 200,  # per rack
    "oignons nouveaux": 30,  # per onion
    "escalopes de porc": 150,  # per escalope
    "galettes de riz": 10,  # per rice cake
    "rôti de porc": 1800,  # per roast
    "cuisses de lapin": 250,  # per leg
    "boeuf haché": 200,  # per portion
    "magrets de canard": 350,  # per breast
    "saumon": 200,  # per fillet
    "crevettes": 20,  # per shrimp
    "saumon fumé": 50,  # per slice
    "yaourt nature": 125,  # per pot
    "mangues": 300,  # per mango
    "carottes": 70,  # per carrot
    "beurre doux": 10,  # per pat
    "jaunes d'oeuf": 18,  # per yolk
    "blancs de poulet": 150,  # per piece
    "oeuf": 60,  # per egg
    "jaune d'oeuf": 18,  # per yolk
    "cuisses de poulet": 250,  # per leg
    "rôti de veau": 1800,  # per roast
    "paupiettes de veau": 200,  # per piece
    "côte de boeuf": 300,  # per steak
    "escalopes de poulet": 150,  # per escalope
    "filets de poulet": 150,  # per fillet
    "porc": 150,  # generic portion
    "pommes de terre": 150,  # per potato
    "courgette": 200,  # per zucchini
    "carotte": 70,  # per carrot
    "lapin": 200,  # per piece
    "côte de porc": 200,  # per chop
    "tête de veau": 1200,  # whole head
    "tomates": 100,  # per tomato
    "escalopes de veau": 150,  # per escalope
    "chorizo": 50,  # per link slice
    "blancs d'oeuf": 30,  # per egg white
    "cuisses de canard": 300,  # per leg
    "yaourt": 125,  # per pot
    "beurre": 30,  # per pat — NOTE(review): "beurre doux" above uses 10 for the same note; confirm
    "échine de porc": 200,  # per portion
    "côtes de porc": 200,  # per chop
    "joue de porc": 80,  # per cheek
    "pieds de veau": 300,  # per foot
    "poireau": 150,  # per leek
    "magret de canard": 350,  # per breast
    "oeufs": 60,  # per egg
    "tomate": 100,  # per tomato
    "poulet de 1,3 à 1,4 kg u": 1350,  # whole chicken, midpoint of the range in the name
    "canard": 1200,  # per whole duck
    "fond de veau": 250,  # per package
    "jarrets de porc": 400,  # per shank
    "rôti de boeuf": 1800,  # per roast
    "pomme": 150,  # per apple
    "ailes de poulet": 50,  # per wing
    "courgettes": 200,  # per zucchini
    "pied de veau": 300,  # per foot
    "poulets": 1200,  # per whole chicken
    "langue de veau": 400,  # per tongue
    "pilons de poulet": 200,  # per drumstick
    "rognon de veau": 100,  # per kidney
    "poireaux": 150,  # per leek
    "lieu noir": 200,  # per fillet
    "blanc de poulet": 150,  # per piece
    "pâte à pizza": 400,  # per dough base
    "veau": 200,  # per portion
    "riz": 200,  # per cup cooked
    "langues de veau": 400,  # NOTE(review): earlier note said "per two tongues", yet the value equals "langue de veau" — confirm
    "bouillon de boeuf": 250,  # per carton
    "filet de poulet": 150,  # per fillet
    "rognons de veau": 100,  # NOTE(review): earlier note said "per pair", yet the value equals "rognon de veau" — confirm
    "lapin entier": 1200,  # per whole rabbit
    "petit os de veau": 100,  # per small bone
    "escalope de poulet": 150,  # per escalope
    "yaourts brassés": 125,  # per pot
    "aiguillettes de poulet": 100,  # per portion
    "poulet entier": 1300,  # per whole chicken
    "osso bucco de veau": 300,  # per cut
    "tomates concassées": 400,  # per can
    "râble de lapin": 250,  # per saddle
    "pomme de terre": 150,  # per potato
    "yaourts": 125,  # per pot
    "bananes": 120,  # per banana
    "tomates confites à l’huile": 50,  # per jar portion; key uses a typographic apostrophe (’), unlike the other keys
    "coeurs de canard": 50,  # per heart
    "mangue": 300,  # per mango
    "râbles de lapin": 250,  # per saddle
    "boeuf": 200,  # per portion
    "côtes de veau": 300,  # per chop
    "jarrets de veau": 400,  # per shank
    "tomates séchées": 10,  # per small portion
    "foie gras de canard du sud-ouest cru surgelé": 150,  # per block
    "ris de veau": 300,  # per portion
    "banane": 120,  # per banana
    "tomates cerise": 20,  # per cherry tomato
    "cuisse de canard": 300,  # per leg
    "blanc de poireau": 50  # per section
}


In [24]:
# Attach the manual non-vegetarian weights to the extracted spec ingredients.
spec_ing = add_weights(spec_ing, spec_ing_weights_non_vege)

In [25]:
# Standardize the recipes using the weighted spec ingredients (standardize_recipes from
# src.utils). The recipe count stays at 1099 in the recorded run.
standalized_recipes = standardize_recipes(filtered_recipes, spec_ing)
len(standalized_recipes)

1099

In [26]:
# Sanity check: extract the spec ingredients again from the standardized recipes.
# An empty list (as in the recorded run) presumably means nothing was left unconverted.
# NOTE: this rebinds `spec_ing`, replacing the weighted list built earlier.
spec_ing = extract_spec_ingredients(standalized_recipes)
spec_ing

[]

In [27]:
# Checkpoint: overwrite OUTPUT_FILE with the standardized recipes.
with OUTPUT_FILE.open("w", encoding="utf-8") as checkpoint:
    checkpoint.write(json.dumps(standalized_recipes, ensure_ascii=False, indent=4))

## 4. Calculate ECV based on unit and quantity

In [28]:
# Run scale_ecv from src.utils on the standardized recipes (per the section title it
# computes ECV from unit and quantity — details live in src/utils).
# The recipe count stays at 1099 in the recorded run.
standalized_recipes = scale_ecv(standalized_recipes)
len(standalized_recipes)

1099

In [29]:
# Final write for the non-vegetarian set: OUTPUT_FILE now holds the ECV-scaled recipes.
with OUTPUT_FILE.open("w", encoding="utf-8") as checkpoint:
    checkpoint.write(json.dumps(standalized_recipes, ensure_ascii=False, indent=4))

# Same procedure for vegetarian recipes

In [30]:
# Point the input and output paths at the vegetarian dataset.
# NOTE: this rebinds the constants from the configuration cell; re-running any earlier
# cell after this one would read/write the vegetarian files instead.
RECIPES_FILE = DATA_DIR / "recipes_vege.json"
OUTPUT_FILE = DATA_DIR / "recipes_vege_with_ecv.json"

In [31]:
# Read the raw vegetarian recipe file in one go; the cell displays the recipe count.
recipes = json.loads(RECIPES_FILE.read_text(encoding="utf-8"))

len(recipes)

1755

In [32]:
# Same conversion step as for the non-vegetarian set (convert_recipe_numbers from
# src.utils; presumably parses numeric fields stored as text — TODO confirm).
recipes = convert_recipe_numbers(recipes)

In [33]:
# Checkpoint: write the number-converted vegetarian recipes to OUTPUT_FILE.
with OUTPUT_FILE.open("w", encoding="utf-8") as checkpoint:
    checkpoint.write(json.dumps(recipes, ensure_ascii=False, indent=4))

In [34]:
# Filter the vegetarian recipes with the same slug -> ECV lookup built earlier
# (`ecv_dict`). Recorded run: 1755 -> 1146 recipes.
filtered_recipes = filter_recipes_based_on_ecv(recipes, ecv_dict)
print(len(filtered_recipes))

1146


In [35]:
# Checkpoint: overwrite OUTPUT_FILE with the ECV-filtered vegetarian recipes.
with OUTPUT_FILE.open("w", encoding="utf-8") as checkpoint:
    checkpoint.write(json.dumps(filtered_recipes, ensure_ascii=False, indent=4))

In [36]:
# Call filter_recipes with is_vege=True — presumably keeps only vegetarian recipes
# (TODO confirm in src/utils). Recorded run: 1146 -> 1027 recipes.
filtered_recipes = filter_recipes(filtered_recipes, ecv_data, is_vege=True)
print(len(filtered_recipes))

1027


In [37]:
# Checkpoint: overwrite OUTPUT_FILE with the recipes left after the is_vege=True filter.
with OUTPUT_FILE.open("w", encoding="utf-8") as checkpoint:
    checkpoint.write(json.dumps(filtered_recipes, ensure_ascii=False, indent=4))

In [38]:
# Count the distinct ingredients across the remaining vegetarian recipes
# (97 in the recorded run).
unique_ing = get_unique_ingredients(filtered_recipes)
len(unique_ing)

97

In [39]:
# Fill in missing quantities in three steps: list the ingredients reported by
# get_empty_quantity, derive a (quantity, unit) pair for each with
# handle_empty_quantity, then write those pairs back into the recipes.
ing_empty_quantity = get_empty_quantity(filtered_recipes)
ing_with_quantity = handle_empty_quantity(filtered_recipes, ing_empty_quantity)
filtered_recipes = update_recipes_with_quantities(filtered_recipes, ing_with_quantity)

In [40]:
# Re-run the check after the automatic fill: anything still listed here got no
# quantity from the previous step and needs a manual value.
ing_empty_quantity = get_empty_quantity(filtered_recipes)
ing_empty_quantity

['Peau de banane', 'feuille de riz', 'pâte à pizza']

In [41]:
# Hand-picked (quantity, unit) fallbacks for the vegetarian ingredients that were still
# without a quantity in the previous cell. Keys must match the recipe ingredient names
# exactly as listed there (including capitalisation).
# A fresh dict is built rather than update()-ing the non-vegetarian one: this section
# then runs without the earlier cell having defined the dict, and the meat/butter
# fallbacks are not re-applied to vegetarian recipes.
spec_ing_with_quantity = {
    'pâte à pizza': (250.0, 'g'),       # typical pizza dough ball weight
    'feuille de riz': (50.0, 'g'),      # ~10 rice papers (often sold in 50g packs)
    'Peau de banane': (60.0, 'g'),      # average peel from one medium banana
}

In [42]:
# Write the manual fallback quantities back into the vegetarian recipes.
filtered_recipes = update_recipes_with_quantities(filtered_recipes, spec_ing_with_quantity)

In [43]:
# Checkpoint: overwrite OUTPUT_FILE with the vegetarian recipes after the quantity fill-in.
with OUTPUT_FILE.open("w", encoding="utf-8") as checkpoint:
    checkpoint.write(json.dumps(filtered_recipes, ensure_ascii=False, indent=4))

In [44]:
# Run normalize_ingredients from src.utils (presumably canonicalises ingredient
# names/units — TODO confirm). The recipe count is unchanged at 1027 in the recorded run.
filtered_recipes = normalize_ingredients(filtered_recipes)
len(filtered_recipes)

1027

In [45]:
# Checkpoint: overwrite OUTPUT_FILE with the normalized vegetarian recipes.
with OUTPUT_FILE.open("w", encoding="utf-8") as checkpoint:
    checkpoint.write(json.dumps(filtered_recipes, ensure_ascii=False, indent=4))

In [46]:
# Re-apply the is_vege=True filter after normalization. The recorded run drops from
# 1027 to 990 recipes, presumably because normalized names change how recipes are
# classified — TODO confirm in src/utils.
filtered_recipes = filter_recipes(filtered_recipes, ecv_data, is_vege=True)
len(filtered_recipes)

990

In [47]:
# Checkpoint: overwrite OUTPUT_FILE with the normalized, re-filtered vegetarian recipes.
with OUTPUT_FILE.open("w", encoding="utf-8") as checkpoint:
    checkpoint.write(json.dumps(filtered_recipes, ensure_ascii=False, indent=4))

In [48]:
# Count the distinct ingredients again after normalization (94 in the recorded run,
# down from 97 before).
unique_ing = get_unique_ingredients(filtered_recipes)
len(unique_ing)

94

In [49]:
# Collect the "spec" ingredients of the vegetarian set via extract_spec_ingredients —
# presumably those given as a piece count rather than a weight/volume (TODO confirm).
# 55 entries in the recorded run.
spec_ing = extract_spec_ingredients(filtered_recipes)
print(f"Total spec ingredients: {len(spec_ing)}")

Total spec ingredients: 55


In [50]:
# Hand-estimated weights, keyed by ingredient name, consumed by add_weights in the
# next cell. Going by the inline notes the values are grams per single item
# (assumption — confirm against add_weights / standardize_recipes in src/utils).
# Singular and plural spellings are separate keys and must be kept in sync by hand.
spec_ing_weights_vege = {
    "tomates": 100,  # per tomato
    "courgettes": 200,  # per zucchini
    "poireau": 150,  # per leek
    "carottes": 70,  # per carrot
    "pommes de terre": 150,  # per potato
    "oeuf": 60,  # per egg
    "carotte": 70,  # per carrot
    "tomate": 100,  # per tomato
    "courgette": 200,  # per zucchini
    "oeufs": 60,  # per egg
    "bananes": 120,  # per banana
    "poireaux": 150,  # per leek
    "pomme de terre": 150,  # per potato
    "pâte à pizza": 400,  # per dough base
    "pommes": 150,  # per apple
    "avocat": 200,  # per avocado
    "tomates séchées à l'huile d'olive": 10,  # per small portion
    "feuilles de riz": 10,  # per sheet
    "banane": 120,  # per banana
    "jaunes d'oeuf": 18,  # per yolk
    "baguette": 250,  # per loaf
    "yaourt à la grecque": 150,  # per pot
    "sauce tomate": 400,  # per jar
    "jaune d'oeuf": 18,  # per yolk
    "tomates cerise": 20,  # per cherry tomato
    "tomates séchées": 10,  # per small portion
    "yaourts natures": 125,  # per pot
    "blette": 200,  # per bunch
    "peau de banane": 50,  # peel of one banana
    "mangues": 300,  # per mango
    "pomme": 150,  # per apple
    "yaourt bulgare": 125,  # per pot
    "galettes de riz": 10,  # per rice cake
    "blancs d'oeuf": 30,  # per egg white
    "avocats": 200,  # per avocado
    "yaourt nature": 125,  # per pot
    "double concentré de tomates": 50,  # per tube
    "sirop d'érable": 20,  # per tablespoon
    "beurre tendre": 10,  # per pat
    "potiron": 400,  # per small pumpkin portion
    "tofu": 200,  # per block
    "mangue": 300,  # per mango
    "beurre": 30,  # per pat — NOTE(review): the other butter keys here use 10; confirm
    "carotte pourpre": 70,  # per carrot
    "vermicelles de riz": 80,  # NOTE(review): earlier note said "per 100g dry", which does not match the value 80 — confirm
    "yaourt": 125,  # per pot
    "blanc d'oeuf": 30,  # per egg white
    "yaourt brassé": 125,  # per pot
    "tomates pelées": 400,  # per can
    "blanc de poireau": 50,  # per section
    "laitue": 200,  # per head
    "yaourts": 125,  # per pot
    "petits beurre": 10,  # per cookie
    "yaourts brassés": 125,  # per pot
    "beurre allégé": 10,  # per pat
    "beurre demi-sel": 10,  # per pat
    "purée de tomate": 400  # per can
}


In [51]:
# Attach the manual vegetarian weights to the extracted spec ingredients.
spec_ing = add_weights(spec_ing, spec_ing_weights_vege)

In [52]:
# Standardize the vegetarian recipes using the weighted spec ingredients.
# NOTE: this rebinds `standalized_recipes`, which until now held the non-vegetarian result.
# The recipe count stays at 990 in the recorded run.
standalized_recipes = standardize_recipes(filtered_recipes, spec_ing)
len(standalized_recipes)

990

In [53]:
# Sanity check: extract the spec ingredients again from the standardized recipes.
# NOTE(review): unlike the non-vegetarian run (empty list), the recorded output here
# still contains "jaune d'oeuf" (quantity 9.0), although that key has a weight in
# spec_ing_weights_vege. The notebook goes on to scale_ecv regardless — investigate
# why this entry is not converted.
spec_ing = extract_spec_ingredients(standalized_recipes)
spec_ing

[{'ingredient_name': "jaune d'oeuf",
  'quantity': 9.0,
  'recipes': ['TARTE AU CHOCOLAT ET AMANDES GRILLEES PAR ZAKOO44 (végétarien)',
   'Soupe anatolienne végétarienne au yaourt et à la menthe (soupe yayla)',
   'Tourte aux blettes légère et gourmande',
   'Tarte au sucre du Nord (briochée)',
   'Brookies à partager',
   'Tarte abricots miel et lavande',
   'Fondant léger au chocolat',
   'Tarte aux fraises légère',
   'Pâte sablée maison légère',
   'Tarte légère au fromage blanc',
   'Petits gratins de champignons crème légère au foie gras',
   'Salade légère crevettes et oranges',
   'Sablé breton, crème légère vanille et rhubarbe pochée par Johanna Le Pape',
   'Mayonnaise légère à ma façon',
   'Sauce hollandaise légère',
   'Tarte au chèvre et épices légère',
   'Mayonnaise légère au curry',
   'Petits pains légers sans pétrissage',
   'Millefeuille léger aux fraises et au mascarpone']}]

In [54]:
# Run scale_ecv from src.utils on the standardized vegetarian recipes (same step as
# section 4 for the non-vegetarian set).
standalized_recipes = scale_ecv(standalized_recipes)

In [55]:
# Final write for the vegetarian set: OUTPUT_FILE now holds the ECV-scaled recipes.
with OUTPUT_FILE.open("w", encoding="utf-8") as checkpoint:
    checkpoint.write(json.dumps(standalized_recipes, ensure_ascii=False, indent=4))