# Data cleaning for non-vegetarian recipes

In [1]:
import pandas as pd
import requests
from pathlib import Path
import json
from src.utils import *

In [2]:
# All inputs and outputs of this notebook live under the relative "data" folder.
DATA_DIR = Path("data")
RECIPES_FILE = DATA_DIR.joinpath("recipes_non_vege.json")  # raw recipes to clean
ECV_FILE = DATA_DIR.joinpath("impact_co2.json")  # local copy of the API response
OUTPUT_FILE = DATA_DIR.joinpath("recipes_non_vege_with_ecv.json")  # rewritten after each stage

# Endpoint queried for the CO2 reference data; note that the host is impactco2.fr.
AGRIBALYSE_API_URL = "https://impactco2.fr/api/v1/alimentation?category=group&language=fr"

#### Get the raw recipes dataset

In [3]:
# Read the UTF-8 recipes file in one go and parse it as JSON.
recipes = json.loads(RECIPES_FILE.read_text(encoding="utf-8"))

# Bare expression: the notebook displays the number of loaded recipes.
len(recipes)

2037

#### Get the Agribalyse CO2 reference data (Impact CO2 API)

In [4]:
# Load the CO2 reference data, downloading and caching it on first use.
# After this cell `ecv_data` is always bound; any failure raises immediately
# instead of surfacing later as a NameError in a downstream cell.
if ECV_FILE.exists():
    # Cached copy available: no network access needed.
    with open(ECV_FILE, "r", encoding="utf-8") as f:
        ecv_data = json.load(f)
else:
    # A timeout keeps "Run All" from hanging forever on a stalled connection.
    response = requests.get(AGRIBALYSE_API_URL, timeout=30)
    # Raise on 4xx/5xx rather than silently skipping the assignment.
    response.raise_for_status()
    ecv_data = response.json()  # converts the response to a Python dictionary
    # Cache under the same path the read branch uses, so the two cannot drift apart.
    with open(ECV_FILE, "w", encoding="utf-8") as f:
        json.dump(ecv_data, f, ensure_ascii=False, indent=4)

In [5]:
# Build a flat lookup {lower-cased slug: ecv} from every item of every
# category listed under ecv_data["data"]; a later duplicate slug overwrites
# an earlier one, exactly as with sequential assignment.
ecv_dict = {
    entry["slug"].lower(): entry["ecv"]
    for group in ecv_data["data"]
    for entry in group["items"]
}

## 1. Filter out all ingredients that don't exist in the Agribalyse database

In [6]:
# Pass the raw recipes and the slug -> ecv lookup to filter_recipes_based_on_ecv
# (not defined in this notebook; presumably provided by the star import from
# src.utils), then print how many recipes remain.
filtered_recipes = filter_recipes_based_on_ecv(recipes, ecv_dict)
print(len(filtered_recipes))

1782


## 2. Filter out all vegetarian recipes

In [7]:
# Apply filter_recipes with is_vege=False and the full API payload (ecv_data),
# rebinding filtered_recipes to the result, then print the remaining count.
# NOTE(review): presumably is_vege=False keeps only non-vegetarian recipes —
# confirm against the helper's implementation.
filtered_recipes = filter_recipes(filtered_recipes, ecv_data, is_vege=False)
print(len(filtered_recipes))

1190


In [8]:
# Save the current intermediate result to OUTPUT_FILE so it can be inspected.
# The file is opened in "w" mode, so any previous contents are replaced.
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(filtered_recipes, f, ensure_ascii=False, indent=4)

In [9]:
# Collect the unique ingredients across the filtered recipes via
# get_unique_ingredients and display how many there are.
unique_ing = get_unique_ingredients(filtered_recipes)
len(unique_ing)

169

## 3. Standardize the unit and quantity

In [10]:
# Rebind filtered_recipes to the output of normalize_ingredients and display
# the recipe count (unchanged at 1190 in the recorded run).
filtered_recipes = normalize_ingredients(filtered_recipes)
len(filtered_recipes)

1190

In [11]:
# Re-apply the is_vege=False filter on the normalized recipes. In the recorded
# run the count drops again (1190 -> 1079), so the filter gives a different
# result after normalization — presumably because normalization changes the
# ingredient names the filter looks at (TODO confirm).
filtered_recipes = filter_recipes(filtered_recipes, ecv_data, is_vege=False)
len(filtered_recipes)

1079

In [12]:
# Save the normalized, re-filtered recipes to OUTPUT_FILE for inspection.
# The file is opened in "w" mode, so any previous contents are replaced.
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(filtered_recipes, f, ensure_ascii=False, indent=4)

In [13]:
# Recompute the unique ingredients on the normalized recipes and display the
# count, for comparison with the pre-normalization figure.
unique_ing = get_unique_ingredients(filtered_recipes)
len(unique_ing)

149

In [14]:
# Extract the "special" ingredients with extract_spec_ingredients and print how
# many were found; the weight table defined further down is keyed by these
# ingredient names.
# NOTE(review): presumably these are ingredients counted by piece rather than
# by weight — confirm against the helper.
spec_ing = extract_spec_ingredients(filtered_recipes)
print(f"Total unique ingredients: {len(spec_ing)}")
# To inspect the extracted ingredients themselves, display `spec_ing` here.

Total unique ingredients: 102


In [15]:
# Hand-written weight of one unit of each piece-counted ingredient found in the
# non-vegetarian recipes. Keys are ingredient names as they appear in the
# recipes; values are presumably grams per unit (TODO confirm against
# add_weights). Entry order is kept as originally written.
spec_ing_weights_non_vege = {
    "poulet": 150,  # generic portion
    "tomates pelées": 400,  # per can
    "moules": 20,  # per mussel
    "crevettes roses": 20,  # per shrimp
    "avocat": 200,  # per avocado
    "travers de porc": 200,  # per rack
    "oignons nouveaux": 30,  # per onion
    "escalopes de porc": 150,  # per escalope
    "galettes de riz": 10,  # per rice cake
    "rôti de porc": 1800,  # per roast
    "cuisses de lapin": 250,  # per leg
    "boeuf haché": 200,  # per portion
    "magrets de canard": 350,  # per breast
    "saumon": 200,  # per fillet
    "crevettes": 20,  # per shrimp
    "saumon fumé": 50,  # per slice
    "yaourt nature": 125,  # per pot
    "mangues": 300,  # per mango
    "carottes": 70,  # per carrot
    "beurre doux": 10,  # per pat
    "jaunes d'oeuf": 18,  # per yolk
    "blancs de poulet": 150,  # per piece
    "oeuf": 60,  # per egg
    "jaune d'oeuf": 18,  # per yolk
    "cuisses de poulet": 250,  # per leg
    "rôti de veau": 1800,  # per roast
    "paupiettes de veau": 200,  # per piece
    "côte de boeuf": 300,  # per steak
    "escalopes de poulet": 150,  # per escalope
    "filets de poulet": 150,  # per fillet
    "porc": 150,  # generic portion
    "pommes de terre": 150,  # per potato
    "courgette": 200,  # per zucchini
    "carotte": 70,  # per carrot
    "lapin": 200,  # per piece
    "côte de porc": 200,  # per chop
    "tête de veau": 1200,  # whole head
    "tomates": 100,  # per tomato
    "escalopes de veau": 150,  # per escalope
    "chorizo": 50,  # per slice or link — TODO confirm which unit is meant
    "blancs d'oeuf": 30,  # per egg white
    "cuisses de canard": 300,  # per leg
    "yaourt": 125,  # per pot
    "beurre": 10,  # per pat
    "échine de porc": 200,  # per portion
    "côtes de porc": 200,  # per chop
    "joue de porc": 80,  # per cheek
    "pieds de veau": 300,  # per foot
    "poireau": 150,  # per leek
    "magret de canard": 350,  # per breast
    "oeufs": 60,  # per egg
    "tomate": 100,  # per tomato
    "poulet de 1,3 à 1,4 kg u": 1350,  # midpoint of the range named in the key
    "canard": 1200,  # per whole duck
    "fond de veau": 250,  # per package
    "jarrets de porc": 400,  # per shank
    "rôti de boeuf": 1800,  # per roast
    "pomme": 150,  # per apple
    "ailes de poulet": 50,  # per wing
    "courgettes": 200,  # per zucchini
    "pied de veau": 300,  # per foot
    "poulets": 1200,  # per whole chicken (NOTE(review): "poulet entier" uses 1300)
    "langue de veau": 400,  # per tongue
    "pilons de poulet": 200,  # per drumstick
    "rognon de veau": 100,  # per kidney
    "poireaux": 150,  # per leek
    "lieu noir": 200,  # per fillet
    "blanc de poulet": 150,  # per piece
    "pâte à pizza": 400,  # per dough base
    "veau": 200,  # per portion
    "riz": 200,  # per cup, cooked
    "langues de veau": 400,  # per tongue (same value as the singular entry)
    "bouillon de boeuf": 250,  # per carton
    "filet de poulet": 150,  # per fillet
    "rognons de veau": 100,  # per kidney (same value as the singular entry)
    "lapin entier": 1200,  # per whole rabbit
    "petit os de veau": 100,  # per small bone
    "escalope de poulet": 150,  # per escalope
    "yaourts brassés": 125,  # per pot
    "aiguillettes de poulet": 100,  # per portion
    "poulet entier": 1300,  # per whole chicken
    "osso bucco de veau": 300,  # per cut
    "tomates concassées": 400,  # per can
    "râble de lapin": 250,  # per saddle
    "pomme de terre": 150,  # per potato
    "yaourts": 125,  # per pot
    "bananes": 120,  # per banana
    "tomates confites à l’huile": 50,  # per jar portion
    "coeurs de canard": 50,  # per heart
    "mangue": 300,  # per mango
    "râbles de lapin": 250,  # per saddle
    "boeuf": 200,  # per portion
    "côtes de veau": 300,  # per chop
    "jarrets de veau": 400,  # per shank
    "tomates séchées": 10,  # per small portion
    "foie gras de canard du sud-ouest cru surgelé": 150,  # per block
    "ris de veau": 300,  # per portion
    "banane": 120,  # per banana
    "tomates cerise": 20,  # per cherry tomato
    "cuisse de canard": 300,  # per leg
    "blanc de poireau": 50,  # per section
}


In [16]:
# Combine the extracted special ingredients with the manual weight table;
# spec_ing is rebound to whatever add_weights returns.
spec_ing = add_weights(spec_ing, spec_ing_weights_non_vege)

In [17]:
# Standardize the recipes using the weighted special ingredients and display
# the resulting recipe count.
# NOTE(review): "standalized" is a misspelling of "standardized"; the name is
# reused by later cells, so renaming it must be done notebook-wide.
standalized_recipes = standardize_recipes(filtered_recipes, spec_ing)
len(standalized_recipes)

1079

In [18]:
# Save the standardized recipes to OUTPUT_FILE for inspection.
# The file is opened in "w" mode, so any previous contents are replaced.
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(standalized_recipes, f, ensure_ascii=False, indent=4)

## 4. Calculate ECV based on unit and quantity

In [19]:
# Rebind standalized_recipes to the output of scale_ecv and display the recipe
# count (still 1079 in the recorded run).
standalized_recipes = scale_ecv(standalized_recipes)
len(standalized_recipes)

1079

In [20]:
# Write the ECV-scaled recipes to OUTPUT_FILE. This is the last write for the
# non-vegetarian dataset in this notebook and replaces the earlier snapshots.
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(standalized_recipes, f, ensure_ascii=False, indent=4)

# Same procedure for vegetarian recipes

In [21]:
# Repoint the shared path constants at the vegetarian dataset.
# Caution: from here on, re-running an earlier cell that reads RECIPES_FILE or
# writes OUTPUT_FILE will act on the vegetarian files.
RECIPES_FILE = DATA_DIR.joinpath("recipes_vege.json")
OUTPUT_FILE = DATA_DIR.joinpath("recipes_vege_with_ecv.json")

In [22]:
# Read the UTF-8 recipes file currently named by RECIPES_FILE and parse it as JSON.
recipes = json.loads(RECIPES_FILE.read_text(encoding="utf-8"))

# Bare expression: the notebook displays the number of loaded recipes.
len(recipes)

1755

In [23]:
# Same first step as for the non-vegetarian set: pass the recipes and the
# slug -> ecv lookup to filter_recipes_based_on_ecv, then print the count.
filtered_recipes = filter_recipes_based_on_ecv(recipes, ecv_dict)
print(len(filtered_recipes))

1147


In [24]:
# Apply filter_recipes with is_vege=True this time, rebinding filtered_recipes,
# then print the remaining count.
# NOTE(review): presumably is_vege=True keeps only vegetarian recipes — confirm
# against the helper's implementation.
filtered_recipes = filter_recipes(filtered_recipes, ecv_data, is_vege=True)
print(len(filtered_recipes))

1021


In [25]:
# Save the current intermediate result to OUTPUT_FILE so it can be inspected.
# The file is opened in "w" mode, so any previous contents are replaced.
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(filtered_recipes, f, ensure_ascii=False, indent=4)

In [26]:
# Collect the unique ingredients across the filtered vegetarian recipes and
# display how many there are.
unique_ing = get_unique_ingredients(filtered_recipes)
len(unique_ing)

97

In [27]:
# Rebind filtered_recipes to the output of normalize_ingredients and display
# the recipe count (unchanged at 1021 in the recorded run).
filtered_recipes = normalize_ingredients(filtered_recipes)
len(filtered_recipes)

1021

In [28]:
# Save the normalized recipes to OUTPUT_FILE for inspection.
# The file is opened in "w" mode, so any previous contents are replaced.
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(filtered_recipes, f, ensure_ascii=False, indent=4)

In [29]:
# Re-apply the is_vege=True filter on the normalized recipes. In the recorded
# run the count drops again (1021 -> 969), so the filter gives a different
# result after normalization — presumably because normalization changes the
# ingredient names the filter looks at (TODO confirm).
filtered_recipes = filter_recipes(filtered_recipes, ecv_data, is_vege=True)
len(filtered_recipes)

969

In [30]:
# Save the normalized, re-filtered recipes to OUTPUT_FILE for inspection.
# The file is opened in "w" mode, so any previous contents are replaced.
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(filtered_recipes, f, ensure_ascii=False, indent=4)

In [31]:
# Recompute the unique ingredients on the normalized vegetarian recipes.
# NOTE(review): the result is neither displayed here nor read by any later
# cell visible in this notebook; add `len(unique_ing)` to show the count.
unique_ing = get_unique_ingredients(filtered_recipes)

In [32]:
# Extract the "special" ingredients of the vegetarian recipes with
# extract_spec_ingredients and print how many were found; this rebinds the
# spec_ing name previously used for the non-vegetarian set.
spec_ing = extract_spec_ingredients(filtered_recipes)
print(f"Total unique ingredients: {len(spec_ing)}")

Total unique ingredients: 55


In [33]:
# Hand-written weight of one unit of each piece-counted ingredient found in the
# vegetarian recipes. Keys are ingredient names as they appear in the recipes;
# values are presumably grams per unit (TODO confirm against add_weights).
# Entry order is kept as originally written.
spec_ing_weights_vege = {
    "tomates": 100,  # per tomato
    "courgettes": 200,  # per zucchini
    "poireau": 150,  # per leek
    "carottes": 70,  # per carrot
    "pommes de terre": 150,  # per potato
    "oeuf": 60,  # per egg
    "carotte": 70,  # per carrot
    "tomate": 100,  # per tomato
    "courgette": 200,  # per zucchini
    "oeufs": 60,  # per egg
    "bananes": 120,  # per banana
    "poireaux": 150,  # per leek
    "pomme de terre": 150,  # per potato
    "pâte à pizza": 400,  # per dough base
    "pommes": 150,  # per apple
    "avocat": 200,  # per avocado
    "tomates séchées à l'huile d'olive": 10,  # per small portion
    "feuilles de riz": 10,  # per sheet
    "banane": 120,  # per banana
    "jaunes d'oeuf": 18,  # per yolk
    "baguette": 250,  # per loaf
    "yaourt à la grecque": 150,  # per pot
    "sauce tomate": 400,  # per jar
    "jaune d'oeuf": 18,  # per yolk
    "tomates cerise": 20,  # per cherry tomato
    "tomates séchées": 10,  # per small portion
    "yaourts natures": 125,  # per pot
    "blette": 200,  # per bunch
    "peau de banane": 50,  # peel of one banana
    "mangues": 300,  # per mango
    "pomme": 150,  # per apple
    "yaourt bulgare": 125,  # per pot
    "galettes de riz": 10,  # per rice cake
    "blancs d'oeuf": 30,  # per egg white
    "avocats": 200,  # per avocado
    "yaourt nature": 125,  # per pot
    "double concentré de tomates": 50,  # per tube
    "sirop d'érable": 20,  # per tablespoon
    "beurre tendre": 10,  # per pat
    "potiron": 400,  # per small pumpkin portion
    "tofu": 200,  # per block
    "mangue": 300,  # per mango
    "beurre": 10,  # per pat
    "carotte pourpre": 70,  # per carrot
    "vermicelles de riz": 80,  # per dry portion (NOTE(review): was "per 100g dry", which conflicts with 80)
    "yaourt": 125,  # per pot
    "blanc d'oeuf": 30,  # per egg white
    "yaourt brassé": 125,  # per pot
    "tomates pelées": 400,  # per can
    "blanc de poireau": 50,  # per section
    "laitue": 200,  # per head
    "yaourts": 125,  # per pot
    "petits beurre": 10,  # per cookie
    "yaourts brassés": 125,  # per pot
    "beurre allégé": 10,  # per pat
    "beurre demi-sel": 10,  # per pat
    "purée de tomate": 400,  # per can
}


In [34]:
# Combine the extracted special ingredients with the vegetarian weight table;
# spec_ing is rebound to whatever add_weights returns.
spec_ing = add_weights(spec_ing, spec_ing_weights_vege)

In [35]:
# Standardize the vegetarian recipes using the weighted special ingredients
# and display the resulting recipe count. This rebinds standalized_recipes,
# which previously held the non-vegetarian result.
standalized_recipes = standardize_recipes(filtered_recipes, spec_ing)
len(standalized_recipes)

969

In [36]:
# Rebind standalized_recipes to the output of scale_ecv (ECV scaling step,
# same as for the non-vegetarian set).
standalized_recipes = scale_ecv(standalized_recipes)

In [37]:
# Write the ECV-scaled vegetarian recipes to OUTPUT_FILE. The file is opened in
# "w" mode, so this replaces the intermediate snapshots written earlier.
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(standalized_recipes, f, ensure_ascii=False, indent=4)