In [18]:
from pathlib import Path
import re
import os
import json
import re
import unidecode


# Introduction

In [19]:
json_file_path = 'openrecipes_mini.json'
# Check if the file exists
if not os.path.exists(json_file_path):
    raise Exception('File does not exist')
with open(json_file_path, 'r') as file:
    lines = json.load(file)

# Check all unique keys
keys = set()
for line in lines:
    for key in line.keys():
        keys.add(key)

print(keys)

{'url', 'ingredients', 'cookTime', 'datePublished', 'description', 'name', 'recipeYield', '_id', 'source', 'prepTime', 'totalTime', 'recipeCategory', 'creator', 'dateModified', 'image', 'ts'}


In [20]:
#get percentage of recipes with all keys
for key in keys:
    count = 0
    for line in lines:
        if key in line:
            count += 1
    print(key, count/len(lines))


url 1.0
ingredients 1.0
cookTime 0.9618815896188159
datePublished 0.6277372262773723
description 0.9252500675858341
name 1.0
recipeYield 0.9972965666396323
_id 1.0
source 1.0
prepTime 0.9731008380643417
totalTime 0.03663152203298189
recipeCategory 0.03555014868883482
creator 0.036226007028926734
dateModified 0.019059205190592052
image 0.986212489862125
ts 1.0


In [21]:
# Print 3 random examples for each key
import random   
for key in keys:
    print(key)
    i = 0
    while i < 3:
        try:
            print(random.choice(lines)[key])
            i += 1
        except KeyError:
            continue
    print('\n')

url
http://tastykitchen.com/recipes/desserts/jujube-cake/
http://www.bbc.co.uk/food/recipes/trufflepanacotta_10350
http://www.aspicyperspective.com/2010/06/best-things-in-life.html


ingredients
2 pounds 2 pounds
2 Tablespoons 2 Tablespoons
1 cup 1 cup
1 teaspoon 1 teaspoon
1 Tablespoon 1 Tablespoon
¼ teaspoons ¼ teaspoons
4 Tablespoons 4 Tablespoons
4 skinless chicken breasts
1 onion
1 carrot
1 stick celery
1 bay leaf
6 black peppercorns
30g/1oz butter
30g/1oz flour
150ml/¼ pint white wine
1 free-range egg yolk
2-3 tbsp double cream
salt and freshly ground black pepper
squeeze lemon juice
2 tbsp chopped fresh parsley
, to serve mashed potato
1 whole 1 whole
4 cups 4 cups
2 stalks 2 stalks
3 sticks 3 sticks
1 whole 1 whole
2 Tablespoons 2 Tablespoons
1 package 1 package


cookTime
PT30M
PT
PT30M


datePublished
2009-07-14
2010-07-23
2011-08-12


description
None
You’ll need to start this recipe the day before you want to eat it, but the subtly smoked, meltingly tender lamb is so worth 

A lot of preprocessing to do

# Preprocessing

In [22]:
def test_cleaning_function(func, lines, column, n=10):
    # Randomly shuffle lines and select first n lines
    random.shuffle(lines)
    
    # Initialize counter
    i = 0

    for line in lines:
        if column in line:
            try:
                print(line[column], ' -> ', func(line[column]))
                i += 1
            except KeyError:
                continue 
        if i >= n:
            break

## For the yield
Keep the int values, ignore drinks and recipes for more than 12 people

In [23]:
# 10 examples of recipeYield
for i,line in enumerate(lines):
    if 'recipeYield' in line:
        print(line['recipeYield'])
    if i > 10:
        break

12
12
8
Serves 1.
12
2
12
12
18
12


In [24]:
def clean_yield(field):
    if field is None:
        return None
    
    # Extract all numbers from the string
    numbers = re.findall(r'\b\d+\b', field)

    if len(numbers) == 1:
        # Single number
        num = int(numbers[0])
        return num if num < 13 else None
    elif len(numbers) == 2:
        # Two numbers - calculate the average
        num1, num2 = map(int, numbers)
        avg = (num1 + num2) / 2
        return int(avg) if avg < 13 else None
    else:
        # No numbers or pattern not matched
        return None

In [25]:
# 10 examples of cleaned recipeYield
test_cleaning_function(clean_yield, lines, 'recipeYield', 10)


2  ->  2
Serves 1  ->  1
4  ->  4
6  ->  6
18  ->  None
4  ->  4
Serves 4  ->  4
8  ->  8
Serves 6-10  ->  8
Serves 4  ->  4


## For time
transform the format into int minutes

In [26]:
def time_to_min(time_str):
    if time_str is None:
        return None
    # Extract hours and minutes using regular expression
    match = re.match(r'PT(\d+H)?(\d+M)?', time_str)
    if not match:
        return None

    hours, minutes = match.groups()

    # Convert hours and minutes to integers
    total_minutes = 0
    if hours:
        total_minutes += int(hours[:-1]) * 60  # Remove 'H' and convert to minutes
    if minutes:
        total_minutes += int(minutes[:-1])     # Remove 'M'
    
    if total_minutes == 0:
        return None
    
    return total_minutes

In [27]:
print('cleaned totalTime')
test_cleaning_function(time_to_min, lines, 'totalTime', 2)
print('cleaned prepTime')
test_cleaning_function(time_to_min, lines, 'prepTime', 2)   
print('cleaned cookTime')       
test_cleaning_function(time_to_min, lines, 'cookTime', 2)

cleaned totalTime
PT30M  ->  30
None  ->  None
cleaned prepTime
PT15M  ->  15
PT50M  ->  50
cleaned cookTime
PT1H  ->  60
PT30M  ->  30


## For ingredients
This is the hard part of the cleanup.

In [28]:
# 1 random ingredient example  
ingredients = random.choice(lines)['ingredients']
print(ingredients)


morello cherries 
good french brandy
(to every 450g/1lb of cherries allow half the quantity of pounded sugarsugar


In [29]:
def remove_leading(text):
    prefixes = ['.25oz ', '.5oz ', '.75oz ', '.5fl oz ', '/½ ', '/¼ ', 'of ', '¼oz ', '¼ oz ','½oz ', '½ oz ', 'oz ', '1oz ', '1 oz ', '2oz ', '2 oz ', '1/2 oz ', '1/2oz ',  '3oz ', '3 oz ', '4oz ', '4 oz ', '5oz ', '5 oz ', '6oz ', '6 oz ', '7oz ', '7 oz ', '8oz ', '8 oz ', '9oz ', '9 oz ', '10oz ', '10 oz ',' ','. ']
    for prefix in prefixes:
        if text.startswith(prefix):
            return text[len(prefix):]
    return text

def unicode_to_float(frac_str):
    # Convert vulgar fractions like ½ to floats like 0.5
    vulgar_fracs = {
        '½': 0.5,
        '⅓': 0.3333,
        '⅔': 0.6666,
        '¼': 0.25,
        '¾': 0.75
    }
    return vulgar_fracs.get(frac_str, frac_str)

def strfrac_to_float(frac_str):
    # Convert string fractions like '1/2' to floats like 0.5
    if '/' not in frac_str:
        return float(frac_str)
    num, denom = frac_str.split('/')
    return float(num) / float(denom)

def add_floats_match(match):
    # Convert the matched group to a float
    int_nb = int(match.group(1))
    frac_nb = unicode_to_float(match.group(2))
    if isinstance(frac_nb, float):
        return str(int_nb + frac_nb)
    else:
        return match.group(0)

def is_repeating(ingredient_str):
    pattern = re.compile(r'(\b[\S\s]+?\b)(?:\s+\1\b|\n\s*\1\b)')
    return pattern.match(ingredient_str) is not None   

def parse_ingredient(ingredient_str):

    if is_repeating(ingredient_str):
        return (None, None, None)
    
    # Check if the string is fully alphabetic
    if ingredient_str.replace(" ", "").isalpha():
        return (None, None, ingredient_str)

    # Convert leading vulgar fraction to a float
    clean_str = re.sub(r'^([½⅓⅔¼¾])\s*', lambda m: str(unicode_to_float(m.group(1))), ingredient_str)

    # Remove any leading non-numeric text
    clean_str = re.sub(r'^[^\d]+', '', clean_str)

    # Handle combined numbers like "4 ½" or "2¾"
    clean_str = re.sub(r'(\d+)\s*+(½|⅓|⅔|¼|¾|\d+/\d+)', add_floats_match, clean_str)

    # Pattern for fractional quantities like "½ tsp salt"
    pattern_fractional = r'([^\d\s]*\s*\d+/\d+|[^\d\s]+)\s*([a-zA-Z]+)\s*(.*)'
    # Pattern for quantities with units like "250g/9oz butter"
    pattern_dual_unit = r'\b(\d+\.?\d*)\s*([a-zA-Z]+)(?:\/\d+\s*[a-zA-Z]*)?\s*(.*)'

    # Try matching the fractional pattern
    match = re.match(pattern_fractional, clean_str.strip())
    if match:
        value, unit, ingredient = match.groups()
        return (strfrac_to_float(value), unit, remove_leading(ingredient))

    # Try matching the dual unit pattern
    match = re.match(pattern_dual_unit, clean_str.strip())
    if match:
        value, unit, ingredient = match.groups()
        return (strfrac_to_float(value), unit, remove_leading(ingredient))

    # Return None, None, ingredient if no pattern matches
    return (None, None, ingredient_str)

In [30]:
# 10 examples of cleaned ingredients
for i,line in enumerate(lines):
    if 'ingredients' in line:
        for ingredient in line['ingredients'].split('\n')[1:]:
            print(ingredient, ' -> ', parse_ingredient(ingredient))
    if i > 1:
        break

1 small onion (or 2 shallots) chopped  ->  (1.0, 'small', 'onion (or 2 shallots) chopped')
1 cup Panko breadcrumbs  ->  (1.0, 'cup', 'Panko breadcrumbs')
3 celery roots  ->  (3.0, 'celery', 'roots')
3 Honey Crisp apples (or Jonagold)  ->  (3.0, 'Honey', 'Crisp apples (or Jonagold)')
1 2/3 cup heavy cream  ->  (None, None, '1 2/3 cup heavy cream')
2 tsp. kosher salt  ->  (2.0, 'tsp', 'kosher salt')
½ tsp black pepper  ->  (0.5, 'tsp', 'black pepper')
1 ½ tsp. pumpkin pie spice  ->  (1.5, 'tsp', 'pumpkin pie spice')
8 oz. Gouda cheese, shredded  ->  (8.0, 'oz', 'Gouda cheese, shredded')
1 Tb. butter for dish  ->  (1.0, 'Tb', 'butter for dish')
3 clove garlic  ->  (3.0, 'clove', 'garlic')
1 medium onion  ->  (1.0, 'medium', 'onion')
2 tablespoon olive oil  ->  (2.0, 'tablespoon', 'olive oil')
½ cup fresh basil  ->  (0.5, 'cup', 'fresh basil')
½ cup Parmesan cheese  ->  (0.5, 'cup', 'Parmesan cheese')
50ml/2fl oz water   ->  (50.0, 'ml', 'water')
1 star anise  ->  (1.0, 'star', 'anise')
2.

In [31]:
#Obtained from listing all the uniques units from parsed ingredients ranked by frequency

valid_unit_list = ['g',
 'tbsp',
 'tsp',
 'cup',
 'ml',
 'teaspoon',
 'cups',
 'tablespoon',
 'Tb',
 'tablespoons',
 'Tablespoons',
 'teaspoons',
 'ounces',
 'oz',
 'sprigs',
 'slices',
 'Tablespoon',
 'clove',
 'kg',
 'sprig',
 'pound',
 'cm',
 'stick',
 'spring',
 'sticks',
 'litre',
 'can',
 'lb',
 'litres',
 'ounce',
 'pinch',
 'pounds',
 'handful',
 'lbs',
 'cans',
 'jar',
 'pint',
 'fennel',
 'bottle',
 'Tbsp',
 'grams',
 'Tbs',
 'L',
 'lbs']

def normalize_unit(unit):
    unit_mapping = {
        'tablespoon': 'tbsp',
        'Tablespoon': 'tbsp',
        'Tablespoons': 'tbsp',
        'tablespoons': 'tbsp',
        'Tb': 'tbsp',
        'Tbs': 'tbsp',
        'Tbsp': 'tbsp',
        'teaspoon': 'tsp',
        'teaspoons': 'tsp',
        'cup': 'cups',
        'milliliters': 'ml',
        'litre': 'L',
        'litres': 'L',
        'ounce': 'oz',
        'ounces': 'oz',
        'kilograms': 'kg',
        'pound': 'lb',
        'pounds': 'lb',
        'lbs': 'lb',
        'centimeters': 'cm',
        'stick': 'sticks',
        'sprig': 'sprigs',
        'can': 'cans',
        'pinch': 'pinches',
        'handful': 'handfuls',
        'jar': 'jars',
        'pint': 'pints',
        'grams': 'g'
    }
    return unit_mapping.get(unit, unit)

def group_ingredients(ingredient): 
    if 'butter' in ingredient:
        return 'butter'
    if 'eggs' in ingredient or 'egg' in ingredient:
        return 'eggs'
    if 'flour' in ingredient:
        return 'flour'
    if 'caster sugar' in ingredient:
        return 'sugar'
    if 'mint' in ingredient:
        return 'mint'
    if 'parsley' in ingredient:
        return 'parsley'
    
    
    return ingredient

def clean_ingredients(ingredients_str):
    """
    Returns a list of cleaned ingredients.
    """
    ingredients = ingredients_str.split('\n')[1:]
    quantities, units, names = [], [], []
    for ingredient in ingredients:
        
        quantity, unit, name = parse_ingredient(ingredient)
        if quantity is None:
            continue
        quantities.append(quantity)

        if unit not in valid_unit_list:
            name = unit + ' ' + name
            unit = None
        else:
            unit = normalize_unit(unit)   
        units.append(unit)

        name = name.strip()
        name = group_ingredients(name)
        names.append(name)

    return quantities, units, names

# 10 examples of cleaned ingredients
ings = random.choice(lines)['ingredients']
print(ings)
print(clean_ingredients(ings)) 

4-½ cups 4-½ cups
2 teaspoons 2 teaspoons
½ teaspoons ½ teaspoons
1-½ cup 1-½ cup
1-½ cup 1-½ cup
2 cups 2 cups
2 cups 2 cups
4 cups 4 cups
5 whole 5 whole
2 teaspoons 2 teaspoons
([], [], [])


## For categories 

In [32]:
#list all categories and count
categories = set()
for line in lines:
    if 'recipeCategory' in line and line['recipeCategory'] is not None:
        categories.add(line['recipeCategory'])  

#print ordered count of categories  
from collections import Counter
category_count = Counter()
for line in lines:
    if 'recipeCategory' in line and line['recipeCategory'] is not None:
        category_count[line['recipeCategory']] += 1
print(category_count.most_common()) 

def normalize_categ(category):
    category.strip()
    category = category.lower()

    if ("main" in category) or ("dinner" in category) or ('lunch' in category):
        return "main"
    if ("cookie" in category) or ('dessert' in category) or ('cake' in category):
        return "dessert"
    if "side" in category:
        return "side dish"
    if "appetizer" in category:
        return "appetizer"
    if "soup" in category:
        return "soup"
    if "water" in category:
        return "water"
    
    return category

normalized_categ_count = Counter()
for line in lines:
    if 'recipeCategory' in line and line['recipeCategory'] is not None:
        normalized_categ_count[normalize_categ(line['recipeCategory'])] += 1
print(normalized_categ_count.most_common())


[('Breakfast', 54), ('Main Dishes', 18), ('Dessert', 12), ('Desserts', 12), ('breakfast', 12), ('Appetizers', 11), ('Bread', 11), ('Vegetarian Main Course', 11), ('Appetizer', 8), ('Soup', 8), ('Miscellaneous', 7), ('Salad', 6), ('Cookies/Brownies', 6), ('vegetarian main course', 5), ('Soups', 5), ('Lunch', 4), ('Cookies', 3), ('Dinner', 3), ('Cookie', 3), ('dessert', 2), ('soup', 2), ('Side Dishes', 2), ('Odds and Ends', 2), ('snack', 2), ('Snack', 2), ('Baked Good', 1), ('Baked Goods', 1), ('Drink', 1), ('appetizer', 1), ('Vegetarian main course', 1), ('BreadAppetizers', 1), ('Cakes/Cupcakes', 1), ('MiscellaneousBreakfast', 1), ('Cake', 1), ('odd and ends', 1), ('Side Dish', 1), ('bread', 1), ('Vegetarian Dinner', 1)]
[('breakfast', 66), ('main', 43), ('dessert', 40), ('appetizer', 21), ('soup', 15), ('bread', 12), ('miscellaneous', 7), ('salad', 6), ('snack', 4), ('side dish', 3), ('odds and ends', 2), ('baked good', 1), ('baked goods', 1), ('drink', 1), ('miscellaneousbreakfast', 1

some imperfections left, but it will be sorted out later

# Merge


In [33]:
#get he counts for most common ingredients
from collections import Counter
from itertools import chain

ingredients = list(chain.from_iterable([clean_ingredients(line['ingredients'])[2] for line in lines]))
counts = Counter(ingredients)
print(counts.most_common(200))
#200 most common ingredients in a list 
common_ingredients = [ing for ing,ct in counts.most_common(200)]

[('butter', 1644), ('eggs', 1524), ('sugar', 1205), ('flour', 1031), ('olive oil', 808), ('garlic', 622), ('salt', 579), ('double cream', 436), ('lemon', 398), ('parsley', 322), ('water', 300), ('milk', 283), ('baking powder', 250), ('onion', 244), ('vegetable oil', 198), ('icing sugar', 184), ('cinnamon', 172), ('vanilla extract', 172), ('honey', 163), ('mint', 154), ('onions', 148), ('chicken stock', 144), ('lemon juice', 132), ('fresh thyme', 130), ('white wine vinegar', 128), ('extra virgin olive oil', 122), ('lime', 121), ('shallots', 120), ('celery', 119), ('orange', 119), ('Sugar', 118), ('bay leaf', 115), ('vanilla pod', 112), ('baking soda', 112), ('Salt', 110), ('cumin', 101), ('kosher salt', 98), ('ground cumin', 94), ('granulated sugar', 94), ('cloves garlic', 93), ('thyme', 91), ('Dijon mustard', 88), ('brown sugar', 87), ('Butter', 86), ('ground cinnamon', 83), ('tomato purée', 83), ('white wine', 82), ('carrots', 82), ('carrot', 81), ('Olive Oil', 81), ('shallot', 78), (

In [34]:
#get he counts for most common categories
from collections import Counter
from itertools import chain

categs = []    
for line in lines:
    if 'recipeCategory' in line and line['recipeCategory'] is not None:
        categs.append(normalize_categ(line['recipeCategory'])) 
counts = Counter(categs)
print(counts.most_common(10))
#200 most common ingredients in a list 
common_categs = [ing for ing,ct in counts.most_common(10)]

[('breakfast', 66), ('main', 43), ('dessert', 40), ('appetizer', 21), ('soup', 15), ('bread', 12), ('miscellaneous', 7), ('salad', 6), ('snack', 4), ('side dish', 3)]


In [43]:
common_ingredients

['butter',
 'eggs',
 'sugar',
 'flour',
 'olive oil',
 'garlic',
 'salt',
 'double cream',
 'lemon',
 'parsley',
 'water',
 'milk',
 'baking powder',
 'onion',
 'vegetable oil',
 'icing sugar',
 'cinnamon',
 'vanilla extract',
 'honey',
 'mint',
 'onions',
 'chicken stock',
 'lemon juice',
 'fresh thyme',
 'white wine vinegar',
 'extra virgin olive oil',
 'lime',
 'shallots',
 'celery',
 'orange',
 'Sugar',
 'bay leaf',
 'vanilla pod',
 'baking soda',
 'Salt',
 'cumin',
 'kosher salt',
 'ground cumin',
 'granulated sugar',
 'cloves garlic',
 'thyme',
 'Dijon mustard',
 'brown sugar',
 'Butter',
 'ground cinnamon',
 'tomato purée',
 'white wine',
 'carrots',
 'carrot',
 'Olive Oil',
 'shallot',
 'soy sauce',
 'red chilli',
 'cocoa powder',
 'bay leaves',
 'heavy cream',
 'dry white wine',
 'red wine vinegar',
 'ground almonds',
 'dark chocolate',
 'sunflower oil',
 'whole milk',
 'vanilla',
 'lemons',
 'cloves',
 'ground turmeric',
 'balsamic vinegar',
 'red wine',
 'ground coriander',


### Translation

Since the goal of the database is to give ideas for meals, the name being in english is fine, but the ingredients should be in French to have consistency with the other db

In [35]:
LANG = 'fr' #or 'en'
translate_ing = True
translate_categ = False

import deepl

auth_key = '9093d919-3022-b8c7-19ba-93ceff08f8d7:fx'
translator = deepl.Translator(auth_key)
if translate_ing:       
    translated_ingredients = []
    for ingredient in common_ingredients:
        if len(ingredient) > 0:
            translated_ingredients.append(translator.translate_text(ingredient, target_lang='FR').text)
            print(ingredient, ' -> ', translated_ingredients[-1])
if translate_categ:
    translated_categories = []
    for category in common_categs:
        if len(category) > 0:
            translated_categories.append(translator.translate_text(category, target_lang='FR').text)
            print(category, ' -> ', translated_categories[-1])   



butter  ->  beurre
eggs  ->  œufs
sugar  ->  sucre
flour  ->  farine
olive oil  ->  huile d'olive
garlic  ->  ail
salt  ->  sel
double cream  ->  crème double
lemon  ->  citron
parsley  ->  persil
water  ->  l'eau
milk  ->  lait
baking powder  ->  poudre à lever
onion  ->  oignon
vegetable oil  ->  huile végétale
icing sugar  ->  sucre glace
cinnamon  ->  cannelle
vanilla extract  ->  extrait de vanille
honey  ->  miel
mint  ->  tels que
onions  ->  oignons
chicken stock  ->  stock de poulet
lemon juice  ->  jus de citron
fresh thyme  ->  thym frais
white wine vinegar  ->  vinaigre de vin blanc
extra virgin olive oil  ->  huile d'olive extra vierge
lime  ->  chaux
shallots  ->  échalotes
celery  ->  céleri
orange  ->  orange
Sugar  ->  Sucre
bay leaf  ->  feuille de laurier
vanilla pod  ->  gousse de vanille
baking soda  ->  bicarbonate de soude
Salt  ->  Sel
cumin  ->  cumin
kosher salt  ->  sel casher
ground cumin  ->  cumin moulu
granulated sugar  ->  sucre cristallisé
cloves garlic

In [36]:
#handmade tweaks
translated_categories = ['petit déjeuner', 'plat', 'dessert', 'appéritif', 'soupe', 'pain', 'divers', 'salade', 'gouter', 'accompagnement']

In [37]:
translated_ingredients = ['beurre',
 'œufs',
 'sucre',
 'farine',
 "huile d'olive",
 'ail',
 'sel',
 'crème double',
 'citron',
 'persil',
 "l'eau",
 'lait',
 'poudre à lever',
 'oignon',
 'huile végétale',
 'sucre glace',
 'extrait de vanille',
 'cannelle',
 'miel',
 'tels que',
 'oignons',
 'stock de poulet',
 'jus de citron',
 'thym frais',
 'vinaigre de vin blanc',
 "huile d'olive extra vierge",
 'chaux',
 'échalotes',
 'orange',
 'céleri',
 'Sucre',
 'feuille de laurier',
 'bicarbonate de soude',
 'gousse de vanille',
 'Sel',
 'cumin',
 'sel casher',
 'sucre cristallisé',
 'cumin moulu',
 "gousses d'ail",
 'thym',
 'Moutarde de Dijon',
 'sucre roux',
 'Beurre',
 'cannelle moulue',
 'purée de tomates',
 'carottes',
 'vin blanc',
 "Huile d'olive",
 'carotte',
 'échalote',
 'sauce soja',
 'piment rouge',
 'poudre de cacao',
 'feuilles de laurier',
 'crème épaisse',
 'amandes moulues',
 'vinaigre de vin rouge',
 'vin blanc sec',
 'chocolat noir',
 'huile de tournesol',
 'vanille',
 'clous de girofle',
 'citrons',
 'lait entier',
 'curcuma moulu',
 'vinaigre balsamique',
 'coriandre moulue',
 'vin rouge',
 'Farine tout usage',
 'oignon rouge',
 "sirop d'érable",
 'graines de coriandre',
 'gingembre moulu',
 'anis étoilé',
 'poivre rouge',
 'Parmesan',
 'brandy',
 'miel clair',
 'concombre',
 'poivre noir',
 'vinaigre de xérès',
 'fromage à la crème',
 'poivre noir fraîchement moulu',
 'coriandre fraîche hachée',
 'petit oignon',
 'crème fraîche',
 'gélatine',
 'garam masala',
 'sel de mer',
 'gros oignon',
 'oignon moyen',
 'semblait',
 'noix de muscade',
 'bicarbonate de soude',
 'cayenne pepper',
 'paprika',
 'huile de sésame',
 'raspberries',
 'bouillon de légumes',
 'romarin frais',
 "gousses d'ail, émincées",
 'sucre en poudre',
 'sucre roux doux',
 'piment',
 "sirop d'or",
 'crème à fouetter',
 'Crème épaisse',
 'Moutarde anglaise',
 'grains de poivre noir',
 'tomates',
 'Œufs entiers',
 "gousses d'ail",
 'raisins secs',
 'mascarpone',
 'tomates cerises',
 'romarin',
 'raisins secs',
 'vinaigre de cidre',
 'cardamome',
 'curcuma',
 'graines de sésame',
 'mayonnaise',
 'câpres',
 'bouillon de bœuf',
 'crème fraîche',
 'asperges',
 'lime juice',
 'amandes effilées',
 'jus de citron frais',
 'huile',
 'fraises',
 'sauce de poisson',
 'graines de fenouil',
 'Lait entier',
 'lait entier',
 'sucre demerara',
 'lard',
 "gousses d'ail, émincées",
 'chocolat blanc',
 'Extrait de vanille',
 'eau froide',
 'piment en poudre',
 'Sel Kosher',
 'poivre',
 'gingembre',
 'coriandre fraîche',
 'Poudre à lever',
 'Vanille',
 'pinte de crème double',
 'les citrons verts',
 'Sauce Worcestershire',
 'vinaigre de riz',
 'Sucre brun',
 'eau bouillante',
 'paprika fumé',
 'pignons de pin',
 'ciboulette fraîche hachée',
 'rhubarbe',
 'origan séché',
 'pois surgelés',
 'riz basmati',
 'lait de coco',
 'chapelure',
 'échalotes à la banane',
 'Farine',
 'port',
 'sucanat',
 'huile de colza',
 'Œuf entier',
 'petit oignon rouge',
 'épices mélangées',
 'noix',
 'de thym frais haché',
 'pommes de terre nouvelles',
 'noix de muscade moulue',
 'pommes de terre',
 'cardamome verte',
 'poireaux',
 'oranges',
 'groseilles',
 'piment vert',
 'poudre de curry',
 'sucre naturel',
 'bouillon de poisson',
 'jus de citron vert frais',
 'champignons de Paris',
 'huile de canola',
 'Yaourt à la grecque',
 'poivre vert',
 'épinards',
 'morceau de gingembre frais',
 'Bicarbonate de soude',
 'pois frais',
 'eau chaude',
 'gousses de vanille',
 "jus d'orange",
 'aneth frais haché',
 "huile d'olive extra vierge"]

def clean_translated_ingredients(translated_ings):
    clean = []
    for ing in translated_ings:
        #remove accents
        ing = unidecode.unidecode(ing)
        #remove capital letters 
        ing = ing.lower()
        #remove 'de ', "l' ", 'du ', 'des ', "d' ", 'le ', 'les ' if leading
        for prefix in ['de ', "l'", 'du ', 'des ', "d'", 'le ', 'les ']:
            if ing.startswith(prefix):
                ing = ing[len(prefix):]
        #oe
        ing = re.sub(r'œ', 'oe', ing)
        clean.append(ing)
        #dele
    return clean

translated_ingredients = clean_translated_ingredients(translated_ingredients)
print(translated_ingredients) 
    
        
        

['beurre', 'oeufs', 'sucre', 'farine', "huile d'olive", 'ail', 'sel', 'creme double', 'citron', 'persil', 'eau', 'lait', 'poudre a lever', 'oignon', 'huile vegetale', 'sucre glace', 'extrait de vanille', 'cannelle', 'miel', 'tels que', 'oignons', 'stock de poulet', 'jus de citron', 'thym frais', 'vinaigre de vin blanc', "huile d'olive extra vierge", 'chaux', 'echalotes', 'orange', 'celeri', 'sucre', 'feuille de laurier', 'bicarbonate de soude', 'gousse de vanille', 'sel', 'cumin', 'sel casher', 'sucre cristallise', 'cumin moulu', "gousses d'ail", 'thym', 'moutarde de dijon', 'sucre roux', 'beurre', 'cannelle moulue', 'puree de tomates', 'carottes', 'vin blanc', "huile d'olive", 'carotte', 'echalote', 'sauce soja', 'piment rouge', 'poudre de cacao', 'feuilles de laurier', 'creme epaisse', 'amandes moulues', 'vinaigre de vin rouge', 'vin blanc sec', 'chocolat noir', 'huile de tournesol', 'vanille', 'clous de girofle', 'citrons', 'lait entier', 'curcuma moulu', 'vinaigre balsamique', 'cor

In [38]:
#replace '' by 'ginger' in list
if '' in common_ingredients:   
    common_ingredients[common_ingredients.index('')] = 'ginger' 

#remove entry 73 of common_ingredients
del common_ingredients[73]

for i in range(200-1):
    print(i, common_ingredients[i], ' -> ', translated_ingredients[i])

0 butter  ->  beurre
1 eggs  ->  oeufs
2 sugar  ->  sucre
3 flour  ->  farine
4 olive oil  ->  huile d'olive
5 garlic  ->  ail
6 salt  ->  sel
7 double cream  ->  creme double
8 lemon  ->  citron
9 parsley  ->  persil
10 water  ->  eau
11 milk  ->  lait
12 baking powder  ->  poudre a lever
13 onion  ->  oignon
14 vegetable oil  ->  huile vegetale
15 icing sugar  ->  sucre glace
16 cinnamon  ->  extrait de vanille
17 vanilla extract  ->  cannelle
18 honey  ->  miel
19 mint  ->  tels que
20 onions  ->  oignons
21 chicken stock  ->  stock de poulet
22 lemon juice  ->  jus de citron
23 fresh thyme  ->  thym frais
24 white wine vinegar  ->  vinaigre de vin blanc
25 extra virgin olive oil  ->  huile d'olive extra vierge
26 lime  ->  chaux
27 shallots  ->  echalotes
28 celery  ->  orange
29 orange  ->  celeri
30 Sugar  ->  sucre
31 bay leaf  ->  feuille de laurier
32 vanilla pod  ->  bicarbonate de soude
33 baking soda  ->  gousse de vanille
34 Salt  ->  sel
35 cumin  ->  cumin
36 kosher salt

Handmade translation for the units

In [39]:
def translation_unit_mapping(english_unit):
    translation_map = {
        None: None,
        'g': 'g',
        'ml': 'ml',
        'tbsp': 'c. a s.',
        'tsp': 'c. a c.',
        'cans': 'boites',
        'sticks': 'batonnets',
        'clove': 'gousse',
        'jars': 'pots',
        'slices': 'tranches',
        'bottle': 'bouteille',
        'cm': 'cm',
        'sprigs': 'brins',
        'L': 'L',
        'pinches': 'pincees',
        'kg': 'kg',
        'fennel': 'fenouil',
        'handfuls': 'poignees',
    }
    return translation_map.get(english_unit, english_unit)

In [40]:
def uncommon_ingredient(ingredient, common_ingredients):
    return ingredient not in common_ingredients

def convert_to_metric(value, unit):
    conversion_factors = {
        'cups': {'factor': 236.588, 'metric_unit': 'ml'},
        'oz': {'factor': 28.3495, 'metric_unit': 'g'},
        'lb': {'factor': 453.592, 'metric_unit': 'g'},
        'pints': {'factor': 473.176, 'metric_unit': 'ml'},
    }
    if unit in conversion_factors:
        factor = conversion_factors[unit]['factor']
        metric_unit = conversion_factors[unit]['metric_unit']
        return value * factor, metric_unit
    else:
        return value, unit

def get_difficulty_from_nb_ingredients(nb_ingredients):
    if nb_ingredients <= 5:
        return '1'
    elif nb_ingredients <= 10:
        return '2'
    else:
        return '3'   

unique_units = []
#200mb into memory, its ok
cleaned_lines = []
json_file_path = 'openrecipes_mini.json'
with open(json_file_path, 'r') as file:
    data = json.load(file)
    for i, line in enumerate(data):
        if i == 0:
            continue
        id = i
        #get raw data
        raw_ingredients = line.get('ingredients')
        recipeYield = line.get('recipeYield')
        prepTime = line.get('prepTime')
        url = line.get('url')
        image = line.get('image')
        totalTime = line.get('totalTime')
        cookTime = line.get('cookTime')
        name = line.get('name') 
        source = line.get('source', None)
        category = line.get('recipeCategory', None)
        
        

        #process data
        ingredient_values, ingredient_units, ingredients = clean_ingredients(raw_ingredients)
        if any(ingredient is None for ingredient in ingredients):
            continue

        difficulty = get_difficulty_from_nb_ingredients(len(ingredients))
        prepTime = time_to_min(prepTime)
        cookTime = time_to_min(cookTime)
        totalTime = time_to_min(totalTime)
        if ((prepTime is not None) and (cookTime is not None)) and totalTime is None:
            totalTime = prepTime + cookTime
        elif (prepTime is not None) and (cookTime is None):
            totalTime = prepTime
        elif (prepTime is None) and (cookTime is not None):
            totalTime = cookTime
        if source is None:
            source = 'openrecipes'
        recipeYield = clean_yield(recipeYield)

        #convert to metric system
        for i, value in enumerate(ingredient_values):
            ingredient_values[i], ingredient_units[i] = convert_to_metric(value, ingredient_units[i])


            #delete 
            if ingredient_units[i] not in unique_units:
                unique_units.append(ingredient_units[i])



        if category not in common_categs:
            category = None

        #deletion checks
        if totalTime is None:
            continue
        if int(totalTime) > 240:
            continue
        if (len(ingredients) > 25) or (len(ingredients) < 3):
            continue
        if recipeYield is None:
            continue
        if any(uncommon_ingredient(ingredient, common_ingredients) for ingredient in ingredients):
            continue
        # if list(set(ingredients)) != ingredients:
        #     continue
        
        #translate ingredients to french if LANG == 'fr'
        assert LANG in ['fr', 'en']
        if LANG == 'fr':
            #make the french translation by mapping the id between ingredients and translated_ingredients
            ingredients = [translated_ingredients[common_ingredients.index(ingredient)] for ingredient in ingredients]
            if category is not None:
                category = translated_categories[common_categs.index(category)]
            #translate units
            ingredient_units = [translation_unit_mapping(unit) for unit in ingredient_units]


        cleaned_lines.append({
            'id': id,
            'name': name,
            'source': source,
            'category': category,
            'url': url,
            'image': image,
            'servings': recipeYield,
            'time': totalTime,
            'difficulty': difficulty,
            'ingredients': ingredients,
            'ingredient_values': ingredient_values,
            'ingredient_units': ingredient_units
        })

#write to json
with open('cleaned_openrecipes_mini.json', 'w', encoding='utf-8') as f:
    json.dump(cleaned_lines, f, ensure_ascii=False, indent=2)

In [41]:
#number of lines in raw data
print(len(lines))
#number of lines in cleaned data
print(len(cleaned_lines)) 
print(len(cleaned_lines)/len(lines)*100, '% of recipes kept by cleanup') 
#example of cleaned data
print('example of cleaned data :')
print(cleaned_lines[0])


7398
98
1.324682346580157 % of recipes kept by cleanup
example of cleaned data :
{'id': 126, 'name': 'Salad cream', 'source': 'bbcfood', 'category': None, 'url': 'http://www.bbc.co.uk/food/recipes/salad_cream_31658', 'image': None, 'servings': 8, 'time': 30, 'difficulty': '2', 'ingredients': ['moutarde anglaise', 'citron', 'sucre', 'vinaigre de vin blanc', 'creme double', "huile d'olive"], 'ingredient_values': [2.0, 0.5, 1.0, 3.0, 150.0, 150.0], 'ingredient_units': ['c. a s.', None, 'c. a s.', 'c. a s.', 'ml', 'ml']}


In [None]:
import ast

ings = []
with open('openrecipes.json', 'r') as file:
    for line in file:
        line = ast.literal_eval(line)
        ings += [clean_ingredients(ing) for ing in line['ingredients']]
        print(ings)

counts = Counter(ings)
print(counts.most_common(200))
#200 most common ingredients in a list 
common_ingredients = [ing for ing,ct in counts.most_common(200)]