In [136]:
from pathlib import Path
import re
import os
import json
import re


# Introduction

In [137]:
json_file_path = 'openrecipes_mini.json'
# Check if the file exists
if not os.path.exists(json_file_path):
    raise Exception('File does not exist')
with open(json_file_path, 'r') as file:
    lines = json.load(file)

# Check all unique keys
keys = set()
for line in lines:
    for key in line.keys():
        keys.add(key)

print(keys)

{'description', 'source', 'datePublished', 'recipeYield', 'recipeCategory', 'dateModified', 'ts', 'ingredients', '_id', 'image', 'totalTime', 'creator', 'url', 'prepTime', 'cookTime', 'name'}


In [138]:
#get percentage of recipes with all keys
for key in keys:
    count = 0
    for line in lines:
        if key in line:
            count += 1
    print(key, count/len(lines))


description 0.9252500675858341
source 1.0
datePublished 0.6277372262773723
recipeYield 0.9972965666396323
recipeCategory 0.03555014868883482
dateModified 0.019059205190592052
ts 1.0
ingredients 1.0
_id 1.0
image 0.986212489862125
totalTime 0.03663152203298189
creator 0.036226007028926734
url 1.0
prepTime 0.9731008380643417
cookTime 0.9618815896188159
name 1.0


In [139]:
# Print 3 random examples for each key
import random   
for key in keys:
    print(key)
    i = 0
    while i < 3:
        try:
            print(random.choice(lines)[key])
            i += 1
        except KeyError:
            continue
    print('\n')

description
None
Our family recipe we made growing up. This is my all-time favorite recipe!
This was such an easy throw-together meal last night. Reminded me of the meatless days of yore.     They're cheesy, sour crea...


source
naturallyella
tastykitchen
tastykitchen


datePublished
2011-08-19
2010-08-03
2010-07-21


recipeYield
12
12
8


recipeCategory
Breakfast
Breakfast
Vegetarian Dinner


dateModified
2011-10-25T17:42:46+00:00
2012-12-10T01:36:11+00:00
2011-08-28T03:14:08+00:00


ts
{'$date': 1365278681924}
{'$date': 1365280338567}
{'$date': 1365367753577}


ingredients
6 pounds 6 pounds
½ cups ½ cups
½ cups ½ cups
2 Tablespoons 2 Tablespoons
1 teaspoon 1 teaspoon
4 Tablespoons 4 Tablespoons
1 Tablespoon 1 Tablespoon
3 teaspoons 3 teaspoons
2-½ pounds 2-½ pounds
3 Tablespoons 3 Tablespoons
2 cloves 2 cloves
5 cups 5 cups
½ whole ½ whole
1 can 1 can
1 leaf 1 leaf
1 teaspoon 1 teaspoon
½ teaspoons ½ teaspoons
1-½ pound 1-½ pound
2 cups 2 cups
2 stalks 2 stalks
2 Tablespoons 2 Table

A lot of preprocessing to do

# Preprocessing

In [140]:
def test_cleaning_function(func, lines, column, n=10):
    # Randomly shuffle lines and select first n lines
    random.shuffle(lines)
    
    # Initialize counter
    i = 0

    for line in lines:
        if column in line:
            try:
                print(line[column], ' -> ', func(line[column]))
                i += 1
            except KeyError:
                continue 
        if i >= n:
            break

## For the yield
Keep the int values, ignore drinks and recipes for more than 12 people

In [141]:
# 10 examples of recipeYield
for i,line in enumerate(lines):
    if 'recipeYield' in line:
        print(line['recipeYield'])
    if i > 10:
        break

12
12
8
Serves 1.
12
2
12
12
18
12


In [142]:
def clean_yield(field):
    if field is None:
        return None
    
    # Extract all numbers from the string
    numbers = re.findall(r'\b\d+\b', field)

    if len(numbers) == 1:
        # Single number
        num = int(numbers[0])
        return num if num < 13 else None
    elif len(numbers) == 2:
        # Two numbers - calculate the average
        num1, num2 = map(int, numbers)
        avg = (num1 + num2) / 2
        return int(avg) if avg < 13 else None
    else:
        # No numbers or pattern not matched
        return None

In [143]:
# 10 examples of cleaned recipeYield
test_cleaning_function(clean_yield, lines, 'recipeYield', 10)


8  ->  8
8  ->  8
24  ->  None
10  ->  10
Serves 4  ->  4
Serves 2  ->  2
10  ->  10
6 people  ->  6
12  ->  12
4  ->  4


## For time
transform the format into int minutes

In [144]:
def time_to_min(time_str):
    if time_str is None:
        return None
    # Extract hours and minutes using regular expression
    match = re.match(r'PT(\d+H)?(\d+M)?', time_str)
    if not match:
        return None

    hours, minutes = match.groups()

    # Convert hours and minutes to integers
    total_minutes = 0
    if hours:
        total_minutes += int(hours[:-1]) * 60  # Remove 'H' and convert to minutes
    if minutes:
        total_minutes += int(minutes[:-1])     # Remove 'M'
    
    if total_minutes == 0:
        return None
    
    return total_minutes

In [145]:
print('cleaned totalTime')
test_cleaning_function(time_to_min, lines, 'totalTime', 2)
print('cleaned prepTime')
test_cleaning_function(time_to_min, lines, 'prepTime', 2)   
print('cleaned cookTime')       
test_cleaning_function(time_to_min, lines, 'cookTime', 2)

cleaned totalTime
PT30M  ->  30
PT15M  ->  15
cleaned prepTime
PT30M  ->  30
PT10M  ->  10
cleaned cookTime
PT10M  ->  10
PT1H  ->  60


## For ingredients
This is the hard part of the cleanup.

In [146]:
# 1 random ingredient example  
ingredients = random.choice(lines)['ingredients']
print(ingredients)


1-½ cup 1-½ cup
5 whole 5 whole
2 Tablespoons 2 Tablespoons
1-½ teaspoon 1-½ teaspoon
1 teaspoon 1 teaspoon
6 whole 6 whole
6 sprigs 6 sprigs


In [147]:
def remove_leading(text):
    prefixes = ['.25oz ', '.5oz ', '.75oz ', '.5fl oz ', '/½ ', '/¼ ', 'of ', '¼oz ', '¼ oz ','½oz ', '½ oz ', 'oz ', '1oz ', '1 oz ', '2oz ', '2 oz ', '1/2 oz ', '1/2oz ',  '3oz ', '3 oz ', '4oz ', '4 oz ', '5oz ', '5 oz ', '6oz ', '6 oz ', '7oz ', '7 oz ', '8oz ', '8 oz ', '9oz ', '9 oz ', '10oz ', '10 oz ',' ','. ']
    for prefix in prefixes:
        if text.startswith(prefix):
            return text[len(prefix):]
    return text

def unicode_to_float(frac_str):
    # Convert vulgar fractions like ½ to floats like 0.5
    vulgar_fracs = {
        '½': 0.5,
        '⅓': 0.3333,
        '⅔': 0.6666,
        '¼': 0.25,
        '¾': 0.75
    }
    return vulgar_fracs.get(frac_str, frac_str)

def strfrac_to_float(frac_str):
    # Convert string fractions like '1/2' to floats like 0.5
    if '/' not in frac_str:
        return float(frac_str)
    num, denom = frac_str.split('/')
    return float(num) / float(denom)

def add_floats_match(match):
    # Convert the matched group to a float
    int_nb = int(match.group(1))
    frac_nb = unicode_to_float(match.group(2))
    if isinstance(frac_nb, float):
        return str(int_nb + frac_nb)
    else:
        return match.group(0)

def is_repeating(ingredient_str):
    pattern = re.compile(r'(\b[\S\s]+?\b)(?:\s+\1\b|\n\s*\1\b)')
    return pattern.match(ingredient_str) is not None   

def parse_ingredient(ingredient_str):

    if is_repeating(ingredient_str):
        return (None, None, None)
    
    # Check if the string is fully alphabetic
    if ingredient_str.replace(" ", "").isalpha():
        return (None, None, ingredient_str)

    # Convert leading vulgar fraction to a float
    clean_str = re.sub(r'^([½⅓⅔¼¾])\s*', lambda m: str(unicode_to_float(m.group(1))), ingredient_str)

    # Remove any leading non-numeric text
    clean_str = re.sub(r'^[^\d]+', '', clean_str)

    # Handle combined numbers like "4 ½" or "2¾"
    clean_str = re.sub(r'(\d+)\s*+(½|⅓|⅔|¼|¾|\d+/\d+)', add_floats_match, clean_str)

    # Pattern for fractional quantities like "½ tsp salt"
    pattern_fractional = r'([^\d\s]*\s*\d+/\d+|[^\d\s]+)\s*([a-zA-Z]+)\s*(.*)'
    # Pattern for quantities with units like "250g/9oz butter"
    pattern_dual_unit = r'\b(\d+\.?\d*)\s*([a-zA-Z]+)(?:\/\d+\s*[a-zA-Z]*)?\s*(.*)'

    # Try matching the fractional pattern
    match = re.match(pattern_fractional, clean_str.strip())
    if match:
        value, unit, ingredient = match.groups()
        return (strfrac_to_float(value), unit, remove_leading(ingredient))

    # Try matching the dual unit pattern
    match = re.match(pattern_dual_unit, clean_str.strip())
    if match:
        value, unit, ingredient = match.groups()
        return (strfrac_to_float(value), unit, remove_leading(ingredient))

    # Return None, None, ingredient if no pattern matches
    return (None, None, ingredient_str)

In [148]:
# 10 examples of cleaned ingredients
for i,line in enumerate(lines):
    if 'ingredients' in line:
        for ingredient in line['ingredients'].split('\n')[1:]:
            print(ingredient, ' -> ', parse_ingredient(ingredient))
    if i > 1:
        break

1 can 1 can  ->  (None, None, None)
1 can 1 can  ->  (None, None, None)
½ whole ½ whole  ->  (None, None, None)
1 jar 1 jar  ->  (None, None, None)
1 cup 1 cup  ->  (None, None, None)
¾ cups ¾ cups  ->  (None, None, None)
½ cups ½ cups  ->  (None, None, None)
½ teaspoons ½ teaspoons  ->  (None, None, None)
1 cup 1 cup  ->  (None, None, None)
1 teaspoon 1 teaspoon  ->  (None, None, None)
1 Tablespoon 1 Tablespoon  ->  (None, None, None)
3 cloves 3 cloves  ->  (None, None, None)
¼ cups ¼ cups  ->  (None, None, None)
3 Tablespoons 3 Tablespoons  ->  (None, None, None)
1 Tablespoon 1 Tablespoon  ->  (None, None, None)
½ teaspoons ½ teaspoons  ->  (None, None, None)
¼ teaspoons ¼ teaspoons  ->  (None, None, None)
1 pound 1 pound  ->  (None, None, None)
8 ounces, weight 8 ounces, weight  ->  (None, None, None)
2 sticks 2 sticks  ->  (None, None, None)
1 stick 1 stick  ->  (None, None, None)
3 cups 3 cups  ->  (None, None, None)
6 whole 6 whole  ->  (None, None, None)
1 teaspoon 1 teaspoon  -

In [149]:
#Obtained from listing all the uniques units from parsed ingredients ranked by frequency

valid_unit_list = ['g',
 'tbsp',
 'tsp',
 'cup',
 'ml',
 'teaspoon',
 'cups',
 'tablespoon',
 'Tb',
 'tablespoons',
 'Tablespoons',
 'teaspoons',
 'ounces',
 'oz',
 'sprigs',
 'slices',
 'Tablespoon',
 'clove',
 'kg',
 'sprig',
 'pound',
 'cm',
 'stick',
 'spring',
 'sticks',
 'litre',
 'can',
 'lb',
 'litres',
 'ounce',
 'pinch',
 'pounds',
 'handful',
 'lbs',
 'cans',
 'jar',
 'pint',
 'fennel',
 'bottle',
 'Tbsp',
 'grams',
 'Tbs',
 'L',
 'lbs']

def normalize_unit(unit):
    unit_mapping = {
        'tablespoon': 'tbsp',
        'Tablespoon': 'tbsp',
        'Tablespoons': 'tbsp',
        'Tb': 'tbsp',
        'Tbs': 'tbsp',
        'Tbsp': 'tbsp',
        'teaspoon': 'tsp',
        'teaspoons': 'tsp',
        'cup': 'cups',
        'milliliters': 'ml',
        'litre': 'L',
        'litres': 'L',
        'ounce': 'oz',
        'ounces': 'oz',
        'kilograms': 'kg',
        'pound': 'lb',
        'lbs': 'lb',
        'centimeters': 'cm',
        'stick': 'sticks',
        'sprig': 'sprigs',
        'can': 'cans',
        'pinch': 'pinches',
        'handful': 'handfuls',
        'jar': 'jars',
        'pint': 'pints',
    }
    return unit_mapping.get(unit, unit)

def group_ingredients(ingredient): 
    if 'butter' in ingredient:
        return 'butter'
    if 'eggs' in ingredient or 'egg' in ingredient:
        return 'eggs'
    if 'flour' in ingredient:
        return 'flour'
    if 'caster sugar' in ingredient:
        return 'sugar'
    if 'mint' in ingredient:
        return 'mint'
    if 'parsley' in ingredient:
        return 'parsley'
    
    
    return ingredient

def clean_ingredients(ingredients_str):
    """
    Returns a list of cleaned ingredients.
    """
    ingredients = ingredients_str.split('\n')[1:]
    quantities, units, names = [], [], []
    for ingredient in ingredients:
        
        quantity, unit, name = parse_ingredient(ingredient)
        if quantity is None:
            continue
        quantities.append(quantity)

        if unit not in valid_unit_list:
            name = unit + ' ' + name
            unit = None
        else:
            unit = normalize_unit(unit)   
        units.append(unit)

        name = name.strip()
        name = group_ingredients(name)
        names.append(name)

    return quantities, units, names

# 10 examples of cleaned ingredients
ings = random.choice(lines)['ingredients']
print(ings)
print(clean_ingredients(ings)) 

1 cup 1 cup
1-¼ cup 1-¼ cup
2 cups 2 cups
¾ cups ¾ cups
2 teaspoons 2 teaspoons
½ cups ½ cups
½ cups ½ cups
([], [], [])


some imperfections left, but it will be sorted out later

# Merge


In [150]:
#get he counts for most common ingredients
from collections import Counter
from itertools import chain

ingredients = list(chain.from_iterable([clean_ingredients(line['ingredients'])[2] for line in lines]))
counts = Counter(ingredients)
print(counts.most_common(200))
#200 most common ingredients in a list 
common_ingredients = [ing for ing,ct in counts.most_common(200)]

[('butter', 1644), ('eggs', 1524), ('sugar', 1205), ('flour', 1031), ('olive oil', 808), ('garlic', 622), ('salt', 579), ('double cream', 436), ('lemon', 398), ('parsley', 322), ('water', 300), ('milk', 283), ('baking powder', 250), ('onion', 244), ('vegetable oil', 198), ('icing sugar', 184), ('vanilla extract', 172), ('cinnamon', 172), ('honey', 163), ('mint', 154), ('onions', 148), ('chicken stock', 144), ('lemon juice', 132), ('fresh thyme', 130), ('white wine vinegar', 128), ('extra virgin olive oil', 122), ('lime', 121), ('shallots', 120), ('orange', 119), ('celery', 119), ('Sugar', 118), ('bay leaf', 115), ('vanilla pod', 112), ('baking soda', 112), ('Salt', 110), ('cumin', 101), ('kosher salt', 98), ('granulated sugar', 94), ('ground cumin', 94), ('cloves garlic', 93), ('thyme', 91), ('Dijon mustard', 88), ('brown sugar', 87), ('Butter', 86), ('ground cinnamon', 83), ('tomato purée', 83), ('carrots', 82), ('white wine', 82), ('Olive Oil', 81), ('carrot', 81), ('shallot', 78), (

### Translation

Since the goal of the database is to give ideas for meals, the name being in english is fine, but the ingredients should be in French to have consistency with the other db

In [156]:
LANG = 'fr' #or 'en'

import deepl

#auth_key = '9093d919-3022-b8c7-19ba-93ceff08f8d7:fx'
translator = deepl.Translator(auth_key)
translated_ingredients = []
for ingredient in common_ingredients:
    if len(ingredient) > 0:
        translated_ingredients.append(translator.translate_text(ingredient, target_lang='FR').text)
        print(ingredient, ' -> ', translated_ingredients[-1])

#To keep the credits 
 


butter  ->  beurre
eggs  ->  œufs
sugar  ->  sucre
flour  ->  farine
olive oil  ->  huile d'olive
garlic  ->  ail
salt  ->  sel
double cream  ->  crème double
lemon  ->  citron
parsley  ->  persil
water  ->  l'eau
milk  ->  lait
baking powder  ->  poudre à lever
onion  ->  oignon
vegetable oil  ->  huile végétale
icing sugar  ->  sucre glace
vanilla extract  ->  extrait de vanille
cinnamon  ->  cannelle
honey  ->  miel
mint  ->  tels que
onions  ->  oignons
chicken stock  ->  stock de poulet
lemon juice  ->  jus de citron
fresh thyme  ->  thym frais
white wine vinegar  ->  vinaigre de vin blanc
extra virgin olive oil  ->  huile d'olive extra vierge
lime  ->  chaux
shallots  ->  échalotes
orange  ->  orange
celery  ->  céleri
Sugar  ->  Sucre
bay leaf  ->  feuille de laurier
vanilla pod  ->  gousse de vanille
baking soda  ->  bicarbonate de soude
Salt  ->  Sel
cumin  ->  cumin
kosher salt  ->  sel casher
granulated sugar  ->  sucre cristallisé
ground cumin  ->  cumin moulu
cloves garlic

In [158]:
translated_ingredients = ['beurre',
 'œufs',
 'sucre',
 'farine',
 "huile d'olive",
 'ail',
 'sel',
 'crème double',
 'citron',
 'persil',
 "l'eau",
 'lait',
 'poudre à lever',
 'oignon',
 'huile végétale',
 'sucre glace',
 'extrait de vanille',
 'cannelle',
 'miel',
 'tels que',
 'oignons',
 'stock de poulet',
 'jus de citron',
 'thym frais',
 'vinaigre de vin blanc',
 "huile d'olive extra vierge",
 'chaux',
 'échalotes',
 'orange',
 'céleri',
 'Sucre',
 'feuille de laurier',
 'gousse de vanille',
 'bicarbonate de soude',
 'Sel',
 'cumin',
 'sel casher',
 'sucre cristallisé',
 'cumin moulu',
 "gousses d'ail",
 'thym',
 'Moutarde de Dijon',
 'sucre roux',
 'Beurre',
 'cannelle moulue',
 'purée de tomates',
 'carottes',
 'vin blanc',
 "Huile d'olive",
 'carotte',
 'échalote',
 'sauce soja',
 'piment rouge',
 'poudre de cacao',
 'feuilles de laurier',
 'crème épaisse',
 'amandes moulues',
 'vin blanc sec',
 'vinaigre de vin rouge',
 'chocolat noir',
 'huile de tournesol',
 'vanille',
 'citrons',
 'clous de girofle',
 'lait entier',
 'curcuma moulu',
 'vinaigre balsamique',
 'coriandre moulue',
 'vin rouge',
 'Farine tout usage',
 'oignon rouge',
 'graines de coriandre',
 "sirop d'érable",
 'gingembre moulu',
 'anis étoilé',
 'poivre rouge',
 'Parmesan',
 'brandy',
 'concombre',
 'poivre noir',
 'miel clair',
 'vinaigre de xérès',
 'fromage à la crème',
 'coriandre fraîche hachée',
 'poivre noir fraîchement moulu',
 'gélatine',
 'crème fraîche',
 'petit oignon',
 'sel de mer',
 'garam masala',
 'gros oignon',
 'bicarbonate de soude',
 'oignon moyen',
 'cayenne pepper',
 'semblait',
 'noix de muscade',
 'paprika',
 'huile de sésame',
 'raspberries',
 'bouillon de légumes',
 'romarin frais',
 "gousses d'ail, émincées",
 'piment',
 'sucre en poudre',
 'sucre roux doux',
 "sirop d'or",
 'crème à fouetter',
 'Œufs entiers',
 'grains de poivre noir',
 'Crème épaisse',
 'tomates',
 'Moutarde anglaise',
 "gousses d'ail",
 'romarin',
 'tomates cerises',
 'raisins secs',
 'raisins secs',
 'mascarpone',
 'vinaigre de cidre',
 'curcuma',
 'graines de sésame',
 'cardamome',
 'bouillon de bœuf',
 'crème fraîche',
 'asperges',
 'mayonnaise',
 'câpres',
 'graines de fenouil',
 'huile',
 'lime juice',
 'fraises',
 'Lait entier',
 'amandes effilées',
 'sauce de poisson',
 'lait entier',
 'jus de citron frais',
 'chocolat blanc',
 'sucre demerara',
 'lard',
 "gousses d'ail, émincées",
 'piment en poudre',
 'Sel Kosher',
 'poivre',
 'eau froide',
 'Extrait de vanille',
 'gingembre',
 'coriandre fraîche',
 'pinte de crème double',
 'Poudre à lever',
 'Sauce Worcestershire',
 'Sucre brun',
 'eau bouillante',
 'les citrons verts',
 'vinaigre de riz',
 'Vanille',
 'rhubarbe',
 'origan séché',
 'pignons de pin',
 'pois surgelés',
 'ciboulette fraîche hachée',
 'paprika fumé',
 'riz basmati',
 'chapelure',
 'échalotes à la banane',
 'lait de coco',
 'Farine',
 'port',
 'sucanat',
 'huile de colza',
 'Œuf entier',
 'pommes de terre nouvelles',
 'cardamome verte',
 'épices mélangées',
 'petit oignon rouge',
 'pommes de terre',
 'de thym frais haché',
 'noix',
 'noix de muscade moulue',
 'piment vert',
 'poireaux',
 'oranges',
 'groseilles',
 'poudre de curry',
 'champignons de Paris',
 'poivre vert',
 'huile de canola',
 'jus de citron vert frais',
 'bouillon de poisson',
 'Yaourt à la grecque',
 'sucre naturel',
 "jus d'orange",
 'morceau de gingembre frais',
 'épinards',
 "huile d'olive extra vierge",
 'gousses de vanille',
 'aneth frais haché',
 'Bicarbonate de soude',
 'pois frais',
 'chapelure blanche fraîche']

In [163]:
def uncommon_ingredient(ingredient, common_ingredients):
    return ingredient not in common_ingredients

def convert_to_metric(value, unit):
    conversion_factors = {
        'tbsp': {'factor': 14.7868, 'metric_unit': 'ml'},
        'tsp': {'factor': 4.92892, 'metric_unit': 'ml'},
        'cups': {'factor': 236.588, 'metric_unit': 'ml'},
        'oz': {'factor': 28.3495, 'metric_unit': 'g'},
        'lb': {'factor': 453.592, 'metric_unit': 'g'},
        'pints': {'factor': 473.176, 'metric_unit': 'ml'},
    }
    if unit in conversion_factors:
        factor = conversion_factors[unit]['factor']
        metric_unit = conversion_factors[unit]['metric_unit']
        return value * factor, metric_unit
    else:
        return value, unit

def get_difficulty_from_nb_ingredients(nb_ingredients):
    if nb_ingredients <= 5:
        return '1'
    elif nb_ingredients <= 10:
        return '2'
    else:
        return '3'   


def read_json_file(file_path):
    with open(file_path, 'r') as file:
        for line in file:
            if line.strip():  # Check if the line is not empty
                try:
                    yield json.loads(line)
                except json.JSONDecodeError as e:
                    print(f"Error decoding JSON: {e}")

#200mb into memory, its ok
cleaned_lines = []
json_file_path = 'openrecipes_mini.json'
with open(json_file_path, 'r') as file:
    data = json.load(file)
    for i, line in enumerate(data):
        if i == 0:
            continue
        id = i
        #get raw data
        raw_ingredients = line.get('ingredients')
        recipeYield = line.get('recipeYield')
        prepTime = line.get('prepTime')
        url = line.get('url')
        image = line.get('image')
        totalTime = line.get('totalTime')
        cookTime = line.get('cookTime')
        name = line.get('name') 
        source = line.get('source', None)
        
        

        #process data
        ingredient_values, ingredient_units, ingredients = clean_ingredients(raw_ingredients)
        if any(ingredient is None for ingredient in ingredients):
            continue

        difficulty = get_difficulty_from_nb_ingredients(len(ingredients))
        prepTime = time_to_min(prepTime)
        cookTime = time_to_min(cookTime)
        totalTime = time_to_min(totalTime)
        if ((prepTime is not None) and (cookTime is not None)) and totalTime is None:
            totalTime = prepTime + cookTime
        elif (prepTime is not None) and (cookTime is None):
            totalTime = prepTime
        elif (prepTime is None) and (cookTime is not None):
            totalTime = cookTime
        if source is None:
            source = 'openrecipes'
        recipeYield = clean_yield(recipeYield)

        #convert to metric system
        for i, value in enumerate(ingredient_values):
            ingredient_values[i], ingredient_units[i] = convert_to_metric(value, ingredient_units[i])
        

        #deletion checks
        if totalTime is None:
            continue
        if int(totalTime) > 180:
            continue
        if len(ingredients) > 25:
            continue
        if recipeYield is None:
            continue
        if any(uncommon_ingredient(ingredient, common_ingredients) for ingredient in ingredients):
            continue

        #translate ingredients to french if LANG == 'fr'
        assert LANG in ['fr', 'en']
        if LANG == 'fr':
            #make the french translation by mapping the id between ingredients and translated_ingredients
            ingredients = [translated_ingredients[common_ingredients.index(ingredient)] for ingredient in ingredients]


        cleaned_lines.append({
            'id': id,
            'name': name,
            'source': source,
            'url': url,
            'image': image,
            'servings': recipeYield,
            'time': totalTime,
            'ingredients': ingredients,
            'ingredient_values': ingredient_values,
            'ingredient_units': ingredient_units
        })

#write to json
with open('cleaned_openrecipes_mini.json', 'w', encoding='utf-8') as f:
    json.dump(cleaned_lines, f, ensure_ascii=False, indent=2)

In [162]:
#number of lines in raw data
print(len(lines))
#number of lines in cleaned data
print(len(cleaned_lines)) 
print(len(cleaned_lines)/len(lines)*100, '% of recipes kept') 
#example of cleaned data
print('example of cleaned data :')
print(cleaned_lines[0])


7398
2820
38.118410381184106 % of recipes kept
example of cleaned data :
{'id': 32, 'name': 'White Russian Cocktail', 'source': 'delishhh', 'url': 'http://delishhh.com/2013/02/14/white-russian-cocktail/', 'image': 'http://delishhh.com/wp-content/uploads/2013/02/IMG_5437_small_txt.jpg', 'servings': 1, 'time': 5, 'ingredients': [], 'ingredient_values': [], 'ingredient_units': []}
