In [12]:
import json
import pandas as pd

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

import re
from num2words import num2words

In [17]:
def nutritional(nutri_list: list) -> pd.DataFrame:
    # find index for 'Energy (kJ)'
    keywords = nutri_list[nutri_list.index("Energy (kJ)")::2]
    # removing units from keyword string from Energy
    keyword_updated = [
        k if 'Energy' not in k else k.split(" ")[0] for k in keywords
    ]
    # removing 'of which '
    keyword_updated = [
        k.split(" ")[-1] if len(k.split(" ")) else k for k in keyword_updated
    ]
    # take nutritional values
    values = nutri_list[nutri_list.index("Energy (kJ)") + 1::2]

    nutri = {}
    for a, b in zip(keyword_updated, [val.split(" ") for val in values]):
        if b[0] == 'null':
            nutri[a + f" ({b[1]})"] = float('0')
        else:
            nutri[a + f" ({b[1]})"] = float(b[0])

    return pd.DataFrame.from_dict(nutri, orient='index').T


def manipulate(s: str) -> str:
    tmp = s.split(" ")
    new_s = []
    for p in tmp:
        if p == '½':
            new_s.append('0.5')
        elif p in stop_words:
            pass
        elif p.isnumeric() and tmp.index(p) > 0:
            new_s.append(num2words(p))
        else:
            new_s.append(p)
    if 'Water' in new_s:
        return " ".join(new_s[:new_s.index('Water') + 1])
    else:
        return " ".join(new_s)


def ingredients(l: list) -> list:
    tmp_list = [manipulate(item) for item in l]

    return [x.split(" ", 2)[-1] for x in tmp_list]


def process_sentence(s: str) -> str:
    """This function takes a sentence as input and performs the following steps:
    1) create tokens using the word_tokenize function from nltk,
    2) identify patterns like '6-8' and replace by the higher number
    3) filter out special characters,
    4) convert numbers to words,
    5) filter out stop-words."""

    tokens = word_tokenize(s)
    tokens = [
        re.findall("\d-\d", token)[0].split("-")[-1]
        if len(re.findall("\d-\d", token)) != 0 else token for token in tokens
    ]
    words = [word for word in tokens if word.isalnum()]
    words = [
        word if not word.isnumeric() else num2words(word) for word in words
    ]
    clean_sent = " ".join(
        [word.lower() for word in words if not word in stop_words])
    return clean_sent


def instructions(s: str) -> str:
    match = re.findall('\d+C', s)
    if len(match) != 0:
        s = s.replace(match[0], "".join(match[0].split("C") + ["°C"]))
    convert_dict = {
        "°C": " celsius",
        "½": "0.5",
        "cm": " centimeter",
        "mins": "minutes",
        "tsp": "tablespoon"
    }

    for key in convert_dict.keys():
        s = s.replace(key, convert_dict[key])
    return process_sentence(s)

In [18]:
def preprocess(orig_dict: dict) -> dict:
    d = orig_dict.copy()
    del d['difficulty']
    del d['description']

    d['prep_time'] = int(d['prep_time'].split(" ")[0])

    # Nutritional info
    # keywords start from position 3, every second entry
    d['nutritional'] = nutritional(d['nutritional'])

    # Ingredients
    d['ingredients'] = ingredients(d['ingredients'])

    # Instructions
    d['instructions'] = instructions(d['instructions'])

    return d


def correct_instr(x):
    final_recip = " ".join(
        [string for string in x.split() if len(string) != 1])
    return final_recip

In [19]:
# JSON file
f = open("hello_fresh_recipes.json", "r")
  
# Reading from file
data = json.loads(f.read())

In [20]:
final_recipes = pd.DataFrame()
for idx, dish in enumerate(data.keys()):
    tmp = data[dish].copy()
    tmp_processed = preprocess(tmp)
    recipe = tmp_processed['nutritional'].copy()
    recipe['ingredients'] = [tmp_processed['ingredients']]
    recipe['prep_time'] = tmp_processed['prep_time']
    recipe['instructions'] = tmp_processed['instructions']
    final_recipes = pd.concat([final_recipes, recipe])
final_recipes = final_recipes.reset_index(drop=True)

In [21]:
final_recipes['instructions'] = final_recipes['instructions'].apply(lambda x: correct_instr(x))

In [26]:
final_recipes.head()

Unnamed: 0,Energy (kJ),Energy (kcal),Fat (g),saturates (g),Carbohydrate (g),sugars (g),Fiber (g),Protein (g),Cholesterol (mg),Salt (g),ingredients,prep_time,instructions
0,2702.0,646.0,19.58,7.73,84.78,16.94,3.18,34.15,0.0,2.03,"[Basmati Rice, Green Beans, Bell Pepper, Lime,...",30,pour water rice see ingredients amount saucepa...
1,2475.0,592.0,13.0,3.0,81.0,15.0,1.0,41.0,0.0,2.45,"[Basmati Rice, Bok Choy, Salted Peanuts, Coria...",20,cook rice bring large saucepan water boil tabl...
2,2431.0,581.0,13.0,6.63,92.57,13.75,0.34,22.43,0.0,2.82,"[Diced Butternut Squash, Onion, Flat Leaf Pars...",40,preheat oven two hundred celsius put diced but...
3,2289.0,547.0,18.0,10.0,68.0,6.0,0.0,24.0,0.0,2.19,"[Echalion Shallot, Garlic Clove, Chestnut Mush...",30,put large saucepan water tablespoon salt boil ...
4,2527.0,604.0,17.0,7.0,82.0,12.0,0.0,30.0,0.0,2.41,"[Onion, Leek, Garlic Clove, Kale, Bacon Lardon...",30,preheat oven two hundred celsius halve peel ch...


In [307]:
final_recipes.to_excel("Recipes.xlsx")