In [1]:
import pandas as pd
import numpy as np

from pathlib import Path
import re
import json

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

from num2words import num2words

In [2]:
current_folder = Path().resolve()

with open(current_folder / "scraped_recipes.json", "r") as inputfile:
    raw_data = json.loads(inputfile.read()) 

In [3]:
def nutritional(nutri_list: list) -> pd.DataFrame: 
    """Create DataFrame with nutritional information"""
    
    # find index for 'Energy (kJ)'
    keywords = nutri_list[nutri_list.index("Energy (kJ)")::2]    
    # removing units from keyword string from Energy and correcting strings
    keywords_updated = [k.replace(" of which", "") if 'Energy' not in k else k.split(" ")[0] for k in keywords]
    # take nutritional values
    values = nutri_list[nutri_list.index("Energy (kJ)") + 1::2]
    
    nutri = {}
    for measure, unit in zip(keywords_updated, [val.split(" ") for val in values]):
        nutri[measure.lower() + f" ({unit[1]})"] = float(unit[0])

    return pd.DataFrame.from_dict(nutri, orient = 'index').T 

def process_sentence(sent: str) -> str:
    """Process and clean given sentence.
    
    The following steps are taken:
    1) create tokens using the word_tokenize function from nltk,
    2) identify patterns like '6-8' and replace by the higher number
    3) filter out special characters,
    4) convert numbers to words,
    5) filter out stop-words."""
    
    tokens = word_tokenize(sent) # 1)

    tokens = [re.findall("\d-\d", token)[0].split("-")[-1] 
              if len(re.findall("\d-\d", token)) != 0 
              else token 
              for token in tokens]  # 2)
    
    words = [word 
             for word in tokens 
             if word.isalnum()] # 3)
    
    words = [word 
             if not word.isnumeric() 
             else num2words(word) 
             for word in words] # 4)
    
    clean_sent = " ".join([word.lower() 
                           for word in words 
                           if not word in stop_words]) # 5)
    
    return clean_sent 

def instructions(sent: str) -> str:
    """Process and clean given sentence"""
    
    match = re.findall('\d+C', sent)
    if len(match) != 0:
        sent = sent.replace(match[0], "".join(match[0].split("C") + ["°C"]))
        
    convert_dict = {"°C":" celsius", 
                    "½":"0.5", 
                    "cm": " centimeter", 
                    "mins": "minutes", 
                    "tsp":"tablespoon"}
    
    for key in convert_dict.keys():
        sent = sent.replace(key, convert_dict[key])
    sent = " ".join(["it", sent]) # prevent tokenizer to fail by adding stop word
    
    processed_sent = process_sentence(sent)

    to_remove = re.findall("\s\w\s", processed_sent) # remove single letters if still existing
    for char in set(to_remove):
        processed_sent.replace(char, " ") 
    
    return processed_sent

In [4]:
recipes_collection_df = pd.DataFrame()
for recipe in raw_data.keys():
    recipe_dict = raw_data.get(recipe)

    ingred = ['Water' if 'Water' in item else item for item in recipe_dict['ingredients'][1::2]]
    prep_time = pd.DataFrame({recipe_dict['time'][0].lower().replace(" ","_"): re.findall("\d+", recipe_dict['time'][1])})
    nutritions = nutritional(recipe_dict['nutritional'])

    recipe_df = pd.concat([nutritions, 
                           prep_time], 
                          axis=1)
    
    recipe_df['ingredients'] = [list(set(ingred))]

    recipe_df['instructions'] = instructions(recipe_dict['instructions'])

    recipes_collection_df = pd.concat([recipes_collection_df, 
                                       recipe_df], 
                                      axis=0)

In [5]:
recipes_collection_df.head()

Unnamed: 0,energy (kJ),energy (kcal),fat (g),of which saturates (g),carbohydrate (g),of which sugars (g),protein (g),salt (g),preparation_time,ingredients,instructions
0,4609.0,1102.0,70.0,36.0,75.0,17.0,43.0,2.73,30,"[Chicken Stock Paste, Tomato Puree, Mature Che...",these cheesy beef empanada pockets crowd pleas...
0,3383.0,809.0,38.0,16.0,76.0,5.0,52.0,1.6,20,"[Chicken Stock Paste, Panko Breadcrumbs, Slice...",looking quick tasty midweek dinner option try ...
0,2978.0,712.0,34.0,19.0,60.0,17.0,38.0,4.15,30,"[Fresh Chilli Jam, Flat Leaf Parsley, Sun-Drie...",our chilli jam glazed halloumi delicious veggi...
0,1906.0,456.0,21.0,4.0,43.0,4.0,23.0,0.7,30,"[Flat Leaf Parsley, Black Olives, Potatoes, Gr...",oh salsa verde cheeky little green sauce makes...
0,3437.0,821.0,37.0,19.0,82.0,17.0,40.0,4.21,40,"[Aubergine, Onion Marmalade, Potatoes, Halloum...",our halloumi aubergine burger delicious veggie...


In [6]:
recipes_collection_df.to_csv("recipe_collection.csv")