In [1]:
import pandas as pd
import numpy as np

from pathlib import Path
import re
import json

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

from num2words import num2words

In [2]:
current_folder = Path().resolve()

with open(current_folder / "data" / "scraped_recipes.json", "r") as inputfile:
    raw_data = json.loads(inputfile.read()) 

In [3]:
def nutritional(nutri_list: list) -> pd.DataFrame: 
    """Create DataFrame with nutritional information"""
    
    # find index for 'Energy (kJ)'
    keywords = nutri_list[nutri_list.index("Energy (kJ)")::2]    
    # removing units from keyword string from Energy and correcting strings
    keywords_updated = [k.replace("of which ", "") if 'Energy' not in k else k.split(" ")[0] for k in keywords]
    # take nutritional values
    values = nutri_list[nutri_list.index("Energy (kJ)") + 1::2]
    
    nutri = {}
    for measure, unit in zip(keywords_updated, [val.split(" ") for val in values]):
        nutri[measure.lower() + f" ({unit[1]})"] = float(unit[0])

    return pd.DataFrame.from_dict(nutri, orient = 'index').T 

def process_sentence(sent: str) -> str:
    """Process and clean given sentence"""
    
    # 1) create tokens using the word_tokenize function from nltk
    tokens = word_tokenize(sent) 
    
    # 2) identify patterns like '6-8' (minutes) and replace by the higher number
    tokens = [re.findall("\d-\d", token)[0].split("-")[-1] 
              if len(re.findall("\d-\d", token)) != 0 
              else token 
              for token in tokens]  
    
    # 3) filter out special characters
    words = [word 
             for word in tokens 
             if word.isalnum()] 
    
    # 4) convert numbers to words
    words = [word 
             if not word.isnumeric() 
             else num2words(word) 
             for word in words] 
    
    # 5) filter out stop-words
    clean_sent = " ".join([word.lower() 
                           for word in words 
                           if not word in stop_words]) 
    
    return clean_sent 

def instructions(sent: str) -> str:
    """Process and clean recipe instructions"""
    
    # Replace and change certain characters and acronyms
    match = re.findall('\d+C', sent)
    if len(match) != 0:
        sent = sent.replace(match[0], "".join(match[0].split("C") + ["°C"]))
        
    convert_dict = {"°C":" celsius", 
                    "½":"0.5", 
                    "cm": " centimeter", 
                    "mins": "minutes", 
                    "tsp":"tablespoon"}
    
    for key in convert_dict.keys():
        sent = sent.replace(key, convert_dict[key])
    sent = " ".join(["it", sent]) # prevent tokenizer to fail by adding stop word
    
    processed_sent = process_sentence(sent)

    to_remove = re.findall("\s\w\s", processed_sent) # remove single letters if still existing
    for char in set(to_remove):
        processed_sent.replace(char, " ") 
    
    return processed_sent

In [4]:
# Process all raw recipes data 
recipes_collection_df = pd.DataFrame()
for recipe in raw_data.keys():
    if recipe == 'christmas-cheese-platter':
        continue
    recipe_dict = raw_data.get(recipe)
    
    ingred = ['Water' if 'Water' in item else item for item in recipe_dict['ingredients'][1::2]]
    # Extract preparation time
    prep_time = pd.DataFrame({recipe_dict['time'][0].lower().replace(" ","_"): re.findall("\d+", recipe_dict['time'][1])})
    # Create dataframe with nutritional information
    nutritions = nutritional(recipe_dict['nutritional'])
    recipe_df = pd.concat([nutritions, 
                           prep_time], axis=1)
    
    # Add ingredients as list
    recipe_df['ingredients'] = [list(set(ingred))]
    # Add processed instructions
    recipe_df['instructions'] = instructions(recipe_dict['instructions'])
    
    recipe_df['title'] = recipe

    recipes_collection_df = pd.concat([recipes_collection_df, 
                                       recipe_df], 
                                      axis=0)

In [5]:
recipes_collection_df.head()

Unnamed: 0,energy (kJ),energy (kcal),fat (g),saturates (g),carbohydrate (g),sugars (g),protein (g),salt (g),preparation_time,ingredients,instructions,title
0,151.0,36.0,0.5,0.1,3.0,2.0,3.0,0.0,0,[All the Greens Veg Sides],eat greens this vibrant mix traditional sliced...,all-the-greens-veg-sides
0,3663.0,875.0,41.0,14.0,81.0,31.0,46.0,2.02,40,"[Baby Spinach, Apple and Sage Jelly, Confit Du...",this apple glazed duck confit bursting full lu...,apple-glazed-duck-confit
0,3004.0,718.0,26.0,9.0,85.0,19.0,33.0,2.25,20,"[Beef Mince, Basmati Rice, Garlic Clove, Mango...",looking quick tasty midweek dinner option try ...,aromatic-beef-pilaf
0,1365.0,326.0,10.0,2.0,49.0,3.0,10.0,1.06,10,[Handcrafted Garlic Bread],our garlic bread adds authentic italian touch ...,artisan-garlic-bread
0,3392.0,811.0,26.0,14.0,102.0,25.0,39.0,4.02,35,"[Garlic Clove, Water, Aubergine, Mozzarella, R...",preheat oven two hundred celsius trim aubergin...,aubergine-parmigiana-style-pasta


In [6]:
recipes_collection_df.to_csv(current_folder / "data" / "recipe_collection.csv")