In [1]:
import json
import pandas as pd

from typing import Tuple
from collections import namedtuple
from pathlib import Path

In [2]:
# Absolute, symlink-resolved working directory; all data paths below are built from it.
current_folder = Path.cwd().resolve()

In [3]:
def manipulate_raw_data(single_recipe: list) -> Tuple[str, str, list, list, str]:
    """Pull the structured fields out of the raw text lines of one recipe.

    Every line is stripped, and only the lines strictly between the
    ``topBanner`` line and the ``You might also like...`` line are examined.
    Section starts are located by searching each line that does not contain
    ``Tag`` for a set of known heading texts; the individual fields are then
    sliced out by position relative to those headings.

    Args:
        single_recipe: Lines of one raw recipe (e.g. the result of
            ``readlines()``).

    Returns:
        A ``recipe_info`` named tuple with the fields ``title``,
        ``prep_time``, ``ingredients``, ``nutritional`` and ``instructions``.

    Raises:
        ValueError: If one of the two boundary lines is missing.
        IndexError: If fewer section headings are found than the slicing
            below needs, or the first heading line has no text after "Time".
    """
    section_markers = (
        "Preparation Time",
        "Ingredients",
        "Nutritional information",
        "Boxes and ingredients",
        "Instructions",
    )

    stripped = [raw_line.strip() for raw_line in single_recipe]

    # Keep only the part between the top banner and the trailing suggestions.
    first_row = stripped.index('topBanner') + 1
    last_row = stripped.index("You might also like...")
    body = stripped[first_row:last_row]

    # Row numbers of the section headings. Lines containing "Tag" are ignored,
    # and a row is recorded once for every marker it contains.
    marker_rows = [
        row
        for row, text in enumerate(body)
        if "Tag" not in text
        for marker in section_markers
        if marker in text
    ]

    # The title is taken from two fixed rows of the body.
    title = " ".join(body[5:7])

    # Whatever follows "Time" on the first heading row is the preparation time.
    prep_time = body[marker_rows[0]].split("Time")[1]

    # Ingredient rows start three rows below the second heading and run up to
    # the third one. An entry is built for each row that begins with a numeric
    # character, by joining that row with the row two positions further down.
    ingredient_rows = body[marker_rows[1] + 3:marker_rows[2]]
    ingredients = []
    for pos, entry in enumerate(ingredient_rows):
        if entry and entry[0].isnumeric():
            ingredients.append(" ".join(ingredient_rows[pos:pos + 3:2]))

    # Nutritional rows span from the third heading up to the fourth.
    nutritients = body[marker_rows[2]:marker_rows[3]]

    # Instructions are everything from the fifth heading onwards, without
    # purely numeric rows and without the "PDF" / "Instructions" rows.
    dropped_rows = ("PDF", "Instructions")
    instructions = " ".join(
        text
        for text in body[marker_rows[4]:]
        if not text.isnumeric() and text not in dropped_rows
    )

    recipe_info = namedtuple(
        'recipe_info',
        ['title', 'prep_time', 'ingredients', 'nutritional', 'instructions'],
    )
    return recipe_info(title, prep_time, ingredients, nutritients, instructions)

In [4]:
# Maps recipe title -> extracted recipe fields; filled by the processing loop.
recipes = dict()

# Folder holding the raw recipe text files, located next to the notebook's parent folder.
path_folder = current_folder.parents[0].joinpath('data', 'predict_cooking_time', 'recipes_raw')

# Collect every regular file below the raw folder (recursively); directories are left out.
path_glob_generator = path_folder.glob('**/*')
files = list(filter(Path.is_file, path_glob_generator))

In [10]:
# Discover the raw recipe files named R<number>.txt directly and process them
# in numeric order. Deriving the names from a count of all files in the tree
# breaks as soon as the folder contains a stray file, a file in a subfolder,
# or a gap in the numbering.
recipe_paths = sorted(
    (
        candidate
        for candidate in path_folder.glob('R*.txt')
        if candidate.is_file() and candidate.stem[1:].isdecimal()
    ),
    key=lambda candidate: int(candidate.stem[1:]),
)

for recipe_path in recipe_paths:
    # Separate names for the path and the open handle (no rebinding of one variable).
    with open(recipe_path, encoding='utf-8') as raw_file:
        recipe_raw = raw_file.readlines()

    extracted_recip_info = manipulate_raw_data(recipe_raw)

    recipe = {
        'prep_time': extracted_recip_info.prep_time,
        'ingredients': extracted_recip_info.ingredients,
        'nutritional': extracted_recip_info.nutritional,
        'instructions': extracted_recip_info.instructions,
    }

    # Keyed by title: a later recipe with the same title replaces the earlier one.
    recipes[extracted_recip_info.title] = recipe

In [11]:
# Serialize the collected recipes to pretty-printed JSON text with sorted keys
json_object = json.dumps(recipes, indent=4, sort_keys=True)

output_path = current_folder.parents[0] / 'data' / 'predict_cooking_time' / 'processed_data' / 'recipes.json'

# Make sure the target folder exists so that open() does not fail on a fresh checkout
output_path.parent.mkdir(parents=True, exist_ok=True)

# Writing to recipes.json
with open(output_path, "w", encoding = 'utf-8') as outfile:
    outfile.write(json_object)

In [12]:
# Load the written JSON file back into memory.
# The context manager closes the handle once parsing is done, and the encoding
# is the same one the file was written with.
with open(output_path, encoding='utf-8') as f:
    data = json.load(f)