In [5]:
import json
import os
import time
from datetime import datetime
from functools import wraps
from typing import List, Dict

import requests
from bs4 import BeautifulSoup

allowed_categories = ["vorspeise", "hauptgang", "dessert", "fruehstueck", "snacks", "brote", "getraenke"]
report = []

def timeit(func):
    """
    Function, to get the runtime from different functions
    :param func:
    :return: runtime
    """

    @wraps(func)
    def timeit_wrapper(*args, **kwargs):
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        end_time = time.perf_counter()
        total_time = end_time - start_time
        print(f'Function {func.__name__}{args} {kwargs} Took {total_time:.4f} seconds')
        return result

    return timeit_wrapper


@timeit
def get_page_content(link: str) -> str:
    """
    Get a Parsed homepage as HTML str.
    :param link:
    :return: parsed_homepage
    """
    homepage = requests.get(link)
    if not homepage.status_code == 200:
        report.append(f"Can't reach {link} with status code 200.")
        return False
    elif homepage.status_code == 200:
        parsed_homepage = BeautifulSoup(homepage.text, 'html.parser')
        return parsed_homepage

corrupt_page = get_page_content("https://shortcutapp.io/n/YWI5NWY5MjhhNzUzNTIwYTExNGE5YTdkN2U4ZTBhOGJi")
main_page = get_page_content('https://storage.googleapis.com/www.selinaschoice.ch/index.html')
recipe = get_page_content("https://shortcutapp.io/n/MGEzNzRmYzQ1MjkzYTlhMjY0YTE2NjE4MjgwMGEzOTY5")

Function get_page_content('https://shortcutapp.io/n/YWI5NWY5MjhhNzUzNTIwYTExNGE5YTdkN2U4ZTBhOGJi',) {} Took 2.3272 seconds
Function get_page_content('https://storage.googleapis.com/www.selinaschoice.ch/index.html',) {} Took 2.2206 seconds
Function get_page_content('https://shortcutapp.io/n/MGEzNzRmYzQ1MjkzYTlhMjY0YTE2NjE4MjgwMGEzOTY5',) {} Took 2.3200 seconds


In [6]:
@timeit
def get_links_from_category(category: List) -> List:
    """
    Get a List recipes acording to the categories
    :param category:
    :return: recipes
    """
    recipes = []
    recipes_by_category = main_page.find_all("div", class_=category)  # a -> div
    for tag in recipes_by_category:
        recipe = tag.a.get('href', None)
        recipes.append(recipe)
    return recipes


get_links_from_category(["vorspeise", "hauptgang"])


@timeit
def get_links_to_scrape(categories=None) -> List:
    """
    Get all or some Recipes by categories. Default is None and will give you all recipies links.
    :param categories: None, str oder List
    :return: recipes: List
    """
    if categories is None:
        recipes = get_links_from_category(allowed_categories)
    else:
        recipes = get_links_from_category(categories)
    return recipes


tes = get_links_to_scrape()
len(tes)

Function get_links_from_category(['vorspeise', 'hauptgang'],) {} Took 0.0013 seconds
Function get_links_from_category(['vorspeise', 'hauptgang', 'dessert', 'fruehstueck', 'snacks', 'brote', 'getraenke'],) {} Took 0.0017 seconds
Function get_links_to_scrape() {} Took 0.0017 seconds


69

# All Link Functions that are now working with a str input

In [7]:
def get_timestamp(recipe: str) -> str:
    """
    Get Timestamp from when the recipe is created.
    :param link: str
    :return time: str
    """
    recipe.find_all('time')
    datetime_str = recipe.find('time')["datetime"]
    datetime_object = datetime.strptime(datetime_str, "%Y-%m-%dT%H:%M:%SZ")
    return (str(datetime_object))


get_timestamp(recipe)

def get_recipe_title(recipe: str) -> str:
    """
    Get the Title from a recipe with a recipe link.
    :param recipe: str
    :return: title: str
    """
    title = recipe.h1.text
    return title


get_recipe_title(recipe)



def get_prep_time(recipe: str) -> str:
    """
    Get the preparation time from a recipe.
    :param recipe: str
    :return prep_time: str
    """
    try:
        prep_time = recipe.find('time', itemprop="prepTime").text
    except AttributeError as e:
        print(f"Problem with {e}")
        prep_time = recipe.find('time', itemprop="performTime").text
    return prep_time


get_prep_time(recipe)



def get_serves(recipe: str) -> str:
    """
    Get serves from recipe.
    :param link: str
    :return serves: str
    """
    serves = recipe.find_all('li')[1].text
    return serves[8:]  # without index returns "serves: 4 persons"


get_serves(recipe)



def get_difficulty(recipe: str) -> str:
    """
    Get difficulty from recipe.
    :param link: str
    :return difficulty: str
    """
    difficulty = recipe.find_all('li')[2].text
    return difficulty[12:]  # without index returns "difficulty: easy emoji"


get_difficulty(recipe)


def get_ingredients(recipe: str) -> List[Dict]:
    """
    get all ingredients from a recipe.
    :param recipe: str
    :return ingredientArray: list
    """
    ingredients = recipe.find("section", class_="ingredients")
    ingredientArray = []
    for li in ingredients.find_all('li'):
        quantity = li.find('div', class_='quantity').text
        ingredient = li.find('div', class_='name').text
        # print(quantity)
        # print(ingredient)
        ingredientValues = {
            "quantity": quantity,
            "name": ingredient,
        }
        ingredientArray.append(ingredientValues)
    return ingredientArray


get_ingredients(recipe)


def get_instructions(recipe: str) -> List[Dict]:
    """
    Get the instructions from a recipe.
    :param recipe: str
    :return instructionArray: list
    """
    instruction_array = []
    instructions = recipe.find('section', class_='instructions')
    for li in instructions.find_all('li'):
        stepTitle = li.find('div', class_='details').h3.text
        stepInfo = li.find('div', class_='text').text
        # print(stepTitle)
        # print(stepInfo)
        instructionValues = {
            "step": stepTitle,
            "instruction": stepInfo,
        }
        instruction_array.append(instructionValues)
    return instruction_array


get_instructions(recipe)

[{'step': 'Step 1',
  'instruction': 'Hähnchen und das gewünschte Gemüse, sowie Zwiebeln & Knoblauch in Stücke schneiden.'},
 {'step': 'Step 2',
  'instruction': 'In einer Bratpfanne den Butter schmelzen und die Hähnchenstücke anbraten. Nicht ganz durchbraten und dann zur Seite legen.'},
 {'step': 'Step 3',
  'instruction': 'Zwiebeln, Knoblauch und das Gemüse anbraten und anschliessend die Kokosmilch und passierte Tomaten hinzugeben.'},
 {'step': 'Step 4',
  'instruction': 'Currypulver, Salz und Pfeffer hinzugeben und das ganze ca. 10min köcheln lassen (je nach Gemüse). Anschliessend die Hähnchenstückebeigeben und nochmals ca. 5min köcheln lassen.'},
 {'step': 'Step 5',
  'instruction': 'Die Sauce abschmecken und anschliessend mit Reis oder Naan-Brote servieren und geniessen.'}]

# Functions that need the link to work with

In [8]:
@timeit
def get_category_from_recipe(link: str) -> str:
    """
    Get the one of allowed classes based on the recipe link.
    :param link: str
    :return recipe_class: str
    """
    specific_recipe = main_page.find("a", href=link).parent
    print(specific_recipe)
    specific_recipe_class = specific_recipe["class"][1:]
    recipe_class = "".join(specific_recipe_class)
    if recipe_class in allowed_categories:
        return recipe_class
    else:
        print(f"The class from recipe: {link} is {recipe_class} and not in {allowed_categories}.")


get_category_from_recipe('https://shortcutapp.io/n/MGEzNzRmYzQ1MjkzYTlhMjY0YTE2NjE4MjgwMGEzOTY5')


@timeit
def get_recipe_img_path(link: str) -> str:
    """
    Get the img path from a recipe with the recipe link.
    :param link: str
    :return img_path: str
    """
    main_page = get_page_content('https://storage.googleapis.com/www.selinaschoice.ch/index.html#hauptgang')
    specific_recipe = main_page.find("a", href=link)
    img = specific_recipe.find("img")
    img_path = img.get("src")
    return img_path


get_recipe_img_path("https://shortcutapp.io/n/MGEzNzRmYzQ1MjkzYTlhMjY0YTE2NjE4MjgwMGEzOTY5")

<div class="rezept hauptgang">
<a href="https://shortcutapp.io/n/MGEzNzRmYzQ1MjkzYTlhMjY0YTE2NjE4MjgwMGEzOTY5">
<img alt="Rezeptname" src="img\hauptgang\indisches_butterchickencurry.jpg"/>
<h3 class="rezeptname">Butterchickencurry</h3>
</a>
</div>
Function get_category_from_recipe('https://shortcutapp.io/n/MGEzNzRmYzQ1MjkzYTlhMjY0YTE2NjE4MjgwMGEzOTY5',) {} Took 0.0005 seconds
Function get_page_content('https://storage.googleapis.com/www.selinaschoice.ch/index.html#hauptgang',) {} Took 2.2005 seconds
Function get_recipe_img_path('https://shortcutapp.io/n/MGEzNzRmYzQ1MjkzYTlhMjY0YTE2NjE4MjgwMGEzOTY5',) {} Took 2.2008 seconds


'img\\hauptgang\\indisches_butterchickencurry.jpg'

# Validate Recipe

In [9]:
def validate_recipe(recipe: str):
    rapport = []
    if not get_page_content:
        rapport.append(recipe)
        return False
    else:
        return True

recipe = get_page_content("https://shortcutapp.io/n/YWI5NWY5MjhhNzUzNTIwYTExNGE5YTdkN2U4ZTBhOGJi")
validate_recipe(recipe)
print(report)

Function get_page_content('https://shortcutapp.io/n/YWI5NWY5MjhhNzUzNTIwYTExNGE5YTdkN2U4ZTBhOGJi',) {} Took 2.2759 seconds
["Can't reach https://shortcutapp.io/n/YWI5NWY5MjhhNzUzNTIwYTExNGE5YTdkN2U4ZTBhOGJi with status code 200.", "Can't reach https://shortcutapp.io/n/YWI5NWY5MjhhNzUzNTIwYTExNGE5YTdkN2U4ZTBhOGJi with status code 200."]


# Parse and Backup Recipe

In [10]:
@timeit
def get_parsed_recipe(link: str) -> Dict:
    if not get_page_content(link):
        print(f"{link} is corrupt")
    else:
        recipe = get_page_content(link)
        title = get_recipe_title(recipe)
        recipe_class = get_category_from_recipe(link)
        time_stamp = get_timestamp(recipe)
        img = get_recipe_img_path(link)
        prep_time = get_prep_time(recipe)
        serves = get_serves(recipe)
        difficulty = get_difficulty(recipe)
        ingredients = get_ingredients(recipe)
        instructions = get_instructions(recipe)

        recipe_dict = {
            "title": title,
            "recipe_class": recipe_class,
            "time": time_stamp,
            "img_path": img,
            "prep_time": prep_time,
            "serves": serves,
            "difficulty": difficulty,
            "ingredients": ingredients,
            "instructions": instructions,
        }
        return recipe_dict

In [11]:
get_parsed_recipe("https://shortcutapp.io/n/ZTg2YzQ0NjM4Y2VkMjFmOGZiNzM3NTY5YTU3NmE5MDYx")

Function get_page_content('https://shortcutapp.io/n/ZTg2YzQ0NjM4Y2VkMjFmOGZiNzM3NTY5YTU3NmE5MDYx',) {} Took 2.3025 seconds
Function get_page_content('https://shortcutapp.io/n/ZTg2YzQ0NjM4Y2VkMjFmOGZiNzM3NTY5YTU3NmE5MDYx',) {} Took 2.2931 seconds
<div class="rezept dessert">
<a href="https://shortcutapp.io/n/ZTg2YzQ0NjM4Y2VkMjFmOGZiNzM3NTY5YTU3NmE5MDYx">
<img alt="Rezeptname" src="img\dessert\schokokuchen.jpg"/>
<h3 class="rezeptname">Schokoladenkuchen</h3>
</a>
</div>
Function get_category_from_recipe('https://shortcutapp.io/n/ZTg2YzQ0NjM4Y2VkMjFmOGZiNzM3NTY5YTU3NmE5MDYx',) {} Took 0.0004 seconds
Function get_page_content('https://storage.googleapis.com/www.selinaschoice.ch/index.html#hauptgang',) {} Took 2.1676 seconds
Function get_recipe_img_path('https://shortcutapp.io/n/ZTg2YzQ0NjM4Y2VkMjFmOGZiNzM3NTY5YTU3NmE5MDYx',) {} Took 2.1681 seconds
Problem with 'NoneType' object has no attribute 'text'
Function get_parsed_recipe('https://shortcutapp.io/n/ZTg2YzQ0NjM4Y2VkMjFmOGZiNzM3NTY5YTU3

{'title': 'Schokoladekuchen',
 'recipe_class': 'dessert',
 'time': '2021-10-18 08:59:55',
 'img_path': 'img\\dessert\\schokokuchen.jpg',
 'prep_time': '20 minutes',
 'serves': '8 persons',
 'difficulty': ' weich',
 'ingredients': [{'quantity': '150 g', 'name': 'Butter, weich'},
  {'quantity': '300 g', 'name': 'Zucker'},
  {'quantity': '1 Packung', 'name': 'Vanillezucker'},
  {'quantity': '1 Prise', 'name': 'Salz'},
  {'quantity': '3', 'name': 'Eier'},
  {'quantity': '300 g', 'name': 'Mehl'},
  {'quantity': '½ Packung', 'name': 'Backpulver'},
  {'quantity': '100 g', 'name': 'Schokoladepulver'},
  {'quantity': '75 g', 'name': 'gemahlene Haselnüsse'},
  {'quantity': '1 ½ dl', 'name': 'Milch'},
  {'quantity': '100 g', 'name': 'Schokostückchen'}],
 'instructions': [{'step': 'Step 1',
   'instruction': 'Ofen auf 180 Grad vorheizen.Butter in eine Schüssel geben, Zucker, Vanillezucker und Salz darunterrühren. Ein Ei nach dem andern darunterrühren, weiterrühren, bis die Masse heller ist.'},
  {

In [14]:
@timeit
def save_recipe(recipe=Dict) -> json:
    filename = recipe["title"] + ".json"
    folder = recipe["recipe_class"]
    os.chdir(f"../recipes/{folder}")
    json_object = json.dumps(recipe, indent=4, ensure_ascii=False)
    with open(filename, "w", encoding="utf-8") as outfile:
        outfile.write(json_object)
    os.chdir("../../src")

In [15]:
recipe = get_parsed_recipe("https://shortcutapp.io/n/YWI5NWY5MjhhNzUzNTIwYTExNGE5YTdkN2U4ZTBhOGJi")
print(recipe)
save_recipe(recipe)

Function get_page_content('https://shortcutapp.io/n/YWI5NWY5MjhhNzUzNTIwYTExNGE5YTdkN2U4ZTBhOGJi',) {} Took 2.2807 seconds
https://shortcutapp.io/n/YWI5NWY5MjhhNzUzNTIwYTExNGE5YTdkN2U4ZTBhOGJi is corrupt
Function get_parsed_recipe('https://shortcutapp.io/n/YWI5NWY5MjhhNzUzNTIwYTExNGE5YTdkN2U4ZTBhOGJi',) {} Took 2.2810 seconds
None


TypeError: 'NoneType' object is not subscriptable

# To-Do

In [18]:
@timeit
def backup_recipe(link: str):
    if not get_page_content(link):
        print(f"{link} is corrupt")
    else:
        os.chdir("../recipes")
        dirName = get_category_from_recipe(link)
        recipe = get_parsed_recipe(link)
        try:
            # Create target Directory
            os.mkdir(dirName)
        except FileExistsError:
            print("Directory ", dirName, " already exists")
        save_recipe(recipe)

In [19]:
backup_recipe("https://shortcutapp.io/n/ZjY0YThmNGY2ZGZhNGUyMTU3NTMyNWFjZGUyNzA3OTBh")

Function get_page_content('https://shortcutapp.io/n/ZjY0YThmNGY2ZGZhNGUyMTU3NTMyNWFjZGUyNzA3OTBh',) {} Took 2.3163 seconds
<div class="rezept brote">
<a href="https://shortcutapp.io/n/ZjY0YThmNGY2ZGZhNGUyMTU3NTMyNWFjZGUyNzA3OTBh">
<img alt="Rezeptname" src="img\brote\bazlama.jpg"/>
<h3 class="rezeptname">Bazlama Fladenbrot</h3>
</a>
</div>
Function get_category_from_recipe('https://shortcutapp.io/n/ZjY0YThmNGY2ZGZhNGUyMTU3NTMyNWFjZGUyNzA3OTBh',) {} Took 0.0008 seconds
Function get_page_content('https://shortcutapp.io/n/ZjY0YThmNGY2ZGZhNGUyMTU3NTMyNWFjZGUyNzA3OTBh',) {} Took 2.3166 seconds
Function get_page_content('https://shortcutapp.io/n/ZjY0YThmNGY2ZGZhNGUyMTU3NTMyNWFjZGUyNzA3OTBh',) {} Took 2.2915 seconds
<div class="rezept brote">
<a href="https://shortcutapp.io/n/ZjY0YThmNGY2ZGZhNGUyMTU3NTMyNWFjZGUyNzA3OTBh">
<img alt="Rezeptname" src="img\brote\bazlama.jpg"/>
<h3 class="rezeptname">Bazlama Fladenbrot</h3>
</a>
</div>
Function get_category_from_recipe('https://shortcutapp.io/n/Zj

In [24]:
@timeit
def backup_website(category: List):
    raw_recipes = get_links_to_scrape(category)
    print(raw_recipes)
    for recipe in raw_recipes:
        backup_recipe(recipe)

In [25]:
backup_website(["brote", "snacks"])
print(report)

Function get_links_from_category(['brote', 'snacks'],) {} Took 0.0017 seconds
Function get_links_to_scrape(['brote', 'snacks'],) {} Took 0.0020 seconds
['https://shortcutapp.io/n/YjNhNTQxOWNjNWRkNjY3NDI0NWExYjZiODllYzRiNWVk', 'https://shortcutapp.io/n/MjJmNmNjMWY5MWU2ZWQ5MDEwNWEzYzU5MzI1MmIxOGRi', 'https://shortcutapp.io/n/ODBhYzMyYmVkOTJjOWE2NjNjOTgxNjBkMTViMzE2NjFh', 'https://shortcutapp.io/n/ZDc4MGIyOGJkMzJlY2RkY2Y1MTFjNGJjZjhkYTViZDM1', 'https://shortcutapp.io/n/ODQyODU2NzE4YjVkZDFkN2FiNTc2ODAzODI4YTRjNzBh', 'https://shortcutapp.io/n/NDJlNzE0NGIyMTUwYzcwYjliMzcxZTEyYWMyZTk2YzE4', 'https://shortcutapp.io/n/ZjY0YThmNGY2ZGZhNGUyMTU3NTMyNWFjZGUyNzA3OTBh', 'https://shortcutapp.io/n/NDc0ZTFhZGYzMTBiOWRmY2U2NTkyNjU4ZWMyNzY4ODQ2', 'https://shortcutapp.io/n/MzdhOWE0OWMwYTgwZjUxYjZmOWRjZGMxYjAzYzNhYzMw']
Function get_page_content('https://shortcutapp.io/n/YjNhNTQxOWNjNWRkNjY3NDI0NWExYjZiODllYzRiNWVk',) {} Took 2.3318 seconds
<div class="rezept snacks">
<a href="https://shortcutapp.io/n/YjNhN

ProxyError: HTTPSConnectionPool(host='storage.googleapis.com', port=443): Max retries exceeded with url: /www.selinaschoice.ch/index.html (Caused by ProxyError('Cannot connect to proxy.', OSError('Tunnel connection failed: 401 Proxy server authentication failed: dmz-gate.zrh.local:8080')))


ToDo:
- Automate creation from recipe folder
- make Function if the recipe link fails like the one from Erdbeer Clafoutis - https://shortcutapp.io/n/YWI5NWY5MjhhNzUzNTIwYTExNGE5YTdkN2U4ZTBhOGJi
- make sure dataquality is good with None Vlaue handling.
- Move package out to create Tests

In [9]:
# Test Zone
allowed_categories = ["vorspeise", "hauptgang", "dessert", "fruehstueck", "snacks", "brote", "getraenke"]

selected_categories = ["vorspeise", "hauptgang", "dessert", "fruehstueck", "snacks", "brote", "getraenke"]

if set(selected_categories).issubset(set(allowed_categories)):
    print("it works s")
else:
    print("No work")

it works s
