---
format: 
  html:
    embed-resources: true
execute:
  echo: true
code-fold: true
title: Data is Delicious
author: James Compagno
jupyter: python3
---

In [1]:
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import re
import time
import plotnine as p9
import textwrap
from calendar import day_name



# 1. Data from unstructured websites

This website contains many weekly meal plans. Choose one that seems delicious to you. Scrape the weekly meal plan into a table with the following columns:

Day of the Week
Name of Recipe
Link to Recipe
Price of Recipe

In [2]:
#What the HTML looks like

# <div class="wp-block-group"><div class="wp-block-group__inner-container is-layout-flow wp-block-group-is-layout-flow">
# <p class="has-text-align-left"><strong>Monday</strong>: <a href="https://tastesbetterfromscratch.com/favorite-baked-salmon-with-dill/">Baked Salmon</a> $14.87</p>

# <p class="has-text-align-left"><strong>Tuesday</strong>: <a href="https://tastesbetterfromscratch.com/classic-homemade-chili/">Classic Chili</a> $21.49</p>

# <p class="has-text-align-left"><strong>Wednesday: </strong><a href="https://tastesbetterfromscratch.com/calzones/">Calzone</a> $5.63</p>

# <p class="has-text-align-left"><strong>Thursday:</strong> <a href="https://tastesbetterfromscratch.com/chickpea-curry/">Chickpea Curry</a> $14.89</p>

# <p class="has-text-align-left"><strong>Friday:</strong> <a href="https://tastesbetterfromscratch.com/mexican-pizzas/">Mexican Pizza</a> $17.51</p>
# </div></div>

In [3]:
URL = "https://tastesbetterfromscratch.com/meal-plan-169/"
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

response = requests.get(URL, headers=HEADERS, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.text, "html.parser")

day_pattern = re.compile(r"^(\w+)\s*:", re.IGNORECASE)
price_pattern = re.compile(r"\$\d+(?:\.\d{2})?")
rows = []

for element in soup.find_all(["p", "li"]):
    text = " ".join(element.get_text(" ", strip=True).split())

    day_match = day_pattern.search(text)
    if not day_match:
        continue
    day = day_match.group(1).title()

    a = element.find("a")
    if not a:
        continue

    recipe_name = a.get_text(strip=True)
    recipe_url = a.get("href") or None

    price_match = price_pattern.search(text)
    price = price_match.group(0) if price_match else None

    rows.append({
        "Day of the Week": day,
        "Name of Recipe": recipe_name,
        "Link to Recipe": recipe_url,
        "Price of Recipe": price
    })

# Sort rows by weekday order and create DataFrame
day_names = list(day_name)  # ['Monday', 'Tuesday', ...]
order = {d: i for i, d in enumerate(day_names)}
rows = sorted(rows, key=lambda r: order.get(r["Day of the Week"], 99))

df_plan = pd.DataFrame(rows, columns=["Day of the Week", "Name of Recipe", "Link to Recipe", "Price of Recipe"])
df_plan


Unnamed: 0,Day of the Week,Name of Recipe,Link to Recipe,Price of Recipe
0,Monday,Baked Salmon,https://tastesbetterfromscratch.com/favorite-b...,$14.87
1,Tuesday,Classic Chili,https://tastesbetterfromscratch.com/classic-ho...,$21.49
2,Wednesday,Calzone,https://tastesbetterfromscratch.com/calzones/,$5.63
3,Thursday,Chickpea Curry,https://tastesbetterfromscratch.com/chickpea-c...,$14.89
4,Friday,Mexican Pizza,https://tastesbetterfromscratch.com/mexican-pi...,$17.51


# 2. Data from an API

Using the Tasty API from the practice activity, search for recipes that match the “Monday” recipe in your meal plan. Compile a table of all these recipes.

In [4]:
# url = "https://tasty.p.rapidapi.com/recipes/list"

# querystring = {"from":"0","size":"20","q":"daikon"}

# headers = {
#     "X-RapidAPI-Key": "8e7b4eb66emsh05b4af84d6f86e3p163d2ejsn7e5cbfcbda11",
#     "X-RapidAPI-Host": "tasty.p.rapidapi.com"
# }

# response = requests.get(url, headers=headers, params=querystring)

# print(response.json())

In [5]:
# API
api_key = "8e7b4eb66emsh05b4af84d6f86e3p163d2ejsn7e5cbfcbda11"

def fetch_recipes(search_term, result_limit=100):
    urltasty = "https://tasty.p.rapidapi.com/recipes/list"
    request_headers = {
        "X-RapidAPI-Key": api_key,
        "X-RapidAPI-Host": "tasty.p.rapidapi.com"}

    params = {"from": "0", "size": str(min(result_limit, 100)), "q": search_term}
    response = requests.get(urltasty, headers=request_headers, params=params, timeout=30)
    return response

def sanitize_search_query(text: str) -> str:
    """Remove special characters and normalize whitespace."""
    text = re.sub(r"[^A-Za-z0-9\s]", " ", text)
    return re.sub(r"\s+", " ", text).strip()

# Get Monday's recipe from meal plan
monday_meals = df_plan[df_plan["Day of the Week"].str.lower() == "monday"]
if monday_meals.empty:
    print("No Monday meals found in df_plan")
    raise SystemExit
target_recipe = monday_meals.iloc[0]["Name of Recipe"]

# Prepare search queries with varying specificity
exact_query = target_recipe
sanitized_query = sanitize_search_query(target_recipe)
significant_words = [word for word in sanitized_query.split() if len(word) > 3]
broad_query = " ".join(significant_words) if significant_words else sanitized_query

query_list = [exact_query, sanitized_query]
if broad_query not in query_list:
    query_list.append(broad_query)

# Fetch and parse recipe data
recipe_data = []
for search_query in query_list:
    response = fetch_recipes(search_query, result_limit=100)
    recipe_results = response.json().get("results", [])
    
    for recipe in recipe_results:
        nutritional_info = recipe.get("nutrition") or {}
        user_ratings = recipe.get("user_ratings") or {}
        recipe_slug = recipe.get("slug")
        
        recipe_data.append({
            "QueryUsed": search_query,
            "Day of the Week": "Monday",
            "Mealplan Recipe": target_recipe,
            "Tasty Recipe Name": recipe.get("name"),
            "Tasty URL": f"https://tasty.co/recipe/{recipe_slug}" if recipe_slug else None,
            "Calories": nutritional_info.get("calories"),
            "Protein_g": nutritional_info.get("protein"),
            "Fat_g": nutritional_info.get("fat"),
            "Carbs_g": nutritional_info.get("carbohydrates"),
            "Sugar_g": nutritional_info.get("sugar"),
            "Fiber_g": nutritional_info.get("fiber"),
            "UserRatingScore": (user_ratings.get("score") if isinstance(user_ratings, dict) else None)
        })
    
    if recipe_data: 
        break

SSLError: HTTPSConnectionPool(host='tasty.p.rapidapi.com', port=443): Max retries exceeded with url: /recipes/list?from=0&size=100&q=Baked+Salmon (Caused by SSLError(CertificateError("hostname 'tasty.p.rapidapi.com' doesn't match 'ws-test.dev.edi-pc.nextgenaws.net'")))

In [None]:
monday_recpies = pd.DataFrame(recipe_data)
print("Recipes for Monday")
print(monday_recpies.head(10))

# 3. Automate it

Write a function called get_mealplan_data that performs 2 and 3 above automatically. That is, your function should:

Take as input a number 100-210, representing which weekly meal plan you are referencing.

Scrape the meal plan from the meal planning site.

Query the Tasty API for recipes matching each of the ones in the chosen weekly meal plan.

Output a single dataset, which contains all the information from the above

Hint: You may have an easier time if you write two smaller functions, get_weekly_plan and match_recipe, and then you use them inside your main function.

Run the following code, which should work if your function is complete:

df = get_mealplan_data(202)

In [6]:
def get_weekly_plan(plan_number):
    HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
    URL = f"https://tastesbetterfromscratch.com/meal-plan-{plan_number}/"

    try:
        response = requests.get(URL, headers=HEADERS, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching meal plan: {e}")
        return pd.DataFrame()

    soup = BeautifulSoup(response.text, "html.parser")

    day_pattern = re.compile(r"^(\w+)\s*:", re.IGNORECASE)
    price_pattern = re.compile(r"\$\d+(?:\.\d{2})?")  
    rows = []

    for element in soup.find_all(["p", "li"]):
        text = " ".join(element.get_text(" ", strip=True).split())

        day_match = day_pattern.search(text)
        if not day_match:
            continue
        day = day_match.group(1).title()

        a = element.find("a")
        if not a:
            continue
        recipe_name = a.get_text(strip=True)
        recipe_url = a.get("href") or None

        price_match = price_pattern.search(text)
        price = price_match.group(0) if price_match else None

        rows.append({
            "Day of the Week": day,
            "Name of Recipe": recipe_name,
            "Link to Recipe": recipe_url,
            "Price of Recipe": price
        })
    return pd.DataFrame(rows)

In [7]:
get_weekly_plan(169)

Unnamed: 0,Day of the Week,Name of Recipe,Link to Recipe,Price of Recipe
0,Monday,Baked Salmon,https://tastesbetterfromscratch.com/favorite-b...,$14.87
1,Tuesday,Classic Chili,https://tastesbetterfromscratch.com/classic-ho...,$21.49
2,Wednesday,Calzone,https://tastesbetterfromscratch.com/calzones/,$5.63
3,Thursday,Chickpea Curry,https://tastesbetterfromscratch.com/chickpea-c...,$14.89
4,Friday,Mexican Pizza,https://tastesbetterfromscratch.com/mexican-pi...,$17.51


In [None]:
#def match_recipe

In [None]:
#def get_mealplan_data

In [None]:
#df = get_mealplan_data(202)

# 4. Add a column with fuzzy matching

Add a column to your df dataset indicating whether the recipe in that row is vegetarian or not.

You may assume, for our purposes, that all recipes containing meat will have the name of a common meat in the recipe title. (Of course, that is not universally true - but we’ll assume it is for now.)

# 6. Analyze
Make a visualization that tells a story about nutrition information (available in the Tasty API results) across the week for Mealplan 202. Your visualization should also indicate which meals are vegetarian.