### Web Scraping - "pinchofyum"

In [5]:
!pip install selenium
!pip install webdriver-manager



In [None]:
import requests
from bs4 import BeautifulSoup
import time
import datetime
import pandas as pd
import logging
import traceback
import os

# Root of the site being scraped and the paginated index that lists every recipe.
BASE_URL = 'https://pinchofyum.com'
RECIPE_LIST_URL = BASE_URL + '/recipes/all'

# Browser-like User-Agent sent with every request.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/90.0.4430.85 Safari/537.36'
    ),
}

# Each run writes to its own timestamped log file (overwritten if it already exists).
cur_time = f"{datetime.datetime.now():%Y%m%d%H%M%S}"
log_file_name = "scraper_" + cur_time + ".log"
logging.basicConfig(
    filename=log_file_name,
    filemode='w',
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
)

def get_number_of_pages():
    """
    Fetches the total number of pages from the recipe list URL.

    The page count is taken as the largest number shown in any pagination
    element (``a`` or ``span`` with class ``page-numbers``). This covers both
    long paginations that contain a "dots" placeholder followed by the last
    page, and short ones where every page number is listed and no dots exist.

    :return:
        int: Total number of pages; 1 if the listing could not be fetched
        successfully or no numeric pagination element is found.
    """
    # A timeout keeps a stalled connection from hanging the whole run.
    response = requests.get(RECIPE_LIST_URL, headers=HEADERS, timeout=30)
    if response.status_code != 200:
        print(f"Failed to fetch recipe list (status code: {response.status_code})")
        return 1

    soup = BeautifulSoup(response.content, 'html.parser')

    page_numbers = []
    for tag in soup.find_all(['a', 'span'], class_='page-numbers'):
        # Keep only the digits so labels such as "1,234" still parse, while
        # non-numeric entries (the dots placeholder, "Next") are skipped.
        digits = ''.join(ch for ch in tag.get_text(strip=True) if ch.isdecimal())
        if digits:
            page_numbers.append(int(digits))

    # Default to 1 if no pagination is found
    return max(page_numbers, default=1)

def get_recipe_links(total_pages):
    """
    Fetches all the recipe links from the recipe list URL across all pages.

    A page is reported on stdout and skipped (the crawl continues) when the
    request raises a network error, the response status is not 200, or the
    expected listing container is missing from the HTML.

    :param total_pages: Total number of pages.
    :return:
        List[str]: List of unique recipe URLs, in the order first encountered.
    """
    recipe_links = []
    # Listings can shift between pages while a long crawl is running, so the
    # same recipe may show up twice; track what has already been collected.
    seen_links = set()

    for page in range(1, total_pages + 1):
        # Sleep to be polite and prevent IP blocking
        time.sleep(5)
        page_url = f"{RECIPE_LIST_URL}/page/{page}"
        print(f"Fetching page URL: {page_url}")

        try:
            # A timeout keeps a stalled connection from hanging the whole run.
            page_response = requests.get(page_url, headers=HEADERS, timeout=30)
        except requests.RequestException as e:
            # One bad page must not discard the links gathered so far.
            print(f"Failed to fetch page {page} ({e})")
            continue

        if page_response.status_code != 200:
            print(f"Failed to fetch page {page} (status code: {page_response.status_code})")
            continue

        page_soup = BeautifulSoup(page_response.content, "html.parser")
        articles_container = page_soup.find("div", class_="grid grid-cols-12 gap-4")

        # If the structure changes or doesn't exist, skip
        if not articles_container:
            print(f"No recipes found on page {page}")
            continue

        for article in articles_container.find_all("article"):
            # The first anchor inside each article is taken as the recipe link.
            link_tag = article.find("a")
            href = link_tag.get("href") if link_tag else None
            if href and href not in seen_links:
                seen_links.add(href)
                recipe_links.append(href)

    return recipe_links

def get_recipes(recipe_url):
    """
    Fetches recipe details from an individual recipe page.

    Missing optional fields fall back to defaults instead of failing the whole
    recipe: empty string for image and description, "No Title" for the title,
    "Not specified" for the total time, and empty lists for ingredients and
    instructions.

    :param recipe_url: URL of the recipe to scrape.
    :return:
        dict or None: Dictionary with keys 'image', 'title', 'description',
        'total_time', 'ingredients' and 'instructions' if successful; None if
        the page could not be fetched, has no 'tasty-recipes' container, or an
        unexpected error occurred (the error and traceback are printed).
    """
    try:
        print(f"Getting recipe details from {recipe_url}")
        # A timeout keeps a stalled connection from hanging the whole run.
        recipe_response = requests.get(recipe_url, headers=HEADERS, timeout=30)
        if recipe_response.status_code != 200:
            print(f"Failed to fetch recipe URL (status code: {recipe_response.status_code})")
            return None

        recipe_soup = BeautifulSoup(recipe_response.content, "html.parser")
        recipe_div = recipe_soup.find("div", class_="tasty-recipes")

        # If the specific recipe container doesn't exist, skip
        if not recipe_div:
            print(f"No 'tasty-recipes' div found at {recipe_url}")
            return None

        # Extract fields
        recipe_img_tag = recipe_div.find("img", class_="attachment-thumbnail size-thumbnail")
        # .get() avoids a KeyError (and losing the whole recipe) when the
        # <img> tag exists but carries no src attribute.
        recipe_img = recipe_img_tag.get("src", "") if recipe_img_tag is not None else ""

        title_tag = recipe_div.find("h2", class_="tasty-recipes-title")
        recipe_title = title_tag.get_text(strip=True) if title_tag else "No Title"

        total_time_tag = recipe_div.find("span", class_="tasty-recipes-total-time")
        recipe_total_time = total_time_tag.get_text(strip=True) if total_time_tag else "Not specified"

        # Description: first paragraph of the description body, if both exist.
        # Explicit None checks replace a bare `except:` that would also have
        # swallowed KeyboardInterrupt/SystemExit.
        desc_body = recipe_div.find("div", class_="tasty-recipes-description-body")
        desc_paragraph = desc_body.find("p") if desc_body is not None else None
        recipe_description = desc_paragraph.get_text(strip=True) if desc_paragraph is not None else ""

        # Ingredients
        ingredients = []
        recipe_ingredients_header = recipe_div.find("div", class_="tasty-recipes-ingredients-header")
        if recipe_ingredients_header:
            recipe_ingredients = recipe_ingredients_header.find_next_sibling("div")
            if recipe_ingredients:
                for li_tag in recipe_ingredients.find_all("li"):
                    # The ingredient text is read from the aria-label of the
                    # <input> inside each <li>; items without one are skipped.
                    input_tag = li_tag.find("input")
                    if input_tag and input_tag.get("aria-label"):
                        ingredients.append(input_tag["aria-label"])

        # Instructions
        instructions = []
        instructions_header = recipe_div.find("div", class_="tasty-recipes-instructions-header")
        if instructions_header:
            instructions_div = instructions_header.find_next_sibling("div")
            if instructions_div:
                instructions = [
                    item.get_text(strip=True)
                    for item in instructions_div.find_all("li")
                ]

        return {
            'image': recipe_img,
            'title': recipe_title,
            'description': recipe_description,
            'total_time': recipe_total_time,
            'ingredients': ingredients,
            'instructions': instructions
        }

    except Exception as e:
        # Boundary handler: one broken page must not stop the overall crawl.
        traceback_info = traceback.format_exc()
        print(f"Exception {e} occurred while scraping {recipe_url}")
        print(f"Traceback info: {traceback_info}")
        return None

def scrape_recipes(delay_seconds=120):
    """
    Scrapes recipes from the website and saves to CSV.

    Determines the number of listing pages, collects every recipe link, then
    fetches each recipe in turn and appends it to the CSV as soon as it is
    scraped, so partial progress survives an interrupted run.

    :param delay_seconds: Seconds to wait before each recipe request.
        Defaults to 120, the delay previously hard-coded here.
    """
    total_pages = get_number_of_pages()
    print(f'Total number of pages to scrape: {total_pages}')

    # Get links of all recipes
    recipe_urls = get_recipe_links(total_pages)
    if not recipe_urls:
        logging.warning("No recipe links were collected; nothing to scrape")
        return

    for recipe_url in recipe_urls:
        logging.info(f"Started scraping {recipe_url}")
        # Sleep to avoid too-frequent requests
        time.sleep(delay_seconds)
        recipe_data = get_recipes(recipe_url)
        if recipe_data:
            print(f"Successfully scraped {recipe_url}")
            logging.info(f"Completed scraping {recipe_url}")
            save_recipe_to_csv(recipe_data)
        else:
            logging.error(f"Failed to scrape {recipe_url}")

def save_recipe_to_csv(recipe, filename='recipes.csv'):
    """
    Saves a single recipe to a CSV file. Appends if file exists, otherwise creates it.

    The header row is written when the file is missing or exists but is empty,
    so the first appended recipe is never mistaken for the header on read-back.
    I/O failures are logged rather than raised.

    :param recipe: Dictionary of recipe fields; its keys become the columns.
    :param filename: Path of the CSV file to append to.
    """
    df = pd.DataFrame([recipe])
    try:
        # A zero-byte file still needs the header row.
        needs_header = not os.path.isfile(filename) or os.path.getsize(filename) == 0
        df.to_csv(filename, mode='a', header=needs_header, index=False)
        print(f"Recipe saved to {filename}")
    except PermissionError:
        logging.error(f"Permission denied for file {filename}")
    except OSError as e:
        logging.error(f"Error: {e}")
    except Exception as e:
        logging.error(f"Unexpected error occurred: {e}")

# Entry point: start the full scrape when this code is executed directly.
if __name__ == "__main__":
    scrape_recipes()


Total number of pages to scrape: 107
Fetching page URL: https://pinchofyum.com/recipes/all/page/1
Fetching page URL: https://pinchofyum.com/recipes/all/page/2
Fetching page URL: https://pinchofyum.com/recipes/all/page/3
Fetching page URL: https://pinchofyum.com/recipes/all/page/4
Fetching page URL: https://pinchofyum.com/recipes/all/page/5
Fetching page URL: https://pinchofyum.com/recipes/all/page/6
Fetching page URL: https://pinchofyum.com/recipes/all/page/7
Fetching page URL: https://pinchofyum.com/recipes/all/page/8
Fetching page URL: https://pinchofyum.com/recipes/all/page/9
Fetching page URL: https://pinchofyum.com/recipes/all/page/10
Fetching page URL: https://pinchofyum.com/recipes/all/page/11
Fetching page URL: https://pinchofyum.com/recipes/all/page/12
Fetching page URL: https://pinchofyum.com/recipes/all/page/13
Fetching page URL: https://pinchofyum.com/recipes/all/page/14
Fetching page URL: https://pinchofyum.com/recipes/all/page/15
Fetching page URL: https://pinchofyum.com/