In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import time
from urllib.parse import urljoin  # Import urljoin function for constructing URLs correctly

base_url = "https://www.bbcgoodfood.com/recipes/category/cuisine-collections?page=1"

# Function to check if a link is a cuisine collection link
def is_cuisine_link(link):
    return '/recipes/collection/' in link['href']

# Function to scrape cuisine collection URLs from the main page
def scrape_cuisine_collections(url):
    response = requests.get(url)
    if response.status_code == 200:
        html = response.text
        soup = BeautifulSoup(html, 'html.parser')
        
        cuisine_collections = []
        cuisine_elements = soup.find_all('a', class_='d-block')
        
        for cuisine in cuisine_elements:
            if is_cuisine_link(cuisine):
                cuisine_title = cuisine.text.strip()
                cuisine_url = urljoin(base_url, cuisine['href'])
                
                cuisine_collections.append({
                    'Title': cuisine_title,
                    'URL': cuisine_url
                })
        
        return cuisine_collections
    else:
        print(f"Failed to retrieve page: {response.status_code}")
        return []

# Function to scrape recipe data from a single cuisine collection page
def scrape_recipes_from_cuisine(url):
    response = requests.get(url)
    if response.status_code == 200:
        html = response.text
        soup = BeautifulSoup(html, 'html.parser')
        
        recipes = []
        recipe_elements = soup.find_all('a', class_='d-block')
        
        for recipe in recipe_elements:
            recipe_title = recipe.text.strip()
            recipe_path = recipe['href']
            recipe_url = urljoin(base_url, recipe_path)
            
            # Now scrape ingredients and description for each recipe
            ingredients, description = scrape_recipe_details(recipe_url)
            
            recipes.append({
                'Title': recipe_title,
                'URL': recipe_url,
                'Ingredients': ingredients,
                'Description': description
            })
        
        return recipes
    else:
        print(f"Failed to retrieve page: {response.status_code}")
        return []

# Function to scrape ingredients and description from a single recipe page
def scrape_recipe_details(url):
    response = requests.get(url)
    if response.status_code == 200:
        html = response.text
        soup = BeautifulSoup(html, 'html.parser')
        
        # Scrape ingredients
        ingredients = []
        ingredients_section = soup.find('section', class_='recipe__ingredients')
        if ingredients_section:
            ingredient_lists = ingredients_section.find_all('ul', class_='list')
            for ingredient_list in ingredient_lists:
                items = ingredient_list.find_all('li', class_='list-item')
                for item in items:
                    ingredient = item.text.strip()
                    ingredients.append(ingredient)
        
        # Scrape description
        description = ''
        description_section = soup.find('section', class_='recipe-method__wrapper')
        if description_section:
            paragraphs = description_section.find_all('p')
            description = ' '.join([p.text.strip() for p in paragraphs])
        
        return ingredients, description
    else:
        print(f"Failed to retrieve recipe page: {response.status_code}")
        return [], ''

# Function to write recipes to CSV
def write_to_csv(recipes, filename):
    fields = ['Title', 'URL', 'Ingredients', 'Description']
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fields)
        writer.writeheader()
        writer.writerows(recipes)
    print(f"Saved {len(recipes)} recipes to {filename}")

# Main function to scrape multiple pages and write to CSV
def main():
    all_cuisine_collections = []
    for page_num in range(1, 2):  # scraping pages 1 to 5 for example
        url = f"https://www.bbcgoodfood.com/recipes/category/cuisine-collections?page={page_num}"
        print(f"Scraping cuisine collections from {url}")
        cuisine_collections = scrape_cuisine_collections(url)
        all_cuisine_collections.extend(cuisine_collections)
    
    all_recipes = []
    for cuisine in all_cuisine_collections:
        cuisine_url = cuisine['URL']
        print(f"Scraping recipes from cuisine collection: {cuisine['Title']}")
        recipes = scrape_recipes_from_cuisine(cuisine_url)
        all_recipes.extend(recipes)
        time.sleep(1)  # Adding a delay of 1 second between requests to be polite to the server
    
    if all_recipes:
        write_to_csv(all_recipes, 'food2.csv')
    else:
        print("No recipes scraped. Check your scraping logic.")

if __name__ == "__main__":
    main()


Scraping cuisine collections from https://www.bbcgoodfood.com/recipes/category/cuisine-collections?page=1
Scraping recipes from cuisine collection: 
Failed to retrieve recipe page: 403
Failed to retrieve recipe page: 403
Scraping recipes from cuisine collection: American
Failed to retrieve recipe page: 403
Failed to retrieve recipe page: 403
Scraping recipes from cuisine collection: 
Failed to retrieve recipe page: 403
Failed to retrieve recipe page: 403
Scraping recipes from cuisine collection: British
Failed to retrieve recipe page: 403
Failed to retrieve recipe page: 403
Scraping recipes from cuisine collection: 
Failed to retrieve recipe page: 403
Failed to retrieve recipe page: 403
Scraping recipes from cuisine collection: Caribbean
Failed to retrieve recipe page: 403
Failed to retrieve recipe page: 403
Scraping recipes from cuisine collection: 
Failed to retrieve recipe page: 403
Failed to retrieve recipe page: 403
Scraping recipes from cuisine collection: Jerk chicken
Failed to 