In [14]:
import requests
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET

Successfully fetched the sitemap index!


In [31]:
def fetch_sitemap(url):
    """ Fetch the sitemap content from a URL """
    try:
        # Send a GET request to the base page
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"}
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

def parse_sitemap(sitemap_xml):
    """ Parse the XML sitemap to extract all URLs """
    urls = []
    try:
        root = ET.fromstring(sitemap_xml)
        
        # Define the namespace for handling the default namespace
        namespaces = {
            '': 'http://www.sitemaps.org/schemas/sitemap/0.9'  # Default namespace
        }

        # Find all <url> elements in the sitemap
        for url in root.findall('.//url', namespaces):
            loc = url.find('loc', namespaces)
            if loc is not None:
                # Extract the URL from the <loc> tag
                urls.append(loc.text)

    except ET.ParseError as e:
        print(f"Error parsing XML: {e}")
    
    return urls

def parse_sitemap_index(sitemap_xml):
    """ Parse the XML sitemap index to extract URLs of individual sitemaps """
    urls = []
    try:
        # Parse the sitemap index XML
        root = ET.fromstring(sitemap_xml)
        
        # Define the namespace to search for elements with the correct namespace
        namespace = {'sitemap': 'http://www.sitemaps.org/schemas/sitemap/0.9'}

        # Find all <loc> elements under <sitemap> tags
        for sitemap in root.findall('.//sitemap:sitemap/sitemap:loc', namespace):
            urls.append(sitemap.text)
    except ET.ParseError as e:
        print(f"Error parsing XML: {e}")
    
    return urls

def save_urls(urls, filename):
    """ Save the URLs to a file """
    with open(filename, 'w') as f:
        for url in urls:
            f.write(url + '\n')

In [48]:
# Fetch the sitemap index
sitemap_url = "https://preppykitchen.com/sitemap_index.xml"


sitemap_xml = fetch_sitemap(sitemap_url)


if sitemap_xml:
        # Parse the sitemap index to get individual sitemap URLs
        print("Parsing sitemap index...")
        sitemaps = parse_sitemap_index(sitemap_xml)

        all_urls = []
        for sitemap_url in sitemaps:
            print(f"Fetching {sitemap_url}...")
            sitemap_xml = fetch_sitemap(sitemap_url)
            if sitemap_xml:
                print(f"Parsing {sitemap_url}...")
                recipe_urls = parse_sitemap(sitemap_xml)
                all_urls.extend(recipe_urls)
                
                # Save all URLs to a file
        print(f"Saving {len(all_urls)} URLs to 'recipes.txt'...")
        save_urls(all_urls, 'recipes.txt')
        print("Done!")


Parsing sitemap index...
Fetching https://preppykitchen.com/post-sitemap.xml...
Parsing https://preppykitchen.com/post-sitemap.xml...
Fetching https://preppykitchen.com/post-sitemap2.xml...
Parsing https://preppykitchen.com/post-sitemap2.xml...
Fetching https://preppykitchen.com/page-sitemap.xml...
Parsing https://preppykitchen.com/page-sitemap.xml...
Fetching https://preppykitchen.com/category-sitemap.xml...
Parsing https://preppykitchen.com/category-sitemap.xml...
Saving 0 URLs to 'recipes.txt'...
Done!


In [49]:
sitemap_url = "https://preppykitchen.com/sitemap_index.xml"
sitemap_xml = fetch_sitemap(sitemap_url)
if sitemap_xml:
        # Parse the sitemap index to get individual sitemap URLs
        print("Parsing sitemap index...")
        sitemaps = parse_sitemap_index(sitemap_xml)

        all_urls = []
        for sitemap_url in sitemaps:
                print(f"Fetching {sitemap_url}...")
                post_sitemap_xml = fetch_sitemap(sitemap_url)
                if post_sitemap_xml:
                        print(f"Parsing {sitemap_url}...")
                        recipe_urls = parse_sitemap(post_sitemap_xml)
                        print(recipe_urls)
                        all_urls.extend(recipe_urls)
                break
                # Save all URLs to a file
        

Parsing sitemap index...
Fetching https://preppykitchen.com/post-sitemap.xml...
Parsing https://preppykitchen.com/post-sitemap.xml...
[]


In [50]:
recipe_urls

[]