In [None]:
import json
import requests
import xml.etree.ElementTree as ET

def save_urls_to_json(urls, filename):
    """Save the categorized URLs to a JSON file."""
    with open(filename, 'w') as file:
        json.dump(urls, file, indent=4)

def parse_sitemap_index(sitemap_xml):
    """ Parse the XML sitemap index to extract URLs of individual sitemaps """
    urls = []
    try:
        # Parse the sitemap index XML
        root = ET.fromstring(sitemap_xml)
        
        # Define the namespace to search for elements with the correct namespace
        namespace = {'sitemap': 'http://www.sitemaps.org/schemas/sitemap/0.9'}

        # Find all <loc> elements under <sitemap> tags
        for sitemap in root.findall('.//sitemap:sitemap/sitemap:loc', namespace):
            urls.append(sitemap.text)
    except ET.ParseError as e:
        print(f"Error parsing XML: {e}")
    
    return urls

def parse_sitemap(sitemap_xml):
    """ Parse the XML sitemap to extract all URLs """
    urls = []
    try:
        root = ET.fromstring(sitemap_xml)
        print('root')
        print(root)
        
        # Define the namespace with a prefix (e.g., 'ns')
        namespaces = {
            'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'
        }
        
        print(root.findall('.//ns:url', namespaces))


        # Find all <url> elements and their <loc> children using the namespace prefix
        for url in root.findall('.//ns:url', namespaces):
            loc = url.find('ns:loc', namespaces)
            if loc is not None and loc.text:
                urls.append(loc.text.strip())  # Strip to remove leading/trailing whitespace

    except ET.ParseError as e:
        print(f"Error parsing XML: {e}")
    
    return urls

def parse_sitemap(sitemap_xml):
    """ Parse the XML sitemap to extract all URLs (without namespaces) """
    urls = []
    try:
        root = ET.fromstring(sitemap_xml)

        # No namespace — use plain tag names
        for url in root.findall('.//url'):
            loc = url.find('loc')
            if loc is not None and loc.text:
                urls.append(loc.text.strip())

    except ET.ParseError as e:
        print(f"Error parsing XML: {e}")
    
    return urls



def fetch_sitemap(url):
    """ Fetch the sitemap content from a URL """
    try:
        # Send a GET request to the base page
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"}
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

In [52]:
sitemap_xml = fetch_sitemap("https://www.hellofresh.com/sitemap_recipe_pages.xml")
# sitemap_xml = fetch_sitemap("https://www.hellofresh.com/sitemap_recipe_collections.xml")

x = parse_sitemap(sitemap_xml)
print(x)

sitemap_xml = fetch_sitemap("https://www.hellofresh.com/sitemap_recipe_collections.xml")

x = parse_sitemap(sitemap_xml)
print(x)

root
<Element '{https://www.sitemaps.org/schemas/sitemap/0.9}urlset' at 0x000002322B16EFC0>
[]
[]
root
<Element '{http://www.sitemaps.org/schemas/sitemap/0.9}urlset' at 0x0000023229EEBE20>
[<Element '{http://www.sitemaps.org/schemas/sitemap/0.9}url' at 0x000002322B3EE980>, <Element '{http://www.sitemaps.org/schemas/sitemap/0.9}url' at 0x000002322B3EEFC0>, <Element '{http://www.sitemaps.org/schemas/sitemap/0.9}url' at 0x000002322B3EFA10>, <Element '{http://www.sitemaps.org/schemas/sitemap/0.9}url' at 0x000002322B3EC590>, <Element '{http://www.sitemaps.org/schemas/sitemap/0.9}url' at 0x000002322B3EC450>, <Element '{http://www.sitemaps.org/schemas/sitemap/0.9}url' at 0x000002322B3ED030>, <Element '{http://www.sitemaps.org/schemas/sitemap/0.9}url' at 0x000002322B3EEA20>, <Element '{http://www.sitemaps.org/schemas/sitemap/0.9}url' at 0x000002322B3EF4C0>, <Element '{http://www.sitemaps.org/schemas/sitemap/0.9}url' at 0x000002322B469260>, <Element '{http://www.sitemaps.org/schemas/sitemap/0.9

In [38]:
# Fetch the sitemap index
sitemap_url = "https://www.hellofresh.com/sitemap_index.xml"

sitemap_xml = fetch_sitemap(sitemap_url)


if sitemap_xml:
    # Parse the sitemap index to get individual sitemap URLs
    print("Parsing sitemap index...")
    sitemaps = parse_sitemap_index(sitemap_xml)

    categorized_urls = {
        "sitemap_transactional_pages": [],
        "sitemap_recipe_collections": [],
        "sitemap_recipe_pages": [],
        "sitemap_others" : []
    }

    for sitemap_url in sitemaps:
        print(f"Fetching {sitemap_url}...")
        sitemap_xml = fetch_sitemap(sitemap_url)
        if sitemap_xml:
            print(f"Parsing {sitemap_url}...")
            if "sitemap_transactional_pages" in sitemap_url:
                categorized_urls["sitemap_transactional_pages"].extend(parse_sitemap(sitemap_xml))
            elif "sitemap_recipe_collections" in sitemap_url:
                categorized_urls["sitemap_recipe_collections"].extend(parse_sitemap(sitemap_xml))
            elif "sitemap_recipe_pages" in sitemap_url:
                categorized_urls["sitemap_recipe_pages"].extend(parse_sitemap(sitemap_xml))
            elif "others" in sitemap_url:
                categorized_urls["sitemap_others"].extend(parse_sitemap(sitemap_xml))

    # Save all categorized URLs to a JSON file
    print(f"Saving categorized URLs to 'dataScraped/sitemapURLSHelloFresh.json'...")
    save_urls_to_json(categorized_urls, 'dataScraped/sitemapURLSHelloFresh.json')
    print("Done!")

Parsing sitemap index...
Fetching https://www.hellofresh.com/sitemap_transactional_pages.xml...
Parsing https://www.hellofresh.com/sitemap_transactional_pages.xml...
Fetching https://www.hellofresh.com/sitemap_recipe_collections.xml...
Parsing https://www.hellofresh.com/sitemap_recipe_collections.xml...
Fetching https://www.hellofresh.com/sitemap_recipe_pages.xml...
Parsing https://www.hellofresh.com/sitemap_recipe_pages.xml...
Fetching https://www.hellofresh.com/sitemap_others.xml...
Parsing https://www.hellofresh.com/sitemap_others.xml...
Saving categorized URLs to 'dataScraped/sitemapURLSHelloFresh.json'...
Done!


In [12]:
print(sitemap_url)
# print(sitemap_xml)
print(sitemaps)
# print(categorized_urls)
for sitemap_url in sitemaps:
    print(sitemap_url)
x = fetch_sitemap("https://www.hellofresh.com/sitemap_recipe_pages.xml")
print(x)

https://www.hellofresh.com/sitemap_others.xml
['https://www.hellofresh.com/sitemap_transactional_pages.xml', 'https://www.hellofresh.com/sitemap_recipe_collections.xml', 'https://www.hellofresh.com/sitemap_recipe_pages.xml', 'https://www.hellofresh.com/sitemap_others.xml']
https://www.hellofresh.com/sitemap_transactional_pages.xml
https://www.hellofresh.com/sitemap_recipe_collections.xml
https://www.hellofresh.com/sitemap_recipe_pages.xml
https://www.hellofresh.com/sitemap_others.xml
<urlset xmlns="https://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc>https://www.hellofresh.com/recipes/strawberry-swirl-cheesecake-681a5dd2d57dc9d107c4aa5e</loc>
    <lastmod>2025-05-06</lastmod>
  </url>
  <url>
    <loc>https://www.hellofresh.com/recipes/chocolate-and-vanilla-macarons-681a5c9dd57dc9d107c4a916</loc>
    <lastmod>2025-05-06</lastmod>
  </url>
  <url>
    <loc>https://www.hellofresh.com/recipes/cheesy-chicken-and-broccoli-68198cb634c41957efda3fb5</loc>
    <lastmod>2025-05-07</la

[]
