In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

In [6]:
def download_links_from_sitemap(sitemap_url, timeout=10):
    """
    Downloads all links from a sitemap URL and returns them in a pandas DataFrame.

    Args:
        sitemap_url (str): The URL of the sitemap XML file.
        timeout (float or tuple, optional): Seconds to wait for the server,
            passed straight to ``requests.get``. Defaults to 10.

    Returns:
        pandas.DataFrame: A DataFrame with a single 'url' column containing the links.
            When the sitemap cannot be fetched the frame is empty but still
            has the 'url' column.
    """
    print(f"Fetching sitemap from: {sitemap_url}")
    try:
        # 1. Fetch and parse the sitemap to get all page URLs.
        # Without a timeout, requests waits indefinitely on a stalled
        # connection, which would hang the notebook cell.
        sitemap_response = requests.get(sitemap_url, timeout=timeout)
        sitemap_response.raise_for_status()  # Ensure the request was successful
        sitemap_soup = BeautifulSoup(sitemap_response.content, 'xml')

        # The URLs are within <loc> tags in the XML. Strip surrounding
        # whitespace/newlines and drop blank entries so every row is a
        # usable URL.
        urls = [loc.text.strip() for loc in sitemap_soup.find_all('loc')]
        urls = [url for url in urls if url]
        print(f"Found {len(urls)} URLs in the sitemap.")

    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch sitemap: {e}")
        # Return an empty DataFrame on failure. The explicit dtype keeps the
        # column text-typed so `.str` filters still work on the empty result.
        return pd.DataFrame({'url': pd.Series([], dtype=str)})

    # 2. Create a pandas DataFrame directly from the list of URLs
    df = pd.DataFrame({'url': pd.Series(urls, dtype=str)})
    print("\nLink download complete!")
    return df


### Download sitemap pages 5 & 6
- filter the links for ANZSCO and the years 2021 & 2022
- split the URLs by ANZSCO level: 1-digit, 2-digit, 4-digit & 6-digit
- remove duplicates, prioritising year 2022
- save the URLs into a DataFrame for docling

In [23]:
# Sitemap page whose links are harvested in this section
SITEMAP_PAGE_6 = "https://www.abs.gov.au/sitemap.xml?page=6"

# Collect every URL listed on that sitemap page
links_df = download_links_from_sitemap(SITEMAP_PAGE_6)

# Preview the first 5 rows underneath a banner line
print("\n--- DataFrame Head ---", links_df.head(), sep="\n")

Fetching sitemap from: https://www.abs.gov.au/sitemap.xml?page=6
Found 2000 URLs in the sitemap.

Link download complete!

--- DataFrame Head ---
                                                 url
0  https://www.abs.gov.au/statistics/classificati...
1  https://www.abs.gov.au/statistics/classificati...
2  https://www.abs.gov.au/statistics/classificati...
3  https://www.abs.gov.au/statistics/classificati...
4  https://www.abs.gov.au/statistics/classificati...


In [24]:
# Keep only the rows whose URL mentions "anzsco" (case-insensitive; missing values count as no match)
is_anzsco_url = links_df['url'].str.contains('anzsco', case=False, na=False)
anzsco_df = links_df.loc[is_anzsco_url]
len(anzsco_df)


132

In [25]:
# Narrow the ANZSCO links to those whose URL contains "2022"
mentions_2022 = anzsco_df['url'].str.contains('2022', case=False, na=False)
anzsco_22_df = anzsco_df.loc[mentions_2022]
len(anzsco_22_df)

132

In [26]:
# Write the filtered links to CSV, leaving out the DataFrame index
anzsco_22_csv_path = '../data/anzsco_22_links.csv'
anzsco_22_df.to_csv(anzsco_22_csv_path, index=False)

In [27]:
# Print one URL in full, since the DataFrame preview truncates long values
first_anzsco_22_url = anzsco_22_df['url'].iloc[0]
print(first_anzsco_22_url)

https://www.abs.gov.au/statistics/classifications/anzsco-australian-and-new-zealand-standard-classification-occupations/2022/browse-classification/5/59/599/5994


In [2]:

def scrape_abs_sitemap(sitemap_url, timeout=10, delay=0.5):
    """
    Scrapes a specific sitemap URL from the Australian Bureau of Statistics,
    extracts the content from each page listed, and returns a pandas DataFrame.

    Args:
        sitemap_url (str): The URL of the sitemap XML file.
        timeout (float or tuple, optional): Seconds to wait for the server on
            every request (the sitemap and each page); passed straight to
            ``requests.get``. Defaults to 10.
        delay (float, optional): Seconds to pause between consecutive page
            requests. Defaults to 0.5; pass 0 to disable the pause.

    Returns:
        pandas.DataFrame: A DataFrame with 'url' and 'content' columns, one
            row per URL in the sitemap. 'content' holds the page text, or a
            placeholder message when the page could not be retrieved or has
            no main-content article. If the sitemap itself cannot be fetched,
            the frame is empty but keeps both columns.
    """
    columns = ['url', 'content']

    print(f"Fetching sitemap from: {sitemap_url}")
    try:
        # 1. Fetch the sitemap. The timeout matches the per-page requests so
        # a stalled connection cannot hang the notebook cell.
        sitemap_response = requests.get(sitemap_url, timeout=timeout)
        sitemap_response.raise_for_status()  # Ensure the request was successful
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch sitemap: {e}")
        # Empty result, but with the documented columns so downstream code
        # can still select df['url'] / df['content'].
        return pd.DataFrame(columns=columns)

    sitemap_soup = BeautifulSoup(sitemap_response.content, 'xml')

    # The URLs are within <loc> tags in the XML. Strip surrounding whitespace
    # and drop blank entries so each one is directly requestable.
    urls = [loc.text.strip() for loc in sitemap_soup.find_all('loc')]
    urls = [url for url in urls if url]
    print(f"Found {len(urls)} URLs in the sitemap.")

    # 2. Scrape the content from each URL
    data = []
    last_index = len(urls) - 1
    for i, url in enumerate(urls):
        print(f"Scraping ({i+1}/{len(urls)}): {url}")
        try:
            page_response = requests.get(url, timeout=timeout)
            page_response.raise_for_status()

            page_soup = BeautifulSoup(page_response.content, 'html.parser')

            # The main document content on ABS pages is typically within
            # an <article> tag with the class 'main-content'.
            content_area = page_soup.find('article', class_='main-content')

            if content_area:
                # Extract text, using a space as a separator and stripping whitespace
                content_text = content_area.get_text(separator=' ', strip=True)
            else:
                content_text = "Main content area not found."

            data.append({'url': url, 'content': content_text})

        except requests.exceptions.RequestException as e:
            # A single failing page must not abort the whole crawl: record a
            # placeholder row and carry on.
            print(f"  -> Failed to retrieve {url}: {e}")
            data.append({'url': url, 'content': "Error: Could not retrieve page."})

        # Be polite and avoid overwhelming the server; no need to wait once
        # the final page has been processed.
        if delay and i < last_index:
            time.sleep(delay)

    # 3. Create a pandas DataFrame (columns fixed even when no rows were collected)
    df = pd.DataFrame(data, columns=columns)
    print("\nScraping complete!")
    return df

In [None]:

# Sitemap page whose listed pages are scraped in this section
SITEMAP_PAGE_5 = "https://www.abs.gov.au/sitemap.xml?page=5"

# Fetch each page listed on that sitemap and collect its text
abs_df = scrape_abs_sitemap(SITEMAP_PAGE_5)

# Preview the first 5 rows underneath a banner line
print("\n--- DataFrame Head ---", abs_df.head(), sep="\n")