In [1]:
import re
import xml.etree.ElementTree as ET
from urllib.parse import urlparse

import pandas as pd
import requests

# Sitemap URL
sitemap_url = 'https://www.essex.ac.uk/content.xml'

# Seconds to wait for the server before giving up (requests has no default timeout)
REQUEST_TIMEOUT = 30


def _element_text(elem):
    """Return the stripped text of an XML element, or '' if it is missing or empty."""
    if elem is None:
        return ''
    # elem.text is None for an empty element such as <loc/>
    return (elem.text or '').strip()


def first_path_segment(url):
    """Return the first non-empty path segment of ``url``, or '' when there is none.

    The URL is parsed with ``urlparse`` so the result does not depend on how the
    scheme/host is spelled and never contains a query string or fragment.
    """
    segments = [seg for seg in urlparse(url).path.split('/') if seg]
    return segments[0] if segments else ''


try:
    # Fetch; raise on 4xx/5xx so an HTML error page is never parsed as a sitemap
    response = requests.get(sitemap_url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    content = response.content.decode('utf-8', errors='replace')

    # Parse XML
    root = ET.fromstring(content)
    ns = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}

    # Extract URLs and lastmod
    data = []
    for url_elem in root.findall('ns:url', ns):
        data.append({
            'URL': _element_text(url_elem.find('ns:loc', ns)),
            'Last Modified': _element_text(url_elem.find('ns:lastmod', ns)),
        })

    # Explicit columns keep the frame usable even when the sitemap has no <url> entries
    df = pd.DataFrame(data, columns=['URL', 'Last Modified'])
    print(f"✅ Extracted {len(df)} URLs.")

    # Extract the first path segment after the domain.
    # The column keeps its historical name 'Second Prefix' so files already
    # produced with that header stay compatible.
    df['Second Prefix'] = df['URL'].apply(first_path_segment)

    # Save the main CSV file with all data and new name
    main_filename = "essex_content_sitemap_2nd_Prefix.csv"
    df.to_csv(main_filename, index=False)
    print(f"Saved main data to '{main_filename}'")

    # Group by the filename-safe form of the prefix: two prefixes that sanitise
    # to the same name end up in one file instead of overwriting each other.
    safe_prefixes = df['Second Prefix'].apply(
        lambda prefix: re.sub(r'[^a-zA-Z0-9_-]', '_', prefix)
    )
    for safe_prefix, group_df in df.groupby(safe_prefixes):
        filename = f"essex_urls_{safe_prefix}.csv"
        group_df.to_csv(filename, index=False)
        print(f"✅ Saved {len(group_df)} URLs to {filename}")

except requests.RequestException as e:
    # Connection failure, timeout, or a non-2xx status from raise_for_status()
    print("❌ Download Error:", e)
except ET.ParseError as e:
    print("❌ XML Parse Error:", e)

✅ Extracted 8317 URLs.
Saved main data to 'essex_content_sitemap_2nd_Prefix.csv'
✅ Saved 39 URLs to essex_urls_about.csv
✅ Saved 38 URLs to essex_urls_alumni.csv
✅ Saved 2 URLs to essex_urls_apprenticeships.csv
✅ Saved 5 URLs to essex_urls_arena.csv
✅ Saved 13 URLs to essex_urls_blog.csv
✅ Saved 45 URLs to essex_urls_business.csv
✅ Saved 209 URLs to essex_urls_centres-and-institutes.csv
✅ Saved 6 URLs to essex_urls_china.csv
✅ Saved 6 URLs to essex_urls_choir.csv
✅ Saved 7 URLs to essex_urls_clearing.csv
✅ Saved 562 URLs to essex_urls_departments.csv
✅ Saved 14 URLs to essex_urls_disclaimer.csv
✅ Saved 9 URLs to essex_urls_donate.csv
✅ Saved 15 URLs to essex_urls_event-series.csv
✅ Saved 30 URLs to essex_urls_events.csv
✅ Saved 1 URLs to essex_urls_fees-and-funding.csv
✅ Saved 12 URLs to essex_urls_global.csv
✅ Saved 49 URLs to essex_urls_governance-and-strategy.csv
✅ Saved 14 URLs to essex_urls_graduation.csv
✅ Saved 108 URLs to essex_urls_international.csv
✅ Saved 16 URLs to essex_ur

In [7]:
import pandas as pd
from urllib.parse import urlparse

# Load the sitemap export (URL, Last Modified, Second Prefix) from disk
sitemap_csv = "essex_content_sitemap_2nd_Prefix.csv"
df = pd.read_csv(sitemap_csv)

# Extract path segments from URLs
def get_hierarchy(url):
    """Split the path of a URL into its non-empty segments.

    Args:
        url: URL string. Non-string values (for example the float NaN that
            ``pd.read_csv`` produces for an empty cell) are accepted.

    Returns:
        list[str]: Path segments in order, e.g. ``['research', 'showcase']``.
        An empty list when the URL has no path or ``url`` is not a string.
    """
    if not isinstance(url, str):
        # urlparse raises on NaN/None; treat a missing URL as having no path
        return []
    parsed = urlparse(url)
    # Filtering empty strings drops the leading '/', trailing '/', and any '//'
    return [seg for seg in parsed.path.split('/') if seg]

# Break each URL into its ordered list of path components
path_segments = df['URL'].apply(get_hierarchy)
df['Path_Segments'] = path_segments

# The deepest URL decides how many Level_N columns are required
max_depth = path_segments.apply(len).max()
level_columns = [f'Level_{depth}' for depth in range(1, max_depth + 1)]

# One column per depth; URLs shallower than that depth are filled with pd.NA
for position, column in enumerate(level_columns):
    df[column] = path_segments.apply(
        lambda segments: pd.NA if position >= len(segments) else segments[position]
    )

# Keep only the date, the URL and the per-level columns, in that order
hierarchy_df = df[['Last Modified', 'URL', *level_columns]]

# Write the hierarchy table out without the index column
hierarchy_df.to_csv("essex_url_hierarchy_with_dates.csv", index=False)

print("✅ Hierarchical CSV created with columns:")
print(hierarchy_df.columns.tolist())
print(f"\nSample output:\n{hierarchy_df.head(3)}")

✅ Hierarchical CSV created with columns:
['Last Modified', 'URL', 'Level_1', 'Level_2', 'Level_3', 'Level_4', 'Level_5', 'Level_6', 'Level_7', 'Level_8']

Sample output:
  Last Modified                                           URL   Level_1  \
0    2022-11-07     https://www.essex.ac.uk/research/showcase  research   
1    2023-04-13         https://www.essex.ac.uk/blog/post-map      blog   
2    2023-11-10  https://www.essex.ac.uk/life/loughton-campus      life   

           Level_2 Level_3 Level_4 Level_5 Level_6 Level_7 Level_8  
0         showcase    <NA>    <NA>    <NA>    <NA>    <NA>    <NA>  
1         post-map    <NA>    <NA>    <NA>    <NA>    <NA>    <NA>  
2  loughton-campus    <NA>    <NA>    <NA>    <NA>    <NA>    <NA>  
