In [None]:
import xml.etree.ElementTree as ET
from urllib.parse import urlsplit

import pandas as pd
import requests

# Sitemap URL
sitemap_url = 'https://www.essex.ac.uk/content.xml'

# Fetch and decode.
# - The timeout keeps the cell from hanging forever on an unresponsive server.
# - raise_for_status() stops on HTTP errors (404, 500, ...) instead of handing
#   an error page to the XML parser.
# - Undecodable bytes are replaced rather than rejected (best-effort decode).
response = requests.get(sitemap_url, timeout=30)
response.raise_for_status()
content = response.content.decode('utf-8', errors='replace')

# Parse XML. Only the parse itself is guarded: the error is reported and then
# re-raised so that later cells never run against a missing or stale `df`.
try:
    root = ET.fromstring(content)
except ET.ParseError as e:
    print("❌ XML Parse Error:", e)
    raise

# Namespace used by the <urlset>/<url>/<loc>/<lastmod> elements of a sitemap.
ns = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}

# Extract URLs and lastmod. findtext() yields '' both when the child element
# is absent and when it is present but empty, so every cell holds a string;
# strip() drops whitespace/newlines surrounding the element text.
data = []
for url_elem in root.findall('ns:url', ns):
    data.append({
        'URL': url_elem.findtext('ns:loc', default='', namespaces=ns).strip(),
        'Last Modified': url_elem.findtext('ns:lastmod', default='', namespaces=ns).strip(),
    })

# Explicit columns keep 'URL' and 'Last Modified' present even when the
# sitemap contains no <url> entries.
df = pd.DataFrame(data, columns=['URL', 'Last Modified'])
print(f"✅ Extracted {len(df)} URLs.")
print(df.head())

df.to_csv("essex_content_sitemap.csv", index=False)
print("Saved to 'essex_content_sitemap.csv'")


In [None]:
# Extract the first path segment after the domain, e.g. 'subjects' from
# 'https://www.essex.ac.uk/subjects/history'. The column keeps the name
# 'Second Prefix' because the frequency cell looks it up under that name.
def _first_path_segment(url):
    """Return the first path segment of ``url``.

    The URL is parsed rather than string-replaced, so the result does not
    depend on the scheme or host spelling, and query strings / fragments are
    not mistaken for part of the segment. Returns '' when the URL has no path
    segment or when ``url`` is not a string (e.g. a missing value).
    """
    if not isinstance(url, str):
        return ''
    return urlsplit(url).path.lstrip('/').split('/')[0]


df['Second Prefix'] = df['URL'].apply(_first_path_segment)

# Show unique second prefixes
unique_prefixes = df['Second Prefix'].unique()
print(f"✅ Found {len(unique_prefixes)} unique second prefixes:")
print(sorted(unique_prefixes))


In [None]:
# Count how many rows of `df` carry each 'Second Prefix' value, then print a
# header line followed by the resulting counts.
prefix_counts = df['Second Prefix'].value_counts()
print("✅ Frequency of each second prefix:", prefix_counts, sep="\n")


In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess
import pandas as pd

# Module-level accumulator filled by the spider below; the code that runs the
# crawl reads it afterwards to build the output table.
subject_links = []

class EssexSubjectSpider(scrapy.Spider):
    """Collect links to individual subject pages from the Essex subjects index.

    Every accepted link is appended (once) to the module-level
    ``subject_links`` list as an absolute URL; the spider yields no items.
    """

    name = "essex_subject_links"
    allowed_domains = ["essex.ac.uk"]
    start_urls = ["https://www.essex.ac.uk/subjects"]

    def parse(self, response):
        """Record each site-relative ``/subjects/<slug>`` link on the page.

        A link is accepted when its path has exactly two non-empty segments,
        the first being ``subjects``. Query strings, fragments and a trailing
        slash are dropped before the URL is stored, so variants of one page
        collapse into a single entry and the bare ``/subjects/`` index link
        is not recorded as a subject.
        """
        for href in response.css("a::attr(href)").getall():
            # Only site-relative links under /subjects/ are considered.
            if not href.startswith("/subjects/"):
                continue

            # Judge depth on the path alone; slashes inside a query string or
            # fragment must not affect the segment count.
            segments = [part for part in urlsplit(href).path.split("/") if part]
            if len(segments) != 2:
                continue

            full_url = response.urljoin("/" + "/".join(segments))
            if full_url not in subject_links:
                subject_links.append(full_url)
                print(f"✅ Found: {full_url}")

# Crawl configuration: Scrapy logging is switched off and a browser-like
# user agent string is supplied.
crawler_settings = {
    "USER_AGENT": "Mozilla/5.0",
    "LOG_ENABLED": False,
}

# Schedule the spider and start the crawl; the table below is built only
# after start() has returned.
process = CrawlerProcess(settings=crawler_settings)
process.crawl(EssexSubjectSpider)
process.start()

# Turn the collected links into a one-column table, write it to CSV, and
# report how many were found.
# NOTE: this rebinds `df`, which the sitemap cell also assigns; after this
# cell runs, `df` holds the subject links rather than the sitemap entries.
df = pd.DataFrame(subject_links, columns=["Subject URL"])
df.to_csv("essex_subject_links.csv", index=False)
subject_total = len(df)
print(f"\n✅ Total subjects found: {subject_total}. Saved to 'essex_subject_links.csv'")
df.head()
