In [2]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd

# Sitemap URL
sitemap_url = 'https://www.essex.ac.uk/content.xml'

# Fetch the sitemap. The timeout stops the cell from hanging indefinitely on a
# stalled connection, and raise_for_status() fails loudly on a 4xx/5xx reply
# instead of handing an HTML error page to the XML parser.
response = requests.get(sitemap_url, timeout=30)
response.raise_for_status()

# Decode leniently: bytes that are not valid UTF-8 are replaced rather than
# aborting the whole run.
content = response.content.decode('utf-8', errors='replace')

# Parse XML. Only the parse itself is guarded, so the handler below cannot be
# mistaken for covering the extraction or the CSV write.
try:
    root = ET.fromstring(content)
except ET.ParseError as e:
    print("❌ XML Parse Error:", e)
else:
    ns = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}

    # Extract URLs and lastmod. findtext() yields '' both when the child
    # element is missing (via default) and when it is present but empty, so
    # every cell is a string; strip() drops whitespace around the value.
    data = [
        {
            'URL': url_elem.findtext('ns:loc', default='', namespaces=ns).strip(),
            'Last Modified': url_elem.findtext('ns:lastmod', default='', namespaces=ns).strip(),
        }
        for url_elem in root.findall('ns:url', ns)
    ]

    df = pd.DataFrame(data)
    print(f"✅ Extracted {len(df)} URLs.")
    print(df.head())

    df.to_csv("essex_content_sitemap.csv", index=False)
    print("Saved to 'essex_content_sitemap.csv'")


✅ Extracted 8295 URLs.
                                            URL Last Modified
0     https://www.essex.ac.uk/research/showcase    2022-11-07
1         https://www.essex.ac.uk/blog/post-map    2023-04-13
2  https://www.essex.ac.uk/life/loughton-campus    2023-11-10
3                  https://www.essex.ac.uk/blog    2024-05-16
4          https://www.essex.ac.uk/blog/student    2024-02-02
Saved to 'essex_content_sitemap.csv'


In [4]:
# Extract the first path segment after the domain. The column keeps its
# historical name 'Second Prefix' because other cells look it up by that name.
# The scheme and host are removed with a regex anchored at the start of the
# string, so http/https and any host name are handled and text later in the
# URL is never touched; a URL with no path yields ''.
url_paths = df['URL'].str.replace(r'^https?://[^/]+/?', '', regex=True)
df['Second Prefix'] = url_paths.str.split('/').str[0]

# Show unique second prefixes
unique_prefixes = df['Second Prefix'].unique()
print(f"✅ Found {len(unique_prefixes)} unique second prefixes:")
print(sorted(unique_prefixes))


✅ Found 43 unique second prefixes:
['about', 'alumni', 'apprenticeships', 'arena', 'blog', 'business', 'centres-and-institutes', 'china', 'choir', 'clearing', 'departments', 'disclaimer', 'donate', 'event-series', 'events', 'fees-and-funding', 'global', 'governance-and-strategy', 'graduation', 'international', 'jobs', 'life', 'news', 'people', 'postgraduate', 'research', 'research-projects', 'scholarships', 'schools-and-colleges', 'short-courses', 'sport', 'sport-homepage-test', 'staff', 'student', 'study-abroad', 'study-online', 'subjects', 'sustainability', 'test', 'undergraduate', 'visit-us', 'welcome', 'wivenhoe-park']


In [6]:
# Count how many rows carry each value of the 'Second Prefix' column.
prefix_counts = df['Second Prefix'].value_counts()

# Header and table are emitted by one call; sep="\n" puts each on its own line.
print("✅ Frequency of each second prefix:", prefix_counts, sep="\n")


✅ Frequency of each second prefix:
Second Prefix
news                       2533
people                     1993
staff                       936
student                     677
departments                 562
research-projects           363
centres-and-institutes      209
international               108
short-courses                96
life                         86
research                     78
postgraduate                 62
scholarships                 60
sport                        55
governance-and-strategy      48
business                     45
schools-and-colleges         44
about                        39
alumni                       38
events                       30
study-abroad                 25
undergraduate                21
test                         20
welcome                      17
jobs                         16
event-series                 15
disclaimer                   14
graduation                   14
blog                         13
sustainability         

In [8]:
import scrapy
from scrapy.crawler import CrawlerProcess
import pandas as pd

# Module-level accumulator: the spider appends absolute subject URLs here in
# first-seen order.
subject_links = []

class EssexSubjectSpider(scrapy.Spider):
    """Collect links of the form /subjects/<slug> found on the subjects index page.

    Results are appended to the module-level ``subject_links`` list rather
    than yielded as items.
    """

    name = "essex_subject_links"
    allowed_domains = ["essex.ac.uk"]
    start_urls = ["https://www.essex.ac.uk/subjects"]

    @staticmethod
    def _is_subject_href(href):
        """Return True when ``href`` is a site-relative link to one subject page.

        Accepts ``/subjects/<slug>`` only. Rejects deeper paths, and also the
        index page itself (``/subjects/``, ``/subjects/?...``,
        ``/subjects/#...``), which has an empty slug.
        """
        prefix = "/subjects/"
        if not href.startswith(prefix):
            return False
        slug = href[len(prefix):]
        if not slug or slug[0] in "?#":
            # Empty slug: this is a link back to the index, not a subject.
            return False
        return "/" not in slug

    def parse(self, response):
        """Scan every anchor href on the page and record unseen subject URLs."""
        for a in response.css("a::attr(href)").getall():
            if not self._is_subject_href(a):
                continue
            full_url = response.urljoin(a)
            # List membership keeps discovery order while avoiding duplicates.
            if full_url not in subject_links:
                subject_links.append(full_url)
                print(f"✅ Found: {full_url}")

# Launch the crawl with Scrapy logging switched off and a custom user agent.
# NOTE(review): the export below presumably relies on start() returning only
# once the crawl has finished — confirm against the Scrapy documentation.
process = CrawlerProcess(settings=dict(LOG_ENABLED=False, USER_AGENT="Mozilla/5.0"))
process.crawl(EssexSubjectSpider)
process.start()

# Tabulate whatever the spider collected, write it to disk, then preview it.
df = pd.DataFrame(subject_links, columns=["Subject URL"])
df.to_csv("essex_subject_links.csv", index=False)

subject_total = len(df)
print(f"\n✅ Total subjects found: {subject_total}. Saved to 'essex_subject_links.csv'")
df.head()


✅ Found: https://www.essex.ac.uk/subjects/accounting
✅ Found: https://www.essex.ac.uk/subjects/acting-producing-stage-management
✅ Found: https://www.essex.ac.uk/subjects/actuarial-science
✅ Found: https://www.essex.ac.uk/subjects/american-us-studies
✅ Found: https://www.essex.ac.uk/subjects/art-history
✅ Found: https://www.essex.ac.uk/subjects/biochemistry
✅ Found: https://www.essex.ac.uk/subjects/biological-sciences
✅ Found: https://www.essex.ac.uk/subjects/biomedical-science
✅ Found: https://www.essex.ac.uk/subjects/business-and-management
✅ Found: https://www.essex.ac.uk/subjects/childhood-studies
✅ Found: https://www.essex.ac.uk/subjects/computational-finance
✅ Found: https://www.essex.ac.uk/subjects/computer-science
✅ Found: https://www.essex.ac.uk/subjects/creative-writing
✅ Found: https://www.essex.ac.uk/subjects/criminology
✅ Found: https://www.essex.ac.uk/subjects/data-analytics
✅ Found: https://www.essex.ac.uk/subjects/drama
✅ Found: https://www.essex.ac.uk/subjects/economic

Unnamed: 0,Subject URL
0,https://www.essex.ac.uk/subjects/accounting
1,https://www.essex.ac.uk/subjects/acting-produc...
2,https://www.essex.ac.uk/subjects/actuarial-sci...
3,https://www.essex.ac.uk/subjects/american-us-s...
4,https://www.essex.ac.uk/subjects/art-history
