In [2]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd

# Download the Essex content sitemap, flatten it into a DataFrame with one row
# per <url> entry ('URL', 'Last Modified'), and save that table as a CSV file.

# Sitemap URL
sitemap_url = 'https://www.essex.ac.uk/content.xml'

try:
    # Fetch and decode.
    # - timeout: requests waits indefinitely by default, so a stalled
    #   connection would otherwise hang the cell forever.
    # - raise_for_status(): without it an HTTP error page would be handed to
    #   the XML parser and show up as a misleading parse error.
    response = requests.get(sitemap_url, timeout=30)
    response.raise_for_status()
    content = response.content.decode('utf-8', errors='replace')

    # Parse XML
    root = ET.fromstring(content)
    ns = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}

    # Extract URLs and lastmod.
    # findtext() returns '' both when the child element is missing and when it
    # is present but empty, so no None value can reach the string operations
    # applied to the 'URL' column later on. strip() removes any whitespace or
    # newlines that surround the value inside the tag.
    data = [
        {
            'URL': url_elem.findtext('ns:loc', default='', namespaces=ns).strip(),
            'Last Modified': url_elem.findtext('ns:lastmod', default='', namespaces=ns).strip(),
        }
        for url_elem in root.findall('ns:url', ns)
    ]

    # Naming the columns explicitly keeps them present even if the sitemap
    # contains no <url> entries at all.
    df = pd.DataFrame(data, columns=['URL', 'Last Modified'])
    print(f"✅ Extracted {len(df)} URLs.")
    print(df.head())

    df.to_csv("essex_content_sitemap.csv", index=False)
    print("Saved to 'essex_content_sitemap.csv'")

except requests.RequestException as e:
    # Connection failures, timeouts and HTTP error statuses all land here.
    print("❌ Request Error:", e)
except ET.ParseError as e:
    print("❌ XML Parse Error:", e)


✅ Extracted 7965 URLs.
                                          URL Last Modified
0   https://www.essex.ac.uk/research/showcase    2022-11-07
1       https://www.essex.ac.uk/blog/post-map    2023-04-13
2     https://www.essex.ac.uk/blog/staff/tags    2024-10-29
3  https://www.essex.ac.uk/blog/staff/authors    2024-10-29
4          https://www.essex.ac.uk/blog/staff    2024-11-14
Saved to 'essex_content_sitemap.csv'


In [3]:
# Add a 'Second Prefix' column holding the leading path segment of each URL,
# then list the distinct values.
def _leading_segment(url):
    """Return the text before the first "/" after removing the site root string from `url`."""
    without_root = url.replace('https://www.essex.ac.uk/', '')
    return without_root.split('/')[0]


df['Second Prefix'] = df['URL'].apply(_leading_segment)

# Report how many distinct prefixes exist and show them in sorted order
unique_prefixes = df['Second Prefix'].unique()
print(f"✅ Found {len(unique_prefixes)} unique second prefixes:")
print(sorted(unique_prefixes))


✅ Found 43 unique second prefixes:
['about', 'alumni', 'apprenticeships', 'arena', 'blog', 'business', 'centres-and-institutes', 'china', 'choir', 'clearing', 'departments', 'disclaimer', 'donate', 'event-series', 'events', 'fees-and-funding', 'global', 'governance-and-strategy', 'graduation', 'international', 'jobs', 'life', 'news', 'people', 'postgraduate', 'research', 'research-projects', 'scholarships', 'schools-and-colleges', 'short-courses', 'sport', 'sport-homepage-test', 'staff', 'student', 'study-abroad', 'study-online', 'subjects', 'sustainability', 'test', 'undergraduate', 'visit-us', 'welcome', 'wivenhoe-park']


In [4]:
# Count how many URLs carry each value of the 'Second Prefix' column and
# print the resulting table under a heading line.
prefix_counts = df['Second Prefix'].value_counts()
print("✅ Frequency of each second prefix:", prefix_counts, sep="\n")


✅ Frequency of each second prefix:
Second Prefix
news                       2536
people                     1664
staff                       936
student                     677
departments                 562
research-projects           363
centres-and-institutes      209
international               108
short-courses                91
life                         86
research                     78
postgraduate                 62
scholarships                 60
sport                        55
governance-and-strategy      49
business                     45
schools-and-colleges         44
about                        39
alumni                       38
events                       30
study-abroad                 25
undergraduate                21
test                         20
welcome                      17
jobs                         16
event-series                 15
graduation                   14
disclaimer                   14
blog                         13
sustainability         

In [5]:
!pip install selenium webdriver-manager



In [6]:
import scrapy
from scrapy.crawler import CrawlerProcess
import pandas as pd

# Absolute subject-page URLs in discovery order; filled in by the spider below.
subject_links = []


class EssexSubjectSpider(scrapy.Spider):
    """Spider that collects links to individual subject pages.

    It starts from the subjects index page and appends every matching
    absolute URL to the module-level ``subject_links`` list.
    """

    name = "essex_subject_links"
    allowed_domains = ["essex.ac.uk"]
    start_urls = ["https://www.essex.ac.uk/subjects"]

    def parse(self, response):
        """Record each not-yet-seen ``/subjects/<slug>`` link found in ``response``."""
        hrefs = response.css("a::attr(href)").getall()
        for href in hrefs:
            # Keep only site-relative links directly below /subjects/, i.e.
            # hrefs with exactly two slashes such as "/subjects/accounting".
            is_subject_page = href.startswith("/subjects/") and href.count("/") == 2
            if not is_subject_page:
                continue

            full_url = response.urljoin(href)
            if full_url in subject_links:
                # Already collected from an earlier anchor.
                continue

            subject_links.append(full_url)
            print(f"✅ Found: {full_url}")

# Run the spider with Scrapy's own logging switched off and a browser-like
# user agent string.
# NOTE(review): CrawlerProcess.start() runs the Twisted reactor, which as far
# as I know cannot be started a second time in the same process — re-running
# this cell probably needs a kernel restart; confirm against the Scrapy docs.
crawler_settings = {
    "LOG_ENABLED": False,
    "USER_AGENT": "Mozilla/5.0",
}
process = CrawlerProcess(settings=crawler_settings)
process.crawl(EssexSubjectSpider)
process.start()

# Put the collected links into a one-column table, write it to CSV and show
# the first rows.
# NOTE(review): this rebinds `df`, which earlier cells use for the sitemap
# table; re-running those cells after this one would see the wrong frame.
# Consider a dedicated name once it is clear no later cell relies on `df`.
df = pd.DataFrame({"Subject URL": subject_links})
df.to_csv("essex_subject_links.csv", index=False)
print(f"\n✅ Total subjects found: {len(df)}. Saved to 'essex_subject_links.csv'")
df.head()


✅ Found: https://www.essex.ac.uk/subjects/accounting
✅ Found: https://www.essex.ac.uk/subjects/acting-producing-stage-management
✅ Found: https://www.essex.ac.uk/subjects/actuarial-science
✅ Found: https://www.essex.ac.uk/subjects/american-us-studies
✅ Found: https://www.essex.ac.uk/subjects/art-history
✅ Found: https://www.essex.ac.uk/subjects/biochemistry
✅ Found: https://www.essex.ac.uk/subjects/biological-sciences
✅ Found: https://www.essex.ac.uk/subjects/biomedical-science
✅ Found: https://www.essex.ac.uk/subjects/business-and-management
✅ Found: https://www.essex.ac.uk/subjects/childhood-studies
✅ Found: https://www.essex.ac.uk/subjects/computational-finance
✅ Found: https://www.essex.ac.uk/subjects/computer-science
✅ Found: https://www.essex.ac.uk/subjects/creative-writing
✅ Found: https://www.essex.ac.uk/subjects/criminology
✅ Found: https://www.essex.ac.uk/subjects/data-analytics
✅ Found: https://www.essex.ac.uk/subjects/drama
✅ Found: https://www.essex.ac.uk/subjects/economic

Unnamed: 0,Subject URL
0,https://www.essex.ac.uk/subjects/accounting
1,https://www.essex.ac.uk/subjects/acting-produc...
2,https://www.essex.ac.uk/subjects/actuarial-sci...
3,https://www.essex.ac.uk/subjects/american-us-s...
4,https://www.essex.ac.uk/subjects/art-history


In [14]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import time

# Collect the course links shown on one subject page. The page is driven
# through a real browser because the course list sits behind tabs and a
# "Load More" button that have to be clicked before all cards are present.
url = 'https://www.essex.ac.uk/subjects/business-and-management'

# List to store all course URLs
all_course_urls = []

# Tab IDs for Undergraduate, Masters, and Research
tab_ids = ['ug', 'pg', 'pgr']

# Initialize Chrome driver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# Everything that touches the browser runs inside try/finally so the Chrome
# process is always shut down, even when the initial page load fails.
try:
    driver.get(url)

    # Handle cookie consent (best effort: the banner may not be shown at all)
    try:
        cookie_accept = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler"))
        )
        cookie_accept.click()
        print("Accepted cookies")
    except Exception:
        print("No cookie popup found or could not accept cookies")

    for tab_id in tab_ids:
        # Switch to current tab. A page without this tab is skipped instead
        # of aborting the remaining tabs.
        try:
            tab = WebDriverWait(driver, 20).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, f"a[href='#{tab_id}']"))
            )
        except TimeoutException:
            print(f"No {tab_id.upper()} tab found, skipping")
            continue

        # Scroll and click through JavaScript rather than WebElement.click().
        driver.execute_script("arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});", tab)
        driver.execute_script("arguments[0].click();", tab)
        print(f"Switched to {tab_id.upper()} tab")
        time.sleep(2)  # Allow tab content to load

        # Handle "Load More" button: keep clicking until it can no longer be
        # located within the wait period.
        while True:
            try:
                # Find and click "Load More" using JavaScript
                load_more = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "#load-more .button"))
                )
                driver.execute_script("arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});", load_more)
                driver.execute_script("arguments[0].click();", load_more)
                print("Clicked Load More button")
                time.sleep(2)  # Wait for new content to load
            except (NoSuchElementException, TimeoutException):
                print("No more 'Load More' button or reached end")
                break

        # Collect all course links. A tab without any cards counts as zero
        # courses rather than ending the whole run.
        # NOTE(review): a previous run reported the same count for every tab
        # and an identical overall total, which suggests this selector matches
        # cards across the whole page rather than only the active tab —
        # confirm against the page markup before relying on per-tab counts.
        try:
            course_cards = WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, "a.course-search-card"))
            )
        except TimeoutException:
            course_cards = []

        # Skip cards whose href attribute is missing or empty.
        card_hrefs = (card.get_attribute("href") for card in course_cards)
        tab_urls = [href for href in card_hrefs if href]
        all_course_urls.extend(tab_urls)
        print(f"Found {len(tab_urls)} courses in {tab_id.upper()} tab")

except Exception as e:
    print(f"Error occurred: {str(e)}")
finally:
    driver.quit()

# Remove duplicates and print results. dict.fromkeys() keeps first-seen order,
# so the listing is the same on every run (a set has no stable order).
unique_urls = list(dict.fromkeys(all_course_urls))
print(f"\nTotal unique courses found: {len(unique_urls)}")
for course_url in unique_urls:
    print(course_url)

Accepted cookies
Switched to UG tab
Clicked Load More button
Clicked Load More button
Clicked Load More button
Clicked Load More button
Clicked Load More button
Clicked Load More button
No more 'Load More' button or reached end
Found 54 courses in UG tab
Switched to PG tab
No more 'Load More' button or reached end
Found 54 courses in PG tab
Switched to PGR tab
No more 'Load More' button or reached end
Found 54 courses in PGR tab

Total unique courses found: 54
https://www.essex.ac.uk/courses/UG00043/1/BBA-Business-Administration
https://www.essex.ac.uk/courses/PG00595/1/MSc-Global-Project-Management
https://www.essex.ac.uk/courses/PR00919/1/Professional-Doctorate-Health-Service-Management
https://www.essex.ac.uk/courses/UG00047/1/BA-Business-Management-with-a-Modern-Language
https://www.essex.ac.uk/courses/PG01564/1/MSc-Leadership-in-Health-and-Care
https://www.essex.ac.uk/courses/UG00256/2/BSc-Marketing-Management
https://www.essex.ac.uk/courses/UG01376/1/BA-Philosophy-with-Business-M