In [2]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd

# Sitemap URL
sitemap_url = 'https://www.essex.ac.uk/content.xml'

# Fetch and decode.
# - The timeout stops the cell from hanging forever on an unresponsive server.
# - raise_for_status() surfaces HTTP errors directly instead of letting an HTML
#   error page be reported further down as an "XML Parse Error".
response = requests.get(sitemap_url, timeout=30)
response.raise_for_status()
content = response.content.decode('utf-8', errors='replace')

# Parse XML
try:
    root = ET.fromstring(content)
    ns = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}

    # Extract URLs and lastmod. A missing or empty element becomes '' and
    # surrounding whitespace from pretty-printed XML is stripped.
    data = []
    for url_elem in root.findall('ns:url', ns):
        loc = url_elem.find('ns:loc', ns)
        lastmod = url_elem.find('ns:lastmod', ns)
        data.append({
            'URL': (loc.text or '').strip() if loc is not None else '',
            'Last Modified': (lastmod.text or '').strip() if lastmod is not None else ''
        })

    # Naming the columns keeps them present even when the sitemap has no <url>
    # entries, so later cells that read df['URL'] do not fail with a KeyError.
    df = pd.DataFrame(data, columns=['URL', 'Last Modified'])
    print(f"✅ Extracted {len(df)} URLs.")
    print(df.head())

    df.to_csv("essex_content_sitemap.csv", index=False)
    print("Saved to 'essex_content_sitemap.csv'")

except ET.ParseError as e:
    print("❌ XML Parse Error:", e)


✅ Extracted 8317 URLs.
                                            URL Last Modified
0     https://www.essex.ac.uk/research/showcase    2022-11-07
1         https://www.essex.ac.uk/blog/post-map    2023-04-13
2  https://www.essex.ac.uk/life/loughton-campus    2023-11-10
3                  https://www.essex.ac.uk/blog    2024-05-16
4          https://www.essex.ac.uk/blog/student    2024-02-02
Saved to 'essex_content_sitemap.csv'


In [4]:
# Derive the path segment that directly follows the domain (stored in the
# 'Second Prefix' column): remove the site root from each URL and keep whatever
# precedes the next '/'.
site_root = 'https://www.essex.ac.uk/'
df['Second Prefix'] = [
    page_url.replace(site_root, '').split('/')[0]
    for page_url in df['URL']
]

# Report the distinct prefixes in alphabetical order
unique_prefixes = df['Second Prefix'].unique()
prefix_total = len(unique_prefixes)
print(f"✅ Found {prefix_total} unique second prefixes:")
print(sorted(unique_prefixes))


✅ Found 43 unique second prefixes:
['about', 'alumni', 'apprenticeships', 'arena', 'blog', 'business', 'centres-and-institutes', 'china', 'choir', 'clearing', 'departments', 'disclaimer', 'donate', 'event-series', 'events', 'fees-and-funding', 'global', 'governance-and-strategy', 'graduation', 'international', 'jobs', 'life', 'news', 'people', 'postgraduate', 'research', 'research-projects', 'scholarships', 'schools-and-colleges', 'short-courses', 'sport', 'sport-homepage-test', 'staff', 'student', 'study-abroad', 'study-online', 'subjects', 'sustainability', 'test', 'undergraduate', 'visit-us', 'welcome', 'wivenhoe-park']


In [6]:
# Tally how many sitemap URLs share each value of the 'Second Prefix' column
# and print the tally under a heading line.
prefix_counts = df['Second Prefix'].value_counts()
print("✅ Frequency of each second prefix:", prefix_counts, sep="\n")


✅ Frequency of each second prefix:
Second Prefix
news                       2537
people                     2015
staff                       936
student                     677
departments                 562
research-projects           363
centres-and-institutes      209
international               108
short-courses                91
life                         86
research                     78
postgraduate                 62
scholarships                 60
sport                        55
governance-and-strategy      49
business                     45
schools-and-colleges         44
about                        39
alumni                       38
events                       30
study-abroad                 25
undergraduate                21
test                         20
welcome                      17
jobs                         16
event-series                 15
graduation                   14
disclaimer                   14
blog                         13
sustainability         

In [None]:
!pip install selenium webdriver-manager

In [8]:
# NOTE(review): every line of this cell is commented out, so running it does
# nothing. It sketches a Scrapy spider that collects links starting with
# "/subjects/" from https://www.essex.ac.uk/subjects and writes them to
# 'essex_subject_links.csv'. Presumably superseded by the Selenium-based cell
# later in this notebook -- TODO confirm, then delete this cell.
# import scrapy
# from scrapy.crawler import CrawlerProcess
# import pandas as pd

# subject_links = []

# class EssexSubjectSpider(scrapy.Spider):
#     name = "essex_subject_links"
#     allowed_domains = ["essex.ac.uk"]
#     start_urls = ["https://www.essex.ac.uk/subjects"]

#     def parse(self, response):
#         for a in response.css("a::attr(href)").getall():
#             if a.startswith("/subjects/") and a.count("/") == 2:
#                 full_url = response.urljoin(a)
#                 if full_url not in subject_links:
#                     subject_links.append(full_url)
#                     print(f"✅ Found: {full_url}")

# # Run the spider
# process = CrawlerProcess(settings={
#     "LOG_ENABLED": False,
#     "USER_AGENT": "Mozilla/5.0"
# })
# process.crawl(EssexSubjectSpider)
# process.start()

# # Save to CSV or view
# df = pd.DataFrame(subject_links, columns=["Subject URL"])
# df.to_csv("essex_subject_links.csv", index=False)
# print(f"\n✅ Total subjects found: {len(df)}. Saved to 'essex_subject_links.csv'")
# df.head()


✅ Found: https://www.essex.ac.uk/subjects/accounting
✅ Found: https://www.essex.ac.uk/subjects/acting-producing-stage-management
✅ Found: https://www.essex.ac.uk/subjects/actuarial-science
✅ Found: https://www.essex.ac.uk/subjects/american-us-studies
✅ Found: https://www.essex.ac.uk/subjects/art-history
✅ Found: https://www.essex.ac.uk/subjects/biochemistry
✅ Found: https://www.essex.ac.uk/subjects/biological-sciences
✅ Found: https://www.essex.ac.uk/subjects/biomedical-science
✅ Found: https://www.essex.ac.uk/subjects/business-and-management
✅ Found: https://www.essex.ac.uk/subjects/childhood-studies
✅ Found: https://www.essex.ac.uk/subjects/computational-finance
✅ Found: https://www.essex.ac.uk/subjects/computer-science
✅ Found: https://www.essex.ac.uk/subjects/creative-writing
✅ Found: https://www.essex.ac.uk/subjects/criminology
✅ Found: https://www.essex.ac.uk/subjects/data-analytics
✅ Found: https://www.essex.ac.uk/subjects/drama
✅ Found: https://www.essex.ac.uk/subjects/economic

Unnamed: 0,Subject URL
0,https://www.essex.ac.uk/subjects/accounting
1,https://www.essex.ac.uk/subjects/acting-produc...
2,https://www.essex.ac.uk/subjects/actuarial-sci...
3,https://www.essex.ac.uk/subjects/american-us-s...
4,https://www.essex.ac.uk/subjects/art-history


In [10]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from urllib.parse import urlparse
import pandas as pd

# Canonical subject-page URLs (one per subject) discovered on the index page
subject_links = set()

# Set up Chrome driver (you'll need chromedriver installed)
driver = webdriver.Chrome()
try:
    driver.implicitly_wait(10)  # Wait for elements to load
    driver.get("https://www.essex.ac.uk/subjects")

    # Inspect every anchor on the page
    for a in driver.find_elements(By.TAG_NAME, 'a'):
        href = a.get_attribute("href")
        if not href or not href.startswith("https://www.essex.ac.uk/subjects/"):
            continue

        # Verify path structure using urlparse. Empty segments are dropped so
        # a trailing slash does not hide an otherwise valid subject link.
        parsed = urlparse(href)
        path_parts = [part for part in parsed.path.split('/') if part]

        # Keep only /subjects/<subject-name>, i.e. exactly two path segments
        if len(path_parts) != 2:
            continue

        # Rebuild the URL without query string or fragment so that variants
        # such as ".../accounting#ug" collapse into a single entry.
        subject_url = f"{parsed.scheme}://{parsed.netloc}/{'/'.join(path_parts)}"
        if subject_url not in subject_links:
            subject_links.add(subject_url)
            print(f"✅ Found: {subject_url}")
finally:
    # Always close the browser, even when the page load or scraping fails
    driver.quit()

# Save to CSV
df = pd.DataFrame(sorted(subject_links), columns=["Subject URL"])
df.to_csv("essex_subject_links_selenium.csv", index=False)
print(f"\n✅ Total subjects found: {len(df)}. Saved to 'essex_subject_links_selenium.csv'")
df.head()

✅ Found: https://www.essex.ac.uk/subjects/accounting
✅ Found: https://www.essex.ac.uk/subjects/acting-producing-stage-management
✅ Found: https://www.essex.ac.uk/subjects/actuarial-science
✅ Found: https://www.essex.ac.uk/subjects/american-us-studies
✅ Found: https://www.essex.ac.uk/subjects/art-history
✅ Found: https://www.essex.ac.uk/subjects/biochemistry
✅ Found: https://www.essex.ac.uk/subjects/biological-sciences
✅ Found: https://www.essex.ac.uk/subjects/biomedical-science
✅ Found: https://www.essex.ac.uk/subjects/business-and-management
✅ Found: https://www.essex.ac.uk/subjects/childhood-studies
✅ Found: https://www.essex.ac.uk/subjects/computational-finance
✅ Found: https://www.essex.ac.uk/subjects/computer-science
✅ Found: https://www.essex.ac.uk/subjects/creative-writing
✅ Found: https://www.essex.ac.uk/subjects/criminology
✅ Found: https://www.essex.ac.uk/subjects/data-analytics
✅ Found: https://www.essex.ac.uk/subjects/drama
✅ Found: https://www.essex.ac.uk/subjects/economic

Unnamed: 0,Subject URL
0,https://www.essex.ac.uk/subjects/accounting
1,https://www.essex.ac.uk/subjects/acting-produc...
2,https://www.essex.ac.uk/subjects/actuarial-sci...
3,https://www.essex.ac.uk/subjects/american-us-s...
4,https://www.essex.ac.uk/subjects/art-history


In [14]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import time

# Initialize Chrome driver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
url = 'https://www.essex.ac.uk/subjects/european-studies'

# List to store all course URLs
all_course_urls = []

# Tab IDs for Undergraduate, Masters, and Research
tab_ids = ['ug', 'pg', 'pgr']

try:
    # Loading the page happens inside the try so the browser is closed by the
    # finally clause even when navigation itself fails.
    driver.get(url)

    # Handle cookie consent (best effort: the banner may simply not appear)
    try:
        cookie_accept = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler"))
        )
        cookie_accept.click()
        print("Accepted cookies")
    except Exception:
        print("No cookie popup found or could not accept cookies")

    for tab_id in tab_ids:
        # Each tab has its own error handling: a tab that is missing or times
        # out must not prevent the remaining tabs from being scraped.
        try:
            # Switch to current tab
            tab = WebDriverWait(driver, 20).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, f"a[href='#{tab_id}']"))
            )
            driver.execute_script("arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});", tab)
            driver.execute_script("arguments[0].click();", tab)
            print(f"Switched to {tab_id.upper()} tab")
            time.sleep(2)  # Allow tab content to load

            # Handle "Load More" button
            while True:
                try:
                    # Require a clickable (displayed and enabled) button; a
                    # mere presence check keeps matching a button that is
                    # hidden but still in the DOM, which would loop forever.
                    load_more = WebDriverWait(driver, 10).until(
                        EC.element_to_be_clickable((By.CSS_SELECTOR, "#load-more .button"))
                    )
                    driver.execute_script("arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});", load_more)
                    driver.execute_script("arguments[0].click();", load_more)
                    print("Clicked Load More button")
                    time.sleep(2)  # Wait for new content to load
                except (NoSuchElementException, TimeoutException):
                    print("No more 'Load More' button or reached end")
                    break

            # Collect all course links, skipping cards without an href
            course_cards = WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, "a.course-search-card"))
            )
            tab_urls = [
                href
                for href in (card.get_attribute("href") for card in course_cards)
                if href
            ]
            all_course_urls.extend(tab_urls)
            print(f"Found {len(tab_urls)} courses in {tab_id.upper()} tab")

        except Exception as e:
            print(f"Error occurred in {tab_id.upper()} tab: {str(e)}")
finally:
    driver.quit()

# Remove duplicates and print results (sorted so repeated runs are comparable)
unique_urls = sorted(set(all_course_urls))
print(f"\nTotal unique courses found: {len(unique_urls)}")
for course_url in unique_urls:
    print(course_url)

Accepted cookies
Switched to UG tab
Clicked Load More button
No more 'Load More' button or reached end
Found 10 courses in UG tab
Error occurred: Message: 
Stacktrace:
0   chromedriver                        0x000000010eaaa8b8 chromedriver + 5986488
1   chromedriver                        0x000000010eaa19ea chromedriver + 5949930
2   chromedriver                        0x000000010e55a600 chromedriver + 415232
3   chromedriver                        0x000000010e5ac304 chromedriver + 750340
4   chromedriver                        0x000000010e5ac521 chromedriver + 750881
5   chromedriver                        0x000000010e5fc204 chromedriver + 1077764
6   chromedriver                        0x000000010e5d233d chromedriver + 906045
7   chromedriver                        0x000000010e5f9566 chromedriver + 1066342
8   chromedriver                        0x000000010e5d20e3 chromedriver + 905443
9   chromedriver                        0x000000010e59e61d chromedriver + 693789
10  chromedriver  

In [None]:
pip install selenium webdriver-manager pandas

In [16]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import pandas as pd
import time

# Load the subject page URLs written by the subject-link scrape into a list
subjects_df = pd.read_csv("essex_subject_links_selenium.csv")
subject_urls = subjects_df.loc[:, "Subject URL"].tolist()

# Start the Chrome session shared by the helper functions below; the driver
# binary is obtained through webdriver-manager.
chromedriver_path = ChromeDriverManager().install()
driver = webdriver.Chrome(service=Service(chromedriver_path))

# Accumulates one dict per scraped course
courses_data = list()

def accept_cookies():
    """Dismiss the cookie banner if it appears (best effort).

    Waits up to 10 seconds for the element with id
    ``onetrust-accept-btn-handler`` to become clickable and clicks it.
    Any failure is swallowed and reported as "No cookie popup found".
    """
    try:
        consent_locator = (By.ID, "onetrust-accept-btn-handler")
        consent_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable(consent_locator)
        )
        consent_button.click()
    except Exception:
        print("No cookie popup found")
    else:
        print("Accepted cookies")

def process_subject(subject_url, tab_ids=('ug', 'pg', 'pgr'), max_load_more_clicks=100):
    """Collect the course URLs listed on one subject page.

    Opens ``subject_url`` in the shared ``driver``, accepts the cookie banner
    if present, then visits each tab, expands its "Load More" pagination and
    gathers the ``href`` of every ``a.course-search-card`` element.

    Args:
        subject_url: URL of the subject page to open.
        tab_ids: Anchor ids of the tabs to visit (matched as ``a[href='#<id>']``).
        max_load_more_clicks: Upper bound on "Load More" clicks per tab, so a
            button that never disappears cannot keep the loop running forever.

    Returns:
        A set of course URLs (strings). A tab that cannot be processed is
        reported and skipped; it does not abort the remaining tabs.
    """
    driver.get(subject_url)
    print(f"\nProcessing subject: {subject_url}")
    time.sleep(2)

    accept_cookies()
    course_urls = set()

    # Tab processing
    for tab_id in tab_ids:
        try:
            tab = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, f"a[href='#{tab_id}']"))
            )
            driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", tab)
            driver.execute_script("arguments[0].click();", tab)
            print(f"  Switching to {tab_id.upper()} tab")
            time.sleep(1)

            # Load all courses: keep clicking until the button is no longer
            # clickable (the wait times out) or the click budget is spent.
            for _ in range(max_load_more_clicks):
                try:
                    load_more = WebDriverWait(driver, 5).until(
                        EC.element_to_be_clickable((By.CSS_SELECTOR, "#load-more .button"))
                    )
                    driver.execute_script("arguments[0].click();", load_more)
                    print("    Clicked Load More")
                    time.sleep(1)
                except (TimeoutException, NoSuchElementException):
                    break

            # Collect course URLs, ignoring cards that carry no href so that
            # None never reaches the URL parser downstream.
            courses = WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, "a.course-search-card"))
            )
            hrefs = (c.get_attribute("href") for c in courses)
            course_urls.update(href for href in hrefs if href)

        except Exception as e:
            print(f"  Couldn't process {tab_id.upper()} tab: {str(e)}")

    return course_urls

def parse_course_info(course_url):
    """Extract course name and degree type from a course URL.

    The last path segment is expected to look like
    ``<degree>-<word>-<word>-...``. The text before the first hyphen is
    returned unchanged as the degree type; the remaining words are joined with
    spaces and title-cased to form the course name. Multi-word degree types
    are therefore split after their first word.

    Args:
        course_url: Course page URL. A query string, fragment or trailing
            slash is ignored.

    Returns:
        ``(course_name, degree_type)``. ``("Unknown", "Unknown")`` is returned,
        after printing a message, when the value is not a string or has no
        usable last segment.
    """
    try:
        # Drop the fragment and query string first so they cannot leak into the name
        path = course_url.split('#', 1)[0].split('?', 1)[0]
        # Get the last part of the URL
        last_segment = path.rstrip('/').rsplit('/', 1)[-1]
    except (AttributeError, TypeError):
        # Not a str (e.g. None from an anchor without an href)
        last_segment = ""

    if not last_segment:
        print(f"Error parsing URL: {course_url}")
        return "Unknown", "Unknown"

    # Split into degree type and course name at the first hyphen
    degree_type, _, name_slug = last_segment.partition('-')
    course_name = name_slug.replace('-', ' ').title()

    return course_name, degree_type

# Main processing loop
try:
    for subject_url in subject_urls:
        course_urls = process_subject(subject_url)
        print(f"  Found {len(course_urls)} courses in this subject")

        for course_url in course_urls:
            course_name, degree_type = parse_course_info(course_url)
            courses_data.append({
                "course_url": course_url,
                "course_name": course_name,
                "degree_type": degree_type
            })
            print(f"    Processed: {degree_type} {course_name}")

finally:
    try:
        driver.quit()
    finally:
        # Save to CSV. This runs even if closing the browser fails, so data
        # scraped so far is never lost. Naming the columns keeps
        # drop_duplicates from raising KeyError when nothing was collected,
        # which would otherwise mask the original error.
        courses_df = pd.DataFrame(
            courses_data, columns=["course_url", "course_name", "degree_type"]
        ).drop_duplicates(subset=["course_url"])
        courses_df.to_csv("essex_courses_simplified.csv", index=False)
        print(f"\n✅ Total courses saved: {len(courses_df)}")
        print(courses_df.head())


Processing subject: https://www.essex.ac.uk/subjects/accounting
Accepted cookies
  Switching to UG tab
  Switching to PG tab
  Switching to PGR tab
  Found 13 courses in this subject
    Processed: BSc Accounting
    Processed: MSc Accounting
    Processed: PhD Accounting
    Processed: BSc Accounting And Finance
    Processed: PhD Accounting And Finance
    Processed: PhD Accounting
    Processed: BSc Accounting And Finance
    Processed: Integrated Master In Accounting Accounting And Finance
    Processed: MRes Accounting
    Processed: PhD Accounting And Finance
    Processed: MSc Financial Economics And Accounting
    Processed: MSc Accounting And Finance
    Processed: MSc Accounting And Financial Management

Processing subject: https://www.essex.ac.uk/subjects/acting-producing-stage-management
No cookie popup found
  Switching to UG tab
    Clicked Load More
  Switching to PG tab
    Clicked Load More
  Switching to PGR tab
  Found 21 courses in this subject
    Processed: BA Wo

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (NoSuchElementException, 
                                      TimeoutException, 
                                      InvalidSessionIdException)
import pandas as pd
import time

# Load the subject page URLs from the CSV produced earlier in this notebook
subjects_df = pd.read_csv("essex_subject_links_selenium.csv")
subject_urls = subjects_df.loc[:, "Subject URL"].tolist()

# Build the Chrome options; the flags are applied in the order listed
chrome_options = webdriver.ChromeOptions()
for chrome_flag in (
    "--headless=new",  # run in headless mode for stability
    "--disable-dev-shm-usage",
    "--no-sandbox",
):
    chrome_options.add_argument(chrome_flag)

def create_driver():
    """Return a new Chrome WebDriver configured with the shared ``chrome_options``.

    The driver service is built from whatever ``ChromeDriverManager().install()``
    returns.
    """
    chrome_service = Service(ChromeDriverManager().install())
    browser = webdriver.Chrome(service=chrome_service, options=chrome_options)
    return browser

# Retries allowed per subject after the first failed attempt, and the list
# that accumulates one dict per scraped course
MAX_RETRIES = 2
courses_data = []

# Start the browser session shared by the helper functions below
driver = create_driver()

def accept_cookies():
    """Dismiss the cookie banner if it appears (best effort).

    Waits up to 15 seconds for the element with id
    ``onetrust-accept-btn-handler`` to become clickable and clicks it.
    Any failure is swallowed and reported as "No cookie popup found".
    """
    try:
        consent_locator = (By.ID, "onetrust-accept-btn-handler")
        consent_button = WebDriverWait(driver, 15).until(
            EC.element_to_be_clickable(consent_locator)
        )
        consent_button.click()
    except Exception:
        print("No cookie popup found")
    else:
        print("Accepted cookies")

def safe_get(url, max_attempts=3, retry_delay=2):
    """Load ``url`` in the shared driver, retrying on failure.

    Args:
        url: Page to open.
        max_attempts: Total number of load attempts before giving up.
        retry_delay: Seconds to pause between attempts.

    Returns:
        True once the page's ``<body>`` element is present, False if every
        attempt failed.

    Raises:
        InvalidSessionIdException: Re-raised immediately. Retrying on a dead
            browser session cannot succeed, and the main loop needs to see
            this exception in order to recreate the driver.
    """
    for attempt in range(1, max_attempts + 1):
        try:
            driver.get(url)
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.TAG_NAME, 'body'))
            )
            return True
        except InvalidSessionIdException:
            raise
        except Exception:
            print(f"Page load failed, retrying ({attempt}/{max_attempts})")
            # No point pausing after the final attempt
            if attempt < max_attempts:
                time.sleep(retry_delay)
    return False

def process_tab(tab_id):
    """Open one course-level tab and return the course URLs it lists.

    Clicks the tab anchor ``a[href='#<tab_id>']``, waits for at least one
    course card, then keeps clicking "Load More" for as long as each click
    adds cards to the page.

    Args:
        tab_id: Anchor id of the tab (e.g. 'ug', 'pg', 'pgr').

    Returns:
        List of non-empty ``href`` values of all ``a.course-search-card``
        elements; an empty list if the tab could not be processed.

    Raises:
        InvalidSessionIdException: Propagated so the caller can recreate the
            browser instead of silently recording zero courses.
    """
    card_selector = ".course-search-card"
    try:
        tab = WebDriverWait(driver, 15).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, f"a[href='#{tab_id}']"))
        )
        driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", tab)
        driver.execute_script("arguments[0].click();", tab)
        print(f"  Switching to {tab_id.upper()} tab")

        # Wait for tab content to load
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, card_selector))
        )

        # Handle pagination
        while True:
            # Count the cards BEFORE clicking. Starting the baseline at 0 would
            # make the first "new content" wait succeed instantly, because
            # cards are already on the page.
            last_count = len(driver.find_elements(By.CSS_SELECTOR, card_selector))
            try:
                load_more = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, "#load-more .button"))
                )
                driver.execute_script("arguments[0].click();", load_more)
                print("    Clicked Load More")

                # Wait for new content to load
                WebDriverWait(driver, 15).until(
                    lambda d, baseline=last_count: len(
                        d.find_elements(By.CSS_SELECTOR, card_selector)
                    ) > baseline
                )

            except (TimeoutException, NoSuchElementException):
                # No button left, or the click added nothing: pagination is done
                break

        hrefs = (
            c.get_attribute("href")
            for c in driver.find_elements(By.CSS_SELECTOR, "a.course-search-card")
        )
        # Skip cards without an href so None never reaches the URL parser
        return [href for href in hrefs if href]

    except InvalidSessionIdException:
        raise
    except Exception as e:
        print(f"  Tab {tab_id.upper()} error: {str(e)}")
        return []

def parse_course_info(course_url):
    """Extract course name and degree type from a course URL.

    The last path segment is expected to look like
    ``<degree>-<word>-<word>-...``. The text before the first hyphen is
    returned unchanged as the degree type; the remaining words are joined with
    spaces and title-cased to form the course name.

    Args:
        course_url: Course page URL. A query string, fragment or trailing
            slash is ignored.

    Returns:
        ``(course_name, degree_type)``, or ``("Unknown", "Unknown")`` when the
        value is not a string or has no usable last segment.
    """
    try:
        # Drop the fragment and query string first so they cannot leak into the name
        path = course_url.split('#', 1)[0].split('?', 1)[0]
        last_segment = path.rstrip('/').rsplit('/', 1)[-1]
    except (AttributeError, TypeError):
        # Not a str (e.g. None). Only these errors are caught, so that
        # KeyboardInterrupt and unrelated bugs are no longer swallowed.
        return "Unknown", "Unknown"

    if not last_segment:
        return "Unknown", "Unknown"

    degree_type, _, name_slug = last_segment.partition('-')
    return name_slug.replace('-', ' ').title(), degree_type

# Main processing loop.
# Each subject gets up to MAX_RETRIES + 1 attempts. The surrounding try/finally
# guarantees the browser is closed even if the loop is interrupted.
try:
    for subject_url in subject_urls:
        retries = 0
        success = False

        while retries <= MAX_RETRIES and not success:
            try:
                if not safe_get(subject_url):
                    raise Exception("Page load failed")

                print(f"\nProcessing subject: {subject_url}")
                accept_cookies()

                course_urls = set()
                for tab_id in ['ug', 'pg', 'pgr']:
                    course_urls.update(process_tab(tab_id))

                print(f"  Found {len(course_urls)} courses")

                # Process course URLs. Records are appended only after every
                # tab has been read, so a retried subject is not half-recorded.
                for url in course_urls:
                    name, degree = parse_course_info(url)
                    courses_data.append({
                        "course_url": url,
                        "course_name": name,
                        "degree_type": degree
                    })

                success = True

            except InvalidSessionIdException:
                print("Session invalid, recreating driver...")
                try:
                    # Closing a dead session can itself fail; that must not
                    # stop the replacement driver from being created.
                    driver.quit()
                except Exception as quit_error:
                    print(f"  Could not close old session cleanly: {quit_error}")
                driver = create_driver()
                retries += 1
                if retries > MAX_RETRIES:
                    print(f"Max retries reached for {subject_url}")

            except Exception as e:
                print(f"Error processing subject: {str(e)}")
                retries += 1
                if retries > MAX_RETRIES:
                    print(f"Max retries reached for {subject_url}")
                else:
                    # Pause only when another attempt will follow
                    time.sleep(5)
finally:
    driver.quit()

# Write the collected courses to CSV, one row per distinct course URL;
# report explicitly when nothing was collected.
if not courses_data:
    print("\n❌ No courses found")
else:
    all_courses = pd.DataFrame(courses_data)
    df = all_courses.drop_duplicates(subset=["course_url"])
    df.to_csv("essex_courses_organized.csv", index=False)
    print(f"\n✅ Successfully saved {len(df)} courses")
    print(df.head())