In [None]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import json

# Base URL of the CBS course catalog
url = "https://kursuskatalog.cbs.dk/search.aspx"
driver = webdriver.Chrome()
driver.get(url)

# Explicit wait for elements to be present/interactable (10 second timeout)
wait = WebDriverWait(driver, 10)

# Switch to English language (only needs to be done once).
# This step is best-effort: if the link cannot be found or clicked, the
# script reports it and carries on with whatever language the page is in.
try:
    lang_english_button = wait.until(EC.element_to_be_clickable((By.ID, "LangUk_Link")))
    lang_english_button.click()
    time.sleep(2)  # small delay to allow language switch to take effect
except Exception:
    # `except Exception` rather than a bare `except:` so that Ctrl-C
    # (KeyboardInterrupt) and SystemExit are not silently swallowed here.
    print("English language link not found or not clickable.")
# Locate the study-programme <select> element and wrap it for option access
programme_dropdown_element = wait.until(
    EC.presence_of_element_located((By.ID, "Level_ProgrammeDropDownList"))
)
programme_dropdown = Select(programme_dropdown_element)

# Parallel lists: programme_values[k] is the option value whose visible
# text is programme_labels[k]
programme_values = []
programme_labels = []

for opt in programme_dropdown.options:
    opt_value = opt.get_attribute('value')
    opt_class = opt.get_attribute('class')
    opt_label = opt.text.strip()

    # Keep only options that carry a value and are not styled as group
    # headers (class "BoldeSelectValue")
    if opt_value and opt_class != "BoldeSelectValue":
        programme_values.append(opt_value)
        programme_labels.append(opt_label)

# Accumulators filled by the scraping loop
seen_urls = set()       # course URLs already visited, to avoid scraping duplicates
scraped_courses = []    # one dict per scraped course page

# Candidate row headers for each fact in the course information table.
# Each entry lists the English header first and the Danish one second,
# since some course pages may use Danish headers instead of English ones.
label_map = dict(
    language=["Language", "Sprog"],
    ects=["Course ECTS", "Kursets ECTS"],
    type=["Type", "Type"],
    level=["Level", "Niveau"],
    study_board=["Study board", "Studienævn"],
)

# Helper: look up one fact in the course information table, trying each
# candidate header label in turn
def extract_fact_multilang(soup, possible_labels):
    """Return the value found next to the first matching label, or None.

    Args:
        soup: Parsed table (or any BeautifulSoup node) to search. May be
            None, in which case None is returned immediately.
        possible_labels: Header texts to try, in priority order. A <td>
            matches when its string contains the label as a substring.

    Returns:
        The whitespace-stripped text of the <td> that follows the first
        matching header cell, or None when no label yields such a cell.
    """
    if soup is None:
        return None  # nothing to search; avoids attribute errors on None

    for label_text in possible_labels:

        def mentions_label(cell_string):
            # Cells without a single string are passed as None; skip those
            return cell_string and label_text in cell_string

        header_cell = soup.find('td', string=mentions_label)
        if not header_cell:
            continue

        value_cell = header_cell.find_next_sibling('td')
        if value_cell:
            return value_cell.get_text(strip=True)

    return None

# Loop over all study programmes and scrape every course page they list.
# The whole scrape runs inside try/finally so the browser is always shut
# down, even when a page load or element lookup raises part-way through;
# otherwise an error would leave the Chrome process running.
try:
    # programme_labels and programme_values are parallel lists
    for programme_name, value in zip(programme_labels, programme_values):
        # Re-select dropdown after each iteration because DOM may be refreshed
        programme_dropdown = Select(wait.until(EC.presence_of_element_located((By.ID, "Level_ProgrammeDropDownList"))))
        programme_dropdown.select_by_value(value)

        # Click the search button to list courses under the selected programme
        search_button = wait.until(EC.element_to_be_clickable((By.ID, "SearchButton")))
        search_button.click()
        time.sleep(2)  # allow search results to load

        # Collect course detail page links based on known course ID patterns
        course_links = driver.find_elements(By.CSS_SELECTOR, "a[href*='BA-'], a[href*='KAN-'], a[href*='DIP-'], a[href*='MA-'], a[href*='PHD-'], a[href*='AO-']")
        # Read all hrefs up front, before navigating away from the results page
        course_hrefs = [link.get_attribute('href') for link in course_links]

        # Iterate over course links and scrape detailed information
        for href in course_hrefs:
            if href in seen_urls:
                continue  # skip already visited URLs
            seen_urls.add(href)

            driver.get(href)
            time.sleep(1)  # allow course page to load fully
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Extract course description (empty string when the div is absent)
            course_description = soup.find('div', id='CourseDescriptionDiv')
            description_text = course_description.get_text(separator="\n", strip=True) if course_description else ""

            # Extract course title from header, with a placeholder fallback
            course_title_tag = soup.find("h1", class_="CourseTitle")
            course_title = course_title_tag.get_text(strip=True) if course_title_tag else "Unknown Course Title"

            # Extract structured facts from the 'facts' table; each lookup
            # yields None when the table or the row is missing
            facts_table = soup.find('table', id='dropCourseFactsHere')
            language = extract_fact_multilang(facts_table, label_map["language"])
            ects = extract_fact_multilang(facts_table, label_map["ects"])
            type_ = extract_fact_multilang(facts_table, label_map["type"])
            level = extract_fact_multilang(facts_table, label_map["level"])
            study_board = extract_fact_multilang(facts_table, label_map["study_board"])

            # Save collected data in structured format
            scraped_courses.append({
                "programme": programme_name,
                "course_title": course_title,
                "url": href,
                "language": language,
                "ects": ects,
                "type": type_,
                "level": level,
                "study_board": study_board,
                "description": description_text
            })

            driver.back()  # go back to the list of courses for this programme
            time.sleep(1)
finally:
    # Close the browser, whether the scrape finished or failed
    driver.quit()

# Write every scraped course record to disk as pretty-printed UTF-8 JSON
# (ensure_ascii=False keeps non-ASCII characters readable in the file)
with open("cbs_courses.json", mode="w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(scraped_courses, ensure_ascii=False, indent=2))

print("Done! Courses have been saved to 'cbs_courses.json'.")


Done! Courses have been saved to 'cbs_courses.json'.
