In [1]:
# standard libs
import re
import time
import json
import pathlib
from urllib.parse import unquote

# third party
import requests
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm

# selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# folders
# BASE_DIR is the resolved current working directory; DATA_DIR is created
# directly beneath it if it does not exist yet.
BASE_DIR = pathlib.Path().resolve()
DATA_DIR = BASE_DIR / "data"
DATA_DIR.mkdir(exist_ok=True)

# start pages
LISTING_URL = "https://studiegids.vu.nl/en/bachelor/2025-2026#/"
# Request headers passed to requests.get() by static_soup.
HEADERS = {"User-Agent": "Mozilla/5.0"}

# selenium driver
# NOTE: creating the driver is a side effect of running this cell; every
# helper below uses this module-level `driver`.
chrome_options = Options()
# chrome_options.add_argument("--headless=new")  # uncomment for headless runs
chrome_options.add_argument("--start-maximized")
driver = webdriver.Chrome(options=chrome_options)
# Explicit wait bound to the driver with a 25-second timeout.
wait = WebDriverWait(driver, 25)


In [2]:
def q(css, timeout=25):
    """Wait for an element matching ``css`` to be present and return it.

    Args:
        css: CSS selector to look up.
        timeout: Maximum number of seconds to wait (default 25).

    Returns:
        Whatever the ``presence_of_element_located`` condition yields once
        it is satisfied.
    """
    locator = (By.CSS_SELECTOR, css)
    waiter = WebDriverWait(driver, timeout)
    return waiter.until(EC.presence_of_element_located(locator))

def q_all(css, timeout=25):
    """Wait until elements matching ``css`` are present, then return all matches.

    Args:
        css: CSS selector to look up.
        timeout: Maximum number of seconds to wait (default 25).

    Returns:
        The list produced by a fresh ``driver.find_elements`` call made
        after the wait has completed.
    """
    locator = (By.CSS_SELECTOR, css)
    WebDriverWait(driver, timeout).until(
        EC.presence_of_all_elements_located(locator)
    )
    # The wait result is discarded; the page is queried again for the return value.
    return driver.find_elements(*locator)

def click_el(el):
    """Scroll ``el`` to the centre of the viewport, pause briefly, then click it.

    Args:
        el: The element to click; it is passed to the scroll script and
            then has ``click()`` called on it.
    """
    scroll_js = "arguments[0].scrollIntoView({block:'center'});"
    driver.execute_script(scroll_js, el)
    time.sleep(0.2)  # short pause between scrolling and clicking
    el.click()

def try_click(css, timeout=3):
    """Best-effort click on the element matching ``css``.

    Args:
        css: CSS selector of the element to click.
        timeout: Seconds to wait for the element to become clickable
            (default 3).

    Returns:
        True when the element was found and clicked; False when any
        exception occurred while waiting or clicking.
    """
    try:
        clickable = EC.element_to_be_clickable((By.CSS_SELECTOR, css))
        target = WebDriverWait(driver, timeout).until(clickable)
        click_el(target)
    except Exception:
        # Deliberately broad: callers use the boolean to pick a fallback.
        return False
    return True

def dismiss_cookies():
    """Click a cookie-consent button if one can be found.

    The main document is tried first. If no button is clicked there, each
    ``iframe`` on the page is entered in turn and tried as well. The driver
    is switched back to the default content after every frame attempt.
    Returns None in all cases.
    """
    # Candidate buttons, tried in this order; each gets its own 2-second wait.
    consent_xpaths = (
        "//button[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz'),'accept')]",
        "//button[contains(., 'Akkoord')]",
        "//button[contains(., 'Alles accepteren')]",
        "//button[contains(., 'Accept all')]",
        "//button[contains(., 'Accept')]",
    )

    def press_first_consent_button():
        """Click the first clickable candidate in the current browsing context."""
        for xpath in consent_xpaths:
            try:
                button = WebDriverWait(driver, 2).until(
                    EC.element_to_be_clickable((By.XPATH, xpath))
                )
                click_el(button)
            except Exception:
                continue
            return True
        return False

    if press_first_consent_button():
        return

    for frame in driver.find_elements(By.CSS_SELECTOR, "iframe"):
        try:
            driver.switch_to.frame(frame)
            if press_first_consent_button():
                driver.switch_to.default_content()
                return
        except Exception:
            driver.switch_to.default_content()
        finally:
            # Always leave the frame, whether or not a button was clicked.
            driver.switch_to.default_content()


# helper to detect Minor sections
def is_minor_text(txt):
    """Tell whether ``txt`` contains the standalone word "minor".

    The match is case-insensitive and bounded by word boundaries, so
    "Minority" or "minors" do not count. Falsy input (``None``, ``""``)
    yields False.
    """
    if not txt:
        return False
    return re.search(r"\bminor\b", txt, flags=re.I) is not None

def safe_text(el, sel=".accordion-title"):
    """Return a stripped label for ``el`` without ever raising.

    Args:
        el: Element to read from.
        sel: CSS selector of the preferred child whose text is used
            (default ``".accordion-title"``).

    Returns:
        The stripped text of the child matching ``sel`` when that lookup
        succeeds and the text is non-empty; otherwise the stripped text of
        ``el`` itself; ``""`` if that cannot be read either.
    """
    try:
        label = el.find_element(By.CSS_SELECTOR, sel).text.strip()
    except Exception:
        label = ""
    if label:
        return label

    # Fall back to the element's own text.
    try:
        return el.text.strip()
    except Exception:
        return ""



In [3]:
def open_listing_and_filter():
    """Open the programme listing and apply the language and faculty filters.

    Steps, in order: load ``LISTING_URL``, dismiss the cookie banner, wait
    until filter dropdowns or search results exist, open the Language
    dropdown and pick English, open the Faculty dropdown and tick three
    faculties, then pause for one second.
    """

    def _clickable(xpath, timeout):
        """Wait up to ``timeout`` seconds for ``xpath`` to be clickable and return it."""
        return WebDriverWait(driver, timeout).until(
            EC.element_to_be_clickable((By.XPATH, xpath))
        )

    def _listing_ready(d):
        """Truthy once either the dropdown titles or the result cards are found."""
        return (
            d.find_elements(By.CSS_SELECTOR, "div.sg-dropdown-title")
            or d.find_elements(By.CSS_SELECTOR, ".sg-search-result")
        )

    driver.get(LISTING_URL)
    dismiss_cookies()
    WebDriverWait(driver, 30).until(_listing_ready)

    # --- language filter ---
    click_el(_clickable(
        "//div[contains(@class,'sg-dropdown-title')][.//span[contains(., 'Language')]]", 20
    ))
    picked_by_id = try_click("#LanguageEN0 + label", timeout=2)
    if not picked_by_id:
        # Fall back to locating the option by its (lower-cased) label text.
        click_el(_clickable(
            "//label[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz'), 'english')]",
            10,
        ))

    # --- faculty filter ---
    click_el(_clickable(
        "//div[contains(@class,'sg-dropdown-title')][.//span[contains(., 'Faculty')]]", 20
    ))
    faculties = (
        "School of Business and Economics",
        "Faculty of Science",
        "Faculty of Humanities",
    )
    for faculty in faculties:
        click_el(_clickable(f"//label[contains(., '{faculty}')]", 10))

    time.sleep(1.0)

def collect_programmes_two_pages():
    """Collect programme titles and URLs from result pages one and two.

    Cards on the current page are read first. The function then scrolls to
    the bottom, looks for the pagination bar, and tries to click page "2"
    (falling back to a "Next" control). If a click succeeded the cards are
    read again. Any failure in the pagination step is swallowed, so the
    result may contain only the first page.

    Returns:
        A list of ``{"title": ..., "url": ...}`` dicts, with each URL
        appearing at most once.
    """
    collected = []
    known_urls = set()

    def harvest_cards():
        """Return not-yet-seen programmes from the cards currently on the page."""
        fresh = []
        for card in driver.find_elements(By.CSS_SELECTOR, ".sg-search-result"):
            try:
                heading = card.find_element(By.CSS_SELECTOR, ".sg-mb-1")
                title = heading.text.strip()
                try:
                    link = heading.find_element(By.CSS_SELECTOR, "a[href]")
                except Exception:
                    # No link inside the heading: take the first link in the card.
                    link = card.find_element(By.CSS_SELECTOR, "a[href]")
                href = link.get_attribute("href")
                if href and href not in known_urls:
                    fresh.append({"title": title, "url": href})
                    known_urls.add(href)
            except Exception:
                continue
        return fresh

    def press(element):
        """Scroll to ``element``, click it, and wait 1.2 s after the click."""
        click_el(element)
        time.sleep(1.2)

    # page one
    collected.extend(harvest_cards())

    # scroll down so the paginator is rendered / in view
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(0.8)

    try:
        nav = driver.find_element(
            By.CSS_SELECTOR,
            "body > div > div > div.grid-container > div > div > div > div.cell.small-12.large-8.xlarge-9 > div.sg-mt-2.sg-mt-m-3.sg-mt-l-8 > nav.sg-pagination.sg-mt-7.sg-mt-m-6.show-for-medium"
        )

        # Candidates in priority order: exact text "2" first, aria-label match last.
        page_two_candidates = (
            nav.find_elements(By.XPATH, ".//a[normalize-space()='2']")
            + nav.find_elements(By.XPATH, ".//button[normalize-space()='2']")
            + nav.find_elements(By.CSS_SELECTOR, "a[aria-label*='2'], button[aria-label*='2']")
        )

        moved = False
        for candidate in page_two_candidates:
            try:
                press(candidate)
            except Exception:
                continue
            moved = True
            break

        if not moved:
            # fallback: a "next page" control
            next_selectors = (
                "a[aria-label*='Next']",
                "button[aria-label*='Next']",
                "li.sg-pagination__next a",
                "button.sg-pagination__next",
            )
            for selector in next_selectors:
                try:
                    press(nav.find_element(By.CSS_SELECTOR, selector))
                except Exception:
                    continue
                moved = True
                break

        if moved:
            collected.extend(harvest_cards())
    except Exception:
        pass

    return collected

# Apply the language/faculty filters, then read the result cards from the
# first two pages of the listing.
open_listing_and_filter()
programmes = collect_programmes_two_pages()
print("Programmes found:", len(programmes))
# Last expression of the cell: show the collected titles as the cell output.
[itm["title"] for itm in programmes]


Programmes found: 17


['Ancient Studies',
 'Archaeology',
 'Artificial Intelligence',
 'Biomedical Sciences',
 'Business Analytics',
 'Communication and Information Studies',
 'Computer Science',
 'Econometrics and Data Science',
 'Econometrics and Operations Research',
 'Economics and Business Economics',
 'History',
 'International Business Administration',
 'Literature and Society',
 'Mathematics',
 'Media, Art, Design and Architecture',
 'Philosophy',
 'Philosophy, Politics and Economics']

In [4]:
def open_tab_three(url):
    """Navigate to ``url`` with its tab fragment set to ``#/tab=3``.

    Anything from the first ``"#/tab="`` onwards is removed from ``url``
    before the new fragment is appended. Afterwards the cookie banner is
    dismissed and the function sleeps for 0.8 seconds.
    """
    programme_root = url.partition("#/tab=")[0]
    driver.get(programme_root + "#/tab=3")
    dismiss_cookies()
    time.sleep(0.8)

def list_tracks():
    """Return the top-level accordion entries of ``#study-program``, minus minors.

    An entry is dropped when its label (as read by ``safe_text``) contains
    the word "minor".
    """
    entries = driver.find_elements(By.CSS_SELECTOR, "#study-program .accordion > div")
    return [entry for entry in entries if not is_minor_text(safe_text(entry))]

def expand_if_collapsed(container):
    """Open the accordion ``container`` unless its content is already displayed.

    If ``container`` has an ``.accordion-content`` child that is displayed,
    nothing happens. Otherwise the first toggle that can be found and
    clicked (a ``button``, then ``.accordion-title``) is clicked, followed
    by a 0.8-second pause. Failures to find or click a toggle are ignored.
    """
    try:
        panel = container.find_element(By.CSS_SELECTOR, ".accordion-content")
    except Exception:
        panel = None

    already_open = panel and panel.is_displayed()
    if already_open:
        return

    for toggle_css in ("button", ".accordion-title"):
        try:
            toggle = container.find_element(By.CSS_SELECTOR, toggle_css)
            click_el(toggle)
            time.sleep(0.8)  # pause after the click before reading the content
        except Exception:
            continue
        break


def list_year_items(track_container):
    """Return the nested accordion items of a track, skipping minor sections.

    Args:
        track_container: Element whose descendants are searched.

    Returns:
        Elements matching ``.accordion .accordion-item`` or
        ``.accordion > div`` whose label does not contain "minor"; an empty
        list when the lookup itself raises.
    """
    nested_css = ".accordion .accordion-item, .accordion > div"
    try:
        candidates = track_container.find_elements(By.CSS_SELECTOR, nested_css)
    except Exception:
        return []
    return [cand for cand in candidates if not is_minor_text(safe_text(cand))]

def parse_visible_tables(scope_container):
    """Extract course rows from the displayed tables under ``scope_container``.

    Only ``.accordion-content`` panels that are displayed are read; if none
    is displayed but some exist, the first one is used. Within a panel,
    table bodies that are not displayed are skipped.

    Returns:
        A list of dicts with keys ``course_name`` (str), ``period`` and
        ``ects`` (first integer found in columns 2 and 3, else None) and
        ``code`` (text of column 4, else ``""``). Rows without ``td`` cells
        or with an empty name are omitted.
    """

    def cell_text(td):
        """Text of the cell's first link, or of the cell itself if it has none."""
        try:
            return td.find_element(By.CSS_SELECTOR, "a").text.strip()
        except Exception:
            return td.text.strip()

    def first_int(text):
        """First run of digits in ``text`` as an int, or None."""
        hit = re.search(r"(\d+)", text)
        return int(hit.group(1)) if hit else None

    panels = scope_container.find_elements(By.CSS_SELECTOR, ".accordion-content")
    shown = [panel for panel in panels if panel.is_displayed()]
    if not shown and panels:
        shown = panels[:1]

    records = []
    for panel in shown:
        for body in panel.find_elements(By.CSS_SELECTOR, "table tbody"):
            if not body.is_displayed():
                continue
            for tr in body.find_elements(By.CSS_SELECTOR, "tr"):
                cells = tr.find_elements(By.CSS_SELECTOR, "td")
                if not cells:
                    continue

                # Columns are read positionally: name, period, ects, code.
                name = cell_text(cells[0])
                period = first_int(cell_text(cells[1])) if len(cells) > 1 else None
                ects = first_int(cell_text(cells[2])) if len(cells) > 2 else None
                code = cell_text(cells[3]) if len(cells) > 3 else ""

                if name:
                    records.append(
                        {"course_name": name, "period": period, "ects": ects, "code": code}
                    )
    return records

def parse_studiegids_all_tracks_years(url):
    """Scrape the course rows of every non-minor track and year of a programme.

    Opens tab three of ``url``, expands each track, and then either reads
    the tables directly under the track (when it has no nested year items)
    or expands and reads each non-minor year item.

    Returns:
        The row dicts produced by ``parse_visible_tables``, each extended
        with ``track`` and ``year_label`` (``""`` for rows read directly
        under a track). Rows repeating an already-seen key are dropped.
    """
    open_tab_three(url)

    collected = []
    seen_keys = set()  # protects against parsing the same row twice

    def absorb(rows, key_prefix, track_label, year_label):
        """Tag and keep rows whose ``key_prefix + (name, code)`` is new."""
        for row in rows:
            key = key_prefix + (row.get("course_name"), row.get("code"))
            if key in seen_keys:
                continue
            seen_keys.add(key)
            row["track"] = track_label
            row["year_label"] = year_label
            collected.append(row)

    for track in list_tracks():
        track_label = safe_text(track)
        if is_minor_text(track_label):
            continue

        expand_if_collapsed(track)

        years = list_year_items(track)
        if not years:
            # No nested items: the tables sit directly under the track.
            absorb(parse_visible_tables(track), (track_label,), track_label, "")
            continue

        for year in years:
            year_label = safe_text(year)
            if is_minor_text(year_label):
                continue
            expand_if_collapsed(year)
            absorb(
                parse_visible_tables(year),
                (track_label, year_label),
                track_label,
                year_label,
            )
    return collected



In [5]:
# Directory where static_soup stores downloaded HTML (one file per URL);
# created under DATA_DIR if it does not exist.
CACHE_DIR = DATA_DIR / "vu_cache"
CACHE_DIR.mkdir(exist_ok=True)

def cache_path(url):
    """Map ``url`` to its cache file inside ``CACHE_DIR``.

    Leading and trailing slashes are stripped, every run of characters
    other than ASCII letters and digits becomes a single underscore, and
    the result is cut to 180 characters before ``.html`` is appended.
    """
    trimmed = url.strip("/")
    slug = re.sub(r"[^a-zA-Z0-9]+", "_", trimmed)
    return CACHE_DIR / (slug[:180] + ".html")

def static_soup(url):
    """Return a BeautifulSoup tree for ``url``, using the on-disk cache.

    On a cache miss the page is downloaded with ``HEADERS`` and a 25-second
    timeout, ``raise_for_status()`` is called on the response, and the
    response text is written to the cache file before being parsed. On a
    cache hit the file is read as UTF-8 with undecodable bytes ignored.
    """
    cached = cache_path(url)
    if not cached.exists():
        response = requests.get(url, headers=HEADERS, timeout=25)
        response.raise_for_status()
        cached.write_text(response.text, encoding="utf-8")
        return BeautifulSoup(response.text, "lxml")

    markup = cached.read_text(encoding="utf-8", errors="ignore")
    return BeautifulSoup(markup, "lxml")

def pick_vu_base(info_links):
    """Return the vu.nl bachelor-programme base URL found in ``info_links``.

    The first link containing both ``"vu.nl"`` and ``"/education/bachelor/"``
    is used. Its fragment and query string are dropped, trailing slashes are
    removed, and a trailing ``/curriculum``, ``/future``, ``/admissions`` or
    ``/careers`` segment is stripped, so that sub-page paths can be appended
    to the result.

    Args:
        info_links: Iterable of URL strings. Falsy entries are skipped and
            ``None`` is treated as an empty iterable.

    Returns:
        The normalised base URL, or ``None`` when no link qualifies.
    """
    sub_pages = ("/curriculum", "/future", "/admissions", "/careers")
    for link in info_links or ():
        if not link:
            continue
        if "vu.nl" in link and "/education/bachelor/" in link:
            # Drop the fragment before the query: in a URL the fragment comes
            # last, and leaving it in place would hide a sub-page tail such
            # as ".../curriculum#year-1" from the endswith() checks below.
            base = link.split("#", 1)[0].split("?", 1)[0].rstrip("/")
            for tail in sub_pages:
                if base.endswith(tail):
                    base = base[: -len(tail)]
            return base
    return None

def parse_vu_base(base_url):
    """Read the programme description from the page at ``base_url``.

    Returns:
        ``{"vunl_description": text}`` where ``text`` is the stripped text
        of the first ``.xlarge-offset-0`` element, or ``""`` if none exists.
    """
    node = static_soup(base_url).select_one(".xlarge-offset-0")
    description = node.get_text(strip=True) if node else ""
    return {"vunl_description": description}

def parse_vu_curriculum(base_url):
    """Parse the ``/curriculum`` sub-page of a vu.nl programme.

    Returns:
        A dict with
        ``vunl_description_curriculum``: text of the first
        ``.xlarge-offset-0`` element (``""`` if absent);
        ``vunl_firstyear_description_blocks``: text of every ``.xxlarge-6``
        element;
        ``vunl_subjects``: one dict per ``.vuw-card-border-left`` card whose
        first text line is not a minor, holding ``course_name`` plus the
        integers found after "ECTS" / "Period" / "Year" patterns (None when
        a pattern does not match).
    """
    soup = static_soup(base_url.rstrip("/") + "/curriculum")

    intro = soup.select_one(".xlarge-offset-0")
    description = intro.get_text(strip=True) if intro else ""

    blocks = [blk.get_text(strip=True) for blk in soup.select(".xxlarge-6")]

    def captured_int(pattern, text):
        """Group 1 of a case-insensitive search as int, or None on no match."""
        found = re.search(pattern, text, flags=re.I)
        return int(found.group(1)) if found else None

    subjects = []
    for card in soup.select(".vuw-card-border-left"):
        card_text = card.get_text("\n", strip=True)
        course = card_text.split("\n")[0].strip()  # first line is taken as the name
        if is_minor_text(course):
            continue
        subjects.append({
            "course_name": course,
            "ects": captured_int(r"(\d+)\s*ECTS", card_text),
            "period": captured_int(r"Period\s*([0-9]+)", card_text),
            "year": captured_int(r"Year\s*([0-9]+)", card_text),
        })

    return {
        "vunl_description_curriculum": description,
        "vunl_firstyear_description_blocks": blocks,
        "vunl_subjects": subjects,
    }

def parse_vu_future(base_url):
    """Parse the ``/future`` sub-page of a vu.nl programme.

    Returns:
        A dict with ``vunl_future_description`` (text of the first
        ``.xlarge-offset-0`` element) and ``vunl_future_career`` (text of
        the first ``.large-6 + .large-6 .vuw-p-6`` element, joined with
        spaces). Either value is ``""`` when its element is missing.
    """
    soup = static_soup(base_url.rstrip("/") + "/future")

    intro = soup.select_one(".xlarge-offset-0")
    career_box = soup.select_one(".large-6 + .large-6 .vuw-p-6")

    return {
        "vunl_future_description": intro.get_text(strip=True) if intro else "",
        "vunl_future_career": career_box.get_text(" ", strip=True) if career_box else "",
    }

def parse_vu_admissions(base_url):
    """Parse the ``/admissions`` sub-page of a vu.nl programme.

    Returns:
        ``{"vunl_admission_dutch_diploma": text}`` where ``text`` is the
        first ``.vuw-rich-text`` block mentioning one of the keywords
        below, or ``""`` when no block does.
    """
    soup = static_soup(base_url.rstrip("/") + "/admissions")
    blocks = [node.get_text(" ", strip=True) for node in soup.select(".vuw-rich-text")]

    keywords = ("Dutch", "VWO", "Dutch diploma")
    first_hit = next(
        (block for block in blocks if any(word in block for word in keywords)),
        "",
    )
    return {"vunl_admission_dutch_diploma": first_hit}


In [6]:
# generate a smaller sample of the data
# NOTE: this rebinds `programmes` itself, so the full list collected earlier
# is no longer reachable under that name after this cell has run. Re-run the
# collection cell (or skip this one) to process every programme.
programmes = programmes[:2]  # limit to first 2 for testing

## 6. Main pipeline, all tracks and years, plus vu.nl enrichment

In [None]:
# Accumulators: one dict per programme, and one dict per course row.
rows_prog = []
rows_subj = []


def _try_enrich(step, vu_base, title):
    """Run one vu.nl parser and return its dict, or ``{}`` if the fetch fails.

    The vu.nl pages are optional extras. A missing sub-page or a network
    error there must not abort the whole scrape and discard the studiegids
    rows collected so far, so ``requests`` errors are logged and skipped.
    """
    try:
        return step(vu_base)
    except requests.RequestException as exc:
        # tqdm.write prints without breaking the progress bar.
        tqdm.write(f"[vu.nl] {step.__name__} skipped for {title!r}: {exc}")
        return {}


for item in tqdm(programmes, desc="Programmes"):
    url = item["url"]
    title = item["title"]

    prog = {"programme_title": title, "programme_url": url}

    # studiegids tab two for description and info links
    base = url.split("#/tab=")[0]
    driver.get(f"{base}#/tab=2")
    dismiss_cookies()
    time.sleep(0.8)
    try:
        prog["sg_description"] = driver.find_element(By.CSS_SELECTOR, "#study-description").text.strip()
    except Exception:
        prog["sg_description"] = ""
    info_links = []
    try:
        info_block = driver.find_element(By.CSS_SELECTOR, ".info")
        anchors = info_block.find_elements(By.CSS_SELECTOR, "a[href]")
        info_links = [a.get_attribute("href") for a in anchors]
    except Exception:
        # No info block on this page: continue with an empty link list.
        pass
    prog["info_links"] = info_links

    # subjects from all tracks and years
    all_rows = parse_studiegids_all_tracks_years(url)
    for r in all_rows:
        rows_subj.append({
            "programme_title": title,
            "programme_url": url,
            "track": r.get("track", ""),
            "year_label": r.get("year_label", ""),
            "course_name": r.get("course_name"),
            "period": r.get("period"),
            "ects": r.get("ects"),
            "code": r.get("code")
        })

    # vu.nl enrichment (best effort: a failed page leaves its columns unset)
    vu_base = pick_vu_base(info_links)
    prog["vunl_base_url"] = vu_base
    if vu_base:
        prog.update(_try_enrich(parse_vu_base, vu_base, title))
        cur = _try_enrich(parse_vu_curriculum, vu_base, title)
        # The per-course list from vu.nl is not stored on the programme row.
        prog.update({k: v for k, v in cur.items() if k != "vunl_subjects"})
        fut = _try_enrich(parse_vu_future, vu_base, title)
        prog.update(fut)
        prog.update(_try_enrich(parse_vu_admissions, vu_base, title))

    rows_prog.append(prog)

# One row per programme URL; exact duplicate course rows are removed.
df_prog = pd.DataFrame(rows_prog).drop_duplicates(subset=["programme_url"]).reset_index(drop=True)
df_subj = pd.DataFrame(rows_subj).drop_duplicates().reset_index(drop=True)

print("Programmes shape", df_prog.shape)
print("Subjects shape", df_subj.shape)

df_prog.head(2), df_subj.head(10)


Programmes:   0%|          | 0/2 [00:00<?, ?it/s]