# Scraping of Programmes and Courses Data

This notebook pipeline:

1) Loads both result pages of the Studiegids bachelor listing and collects all programme URLs
2) Clicks through every programme, then every track, then every year, and reads the visible tables
3) Enriches each programme with text from the vu.nl base, curriculum, future, and admissions pages
4) Saves tidy CSV and JSON files for programmes and subjects

## 1. Imports and setup
Sets folders, constants, and starts Selenium Chrome.


In [1]:
# standard libs
import re
import time
import json
import pathlib
from urllib.parse import unquote
import requests
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm

# selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


In [2]:

# Folders: data/ is created next to the current working directory
# (pathlib.Path() with no arguments resolves to the cwd).
BASE_DIR = pathlib.Path().resolve()
DATA_DIR = BASE_DIR / "data"
DATA_DIR.mkdir(exist_ok=True)

# Start page of the Studiegids bachelor listing and the headers sent with
# plain `requests` calls.
LISTING_URL = "https://studiegids.vu.nl/en/bachelor/2025-2026#/"
HEADERS = {"User-Agent": "Mozilla/5.0"}

# Selenium driver: a single Chrome session shared (as the global `driver`)
# by every helper below.
chrome_options = Options()
# chrome_options.add_argument("--headless=new")  # uncomment for headless runs
chrome_options.add_argument("--start-maximized")
driver = webdriver.Chrome(options=chrome_options)
# Shared explicit wait with a 25 second timeout.
# NOTE(review): the helpers visible in this notebook build their own
# WebDriverWait instances; confirm whether `wait` is still used anywhere.
wait = WebDriverWait(driver, 25) #in seconds


## 2. Selenium helpers
Utility functions for waiting, clicking, and dismissing cookies.

In [3]:
def q(css, timeout=25):
    """Wait until an element matching ``css`` is present and return it.

    ``timeout`` is the maximum wait in seconds handed to WebDriverWait.
    """
    locator = (By.CSS_SELECTOR, css)
    waiter = WebDriverWait(driver, timeout)
    return waiter.until(EC.presence_of_element_located(locator))

def q_all(css, timeout=25):
    """Wait until elements matching ``css`` are present, then return them all.

    The list is fetched with a fresh ``find_elements`` call after the wait
    succeeds rather than taken from the wait's own result.
    """
    locator = (By.CSS_SELECTOR, css)
    presence = EC.presence_of_all_elements_located(locator)
    WebDriverWait(driver, timeout).until(presence)
    return driver.find_elements(*locator)

def click_el(el):
    """Scroll ``el`` to the vertical centre of the viewport, pause, then click it."""
    scroll_script = "arguments[0].scrollIntoView({block:'center'});"
    driver.execute_script(scroll_script, el)
    # brief pause between scrolling and the native click
    time.sleep(0.2)
    el.click()

def try_click(css, timeout=3):
    """Best-effort click on the element matching ``css``.

    Waits up to ``timeout`` seconds for the element to become clickable.
    Returns True when the click went through and False on any failure
    (timeout, click error, ...); it never raises.
    """
    locator = (By.CSS_SELECTOR, css)
    try:
        target = WebDriverWait(driver, timeout).until(
            EC.element_to_be_clickable(locator)
        )
        click_el(target)
    except Exception:
        return False
    return True

def dismiss_cookies():
    """Try to click a cookie-consent button on the current page.

    The main document is searched first. If no button could be clicked
    there, every ``iframe`` found on the page is entered in turn and searched
    the same way. The driver is always switched back to the top-level
    document after an iframe has been visited. Failures are swallowed: if no
    consent button is found the function simply returns.
    """
    # Tried in this order; each candidate gets its own 2 second wait.
    consent_xpaths = (
        "//button[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz'),'accept')]",
        "//button[contains(., 'Akkoord')]",
        "//button[contains(., 'Alles accepteren')]",
        "//button[contains(., 'Accept all')]",
        "//button[contains(., 'Accept')]",
    )

    def press_consent():
        # Click the first candidate that becomes clickable; report success.
        for locator in consent_xpaths:
            try:
                button = WebDriverWait(driver, 2).until(
                    EC.element_to_be_clickable((By.XPATH, locator))
                )
                click_el(button)
            except Exception:
                continue
            return True
        return False

    if press_consent():
        return

    for frame in driver.find_elements(By.CSS_SELECTOR, "iframe"):
        try:
            driver.switch_to.frame(frame)
            if press_consent():
                driver.switch_to.default_content()
                return
        except Exception:
            driver.switch_to.default_content()
        finally:
            # always leave the iframe before looking at the next one
            driver.switch_to.default_content()


# helper to detect Minor sections
def is_minor_text(txt):
    """Return True when ``txt`` contains the whole word "minor" (any case).

    ``None`` and empty strings are treated as not matching.
    """
    haystack = txt or ""
    return re.search(r"\bminor\b", haystack, flags=re.I) is not None

def safe_text(el, sel=".accordion-title"):
    """Return a display label for ``el`` without ever raising.

    Preference order:
      1. stripped text of the first descendant matching ``sel`` (if non-empty)
      2. stripped text of ``el`` itself
      3. the empty string when both lookups fail
    """
    try:
        title = el.find_element(By.CSS_SELECTOR, sel).text.strip()
    except Exception:
        title = ""
    if title:
        return title

    try:
        return el.text.strip()
    except Exception:
        return ""



## 3. Listing filters and pagination

Opens the bachelor listing and applies the English-language and faculty filters. It then scrolls to the bottom of the page and clicks page two inside the pagination bar.
Collects programme titles and URLs across both pages.

In [3]:
# Faculties to tick in the listing's Faculty filter.
# Each entry is matched with XPath contains() against the text of a <label>
# in open_listing_and_filter(), so it must appear in the visible label text.
faculties_to_include = ["School of Business and Economics", "Faculty of Science", "Faculty of Humanities"]


def open_listing_and_filter():
    """Open the bachelor listing and apply the language and faculty filters.

    Steps: load LISTING_URL, dismiss the cookie banner, wait until either the
    filter dropdowns or some search results exist, tick "English" in the
    Language dropdown, then tick every entry of ``faculties_to_include`` in
    the Faculty dropdown. Ends with a one second pause.
    """

    def click_xpath(xpath, timeout):
        # Wait for the element to be clickable, then click it via click_el.
        target = WebDriverWait(driver, timeout).until(
            EC.element_to_be_clickable((By.XPATH, xpath))
        )
        click_el(target)

    def listing_ready(d):
        # Truthy as soon as dropdown titles or result cards are in the DOM.
        return (
            d.find_elements(By.CSS_SELECTOR, "div.sg-dropdown-title")
            or d.find_elements(By.CSS_SELECTOR, ".sg-search-result")
        )

    driver.get(LISTING_URL)
    dismiss_cookies()
    WebDriverWait(driver, 30).until(listing_ready)

    # language of the courses: open the dropdown, then tick English
    click_xpath(
        "//div[contains(@class,'sg-dropdown-title')][.//span[contains(., 'Language')]]",
        20,
    )
    english_ticked = try_click("#LanguageEN0 + label", timeout=2)
    if not english_ticked:
        # fallback: locate the option by its (case-insensitive) label text
        click_xpath(
            "//label[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz'), 'english')]",
            10,
        )

    # faculty: open the dropdown, then tick every requested faculty
    click_xpath(
        "//div[contains(@class,'sg-dropdown-title')][.//span[contains(., 'Faculty')]]",
        20,
    )
    for faculty_name in faculties_to_include:
        click_xpath(f"//label[contains(., '{faculty_name}')]", 10)

    time.sleep(1.0)

def collect_programmes_two_pages():
    """Collect programme cards from page one and page two of the listing.

    Returns a list of ``{"title": ..., "url": ...}`` dicts, de-duplicated by
    URL. Page two is reached by clicking the "2" entry of the paginator
    (falling back to a "Next" control). Any failure while paging is
    swallowed, in which case only the cards read so far are returned.
    """
    collected = []
    known_urls = set()

    def read_cards():
        # Read every result card currently in the DOM, skipping known URLs
        # and cards that cannot be parsed.
        fresh = []
        for card in driver.find_elements(By.CSS_SELECTOR, ".sg-search-result"):
            try:
                heading = card.find_element(By.CSS_SELECTOR, ".sg-mb-1")
                title = heading.text.strip()
                try:
                    link = heading.find_element(By.CSS_SELECTOR, "a[href]")
                except Exception:
                    # no link inside the heading: take the first link of the card
                    link = card.find_element(By.CSS_SELECTOR, "a[href]")
                href = link.get_attribute("href")
            except Exception:
                continue
            if not href or href in known_urls:
                continue
            known_urls.add(href)
            fresh.append({"title": title, "url": href})
        return fresh

    def click_and_settle(el):
        # Scroll to, click, and give the listing time to re-render.
        click_el(el)
        time.sleep(1.2)

    # page one
    collected.extend(read_cards())

    # scroll to reveal paginator
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(1.0)

    try:
        # paginator is located through one fixed, fully qualified selector
        nav = driver.find_element(
            By.CSS_SELECTOR,
            "body > div > div > div.grid-container > div > div > div > div.cell.small-12.large-8.xlarge-9 > div.sg-mt-2.sg-mt-m-3.sg-mt-l-8 > nav.sg-pagination.sg-mt-7.sg-mt-m-6.show-for-medium"
        )
        # candidates for the "2" control: links, then buttons, then aria-labels
        page_two_controls = (
            nav.find_elements(By.XPATH, ".//a[normalize-space()='2']")
            + nav.find_elements(By.XPATH, ".//button[normalize-space()='2']")
            + nav.find_elements(By.CSS_SELECTOR, "a[aria-label*='2'], button[aria-label*='2']")
        )

        moved = False
        for control in page_two_controls:
            try:
                click_and_settle(control)
            except Exception:
                continue
            moved = True
            break

        if not moved:
            # fallback: a "next page" control
            next_selectors = (
                "a[aria-label*='Next']",
                "button[aria-label*='Next']",
                "li.sg-pagination__next a",
                "button.sg-pagination__next",
            )
            for selector in next_selectors:
                try:
                    click_and_settle(nav.find_element(By.CSS_SELECTOR, selector))
                except Exception:
                    continue
                moved = True
                break

        if moved:
            collected.extend(read_cards())
    except Exception:
        pass

    return collected

# Run the listing step: apply the filters, then gather the programme cards
# from both result pages into `programmes` (list of {"title", "url"} dicts).
open_listing_and_filter()
programmes = collect_programmes_two_pages()
print("Programmes found:", len(programmes))
# Last expression of the cell: shows the collected titles as cell output.
[itm["title"] for itm in programmes]


Programmes found: 17


['Ancient Studies',
 'Archaeology',
 'Artificial Intelligence',
 'Biomedical Sciences',
 'Business Analytics',
 'Communication and Information Studies',
 'Computer Science',
 'Econometrics and Data Science',
 'Econometrics and Operations Research',
 'Economics and Business Economics',
 'History',
 'International Business Administration',
 'Literature and Society',
 'Mathematics',
 'Media, Art, Design and Architecture',
 'Philosophy',
 'Philosophy, Politics and Economics']

## 4. Studiegids parsers: all tracks and years

Opens the curriculum tab, expands every track, then every year.
Skips sections labeled Minor and reads only visible tables.
Extracts course name, period, ECTS, code, track, and year label.

In [4]:
def open_tab_three(url):
    """Navigate to tab 3 of the programme page behind ``url``.

    Anything from ``#/tab=`` onwards is dropped from ``url`` before
    ``#/tab=3`` is appended. Dismisses the cookie banner and pauses briefly
    afterwards.
    """
    programme_root = url.partition("#/tab=")[0]
    driver.get(f"{programme_root}#/tab=3")
    dismiss_cookies()
    time.sleep(0.8)

def list_tracks():
    """Return the top-level accordion entries of the study programme.

    Entries whose label contains the word "minor" are left out.
    """
    candidates = driver.find_elements(
        By.CSS_SELECTOR, "#study-program .accordion > div"
    )
    return [node for node in candidates if not is_minor_text(safe_text(node))]

def expand_if_collapsed(container):
    """Open the accordion ``container`` unless its content is already shown.

    When the first ``.accordion-content`` descendant exists and is displayed
    nothing happens. Otherwise the first clickable toggle is pressed: a
    ``button`` is tried first, then ``.accordion-title``. Click failures are
    ignored.
    """
    try:
        panel = container.find_element(By.CSS_SELECTOR, ".accordion-content")
    except Exception:
        panel = None

    already_open = bool(panel) and panel.is_displayed()
    if already_open:
        return

    for toggle_css in ("button", ".accordion-title"):
        try:
            toggle = container.find_element(By.CSS_SELECTOR, toggle_css)
            click_el(toggle)
            time.sleep(0.8)  # give the accordion time to unfold
        except Exception:
            continue
        break


def list_year_items(track_container):
    """Return the nested accordion entries found inside a track container.

    Entries labelled as a minor are skipped. An empty list is returned when
    the lookup itself fails.
    """
    try:
        nested = track_container.find_elements(
            By.CSS_SELECTOR, ".accordion .accordion-item, .accordion > div"
        )
    except Exception:
        return []
    return [entry for entry in nested if not is_minor_text(safe_text(entry))]

def parse_visible_tables(scope_container):
    """Read course rows from the displayed tables below ``scope_container``.

    Only ``.accordion-content`` panels that are displayed are read; when
    none is displayed the first panel is used instead. Table columns are
    taken by position: 0 = course name, 1 = period, 2 = ECTS, 3 = code.
    Period and ECTS are the first run of digits in their cell (or None).
    Rows without a name are dropped.

    Returns a list of dicts with keys ``course_name``, ``period``, ``ects``
    and ``code``.
    """

    def cell_text(cell):
        # Prefer the text of a link inside the cell, else the cell's own text.
        try:
            return cell.find_element(By.CSS_SELECTOR, "a").text.strip()
        except Exception:
            return cell.text.strip()

    def first_int(cell):
        # First run of digits in the cell text as int, or None.
        found = re.search(r"(\d+)", cell_text(cell))
        return int(found.group(1)) if found else None

    panels = scope_container.find_elements(By.CSS_SELECTOR, ".accordion-content")
    shown = [panel for panel in panels if panel.is_displayed()]
    if panels and not shown:
        shown = panels[:1]

    parsed = []
    for panel in shown:
        for body in panel.find_elements(By.CSS_SELECTOR, "table tbody"):
            if not body.is_displayed():
                continue
            for table_row in body.find_elements(By.CSS_SELECTOR, "tr"):
                cells = table_row.find_elements(By.CSS_SELECTOR, "td")
                if not cells:
                    continue
                width = len(cells)
                name = cell_text(cells[0])
                period = first_int(cells[1]) if width > 1 else None
                ects = first_int(cells[2]) if width > 2 else None
                code = cell_text(cells[3]) if width > 3 else ""
                if name:
                    parsed.append(
                        {"course_name": name, "period": period, "ects": ects, "code": code}
                    )
    return parsed

def parse_studiegids_all_tracks_years(url):
    """Collect the course rows of every track and year of one programme.

    Opens tab 3 of the programme, expands each non-minor track and, inside
    it, each non-minor year entry, and reads the visible tables. Tracks
    without year entries are read directly and get an empty ``year_label``.

    Returns the row dicts of ``parse_visible_tables`` extended with the keys
    ``track`` and ``year_label``. A row is emitted only once per
    (track[, year], course name, code) combination.
    """
    open_tab_three(url)

    collected = []
    seen_keys = set()  # guards against double parsing if DOM reflows

    def keep_new(rows, key_prefix, track_label, year_label):
        # Tag unseen rows with their track/year and add them to the result.
        for row in rows:
            key = key_prefix + (row.get("course_name"), row.get("code"))
            if key in seen_keys:
                continue
            seen_keys.add(key)
            row["track"] = track_label
            row["year_label"] = year_label
            collected.append(row)

    for track in list_tracks():
        track_label = safe_text(track)
        if is_minor_text(track_label):
            continue

        expand_if_collapsed(track)
        years = list_year_items(track)

        if not years:
            # table directly under track
            keep_new(parse_visible_tables(track), (track_label,), track_label, "")
            continue

        for year in years:
            year_label = safe_text(year)
            if is_minor_text(year_label):
                continue
            expand_if_collapsed(year)
            keep_new(
                parse_visible_tables(year),
                (track_label, year_label),
                track_label,
                year_label,
            )

    return collected



## 5. vu.nl helpers with cache

Finds the base vu.nl URL from studiegids info links.
Then it fetches base, curriculum, future, and admissions pages with caching.
Parses descriptions and optional curriculum cards as fallback.

In [5]:
# Folder holding the raw HTML of fetched vu.nl pages (one file per URL),
# created on first run.
CACHE_DIR = DATA_DIR / "vu_cache"
CACHE_DIR.mkdir(exist_ok=True)

def cache_path(url):
    """Return the cache file path used for ``url``.

    The file name is the URL with every run of non-alphanumeric characters
    replaced by ``_``, truncated to 180 characters, plus ``.html``.

    NOTE(review): because of the truncation, two long URLs sharing their
    first 180 characters map to the same cache file.
    """
    slug = re.sub(r"[^a-zA-Z0-9]+", "_", url.strip("/"))[:180]
    return CACHE_DIR / f"{slug}.html"

def static_soup(url):
    """Return the parsed HTML of ``url``, or None when it cannot be fetched.

    A cached copy under CACHE_DIR is used when present. Otherwise the page
    is downloaded with ``requests`` (25 second timeout), stored in the cache
    and parsed. Only successful responses are cached.

    Returning None on failure (HTTP error status, connection problem,
    timeout, ...) matches what the callers in this notebook test for
    (``soup is None``) and lets ``discover_section_url`` move on to its next
    candidate URL instead of aborting on the first 404.
    """
    fp = cache_path(url)
    if fp.exists():
        html = fp.read_text(encoding="utf-8", errors="ignore")
        return BeautifulSoup(html, "lxml")

    try:
        r = requests.get(url, headers=HEADERS, timeout=25)
        r.raise_for_status()
    except requests.RequestException:
        # covers HTTPError from raise_for_status as well as network errors
        return None

    fp.write_text(r.text, encoding="utf-8")
    return BeautifulSoup(r.text, "lxml")

def pick_vu_base(info_links):
    """Return the base vu.nl bachelor URL found in ``info_links``, else None.

    The first link containing both ``vu.nl`` and ``/education/bachelor/`` is
    used. Its query string and trailing slashes are removed, and known
    section suffixes are stripped (checked one after another, in order) so
    that the programme's base page remains.
    """
    section_suffixes = ("/curriculum", "/future", "/admissions", "/careers")

    matching = (
        link
        for link in info_links
        if link and "vu.nl" in link and "/education/bachelor/" in link
    )
    first = next(matching, None)
    if first is None:
        return None

    root = first.partition("?")[0].rstrip("/")
    for suffix in section_suffixes:
        if root.endswith(suffix):
            root = root[: len(root) - len(suffix)]
    return root

def parse_vu_base(base_url):
    """Return ``{"vunl_description": text}`` for the programme's base page.

    The text is taken from the first ``.xlarge-offset-0`` element. It is
    empty when the page or the element is not available.
    """
    description = ""
    page = static_soup(base_url)
    if page is not None:
        intro = page.select_one(".xlarge-offset-0")
        if intro:
            description = intro.get_text(strip=True)
    return {"vunl_description": description}

def parse_vu_curriculum(base_url):
    """Parse the curriculum page of a programme on vu.nl.

    Returns a dict with
      * ``vunl_description_curriculum``: text of the first ``.xlarge-offset-0``
      * ``vunl_firstyear_description_blocks``: texts of all ``.xxlarge-6`` blocks
      * ``vunl_subjects``: one dict per ``.vuw-card-border-left`` card with the
        keys ``course_name`` (first text line of the card), ``ects``,
        ``period`` and ``year`` (ints parsed from the card text, or None);
        cards whose name contains the word "minor" are skipped.
    All three values are empty when no curriculum page could be found.
    """
    _, page = discover_section_url(base_url, "curriculum")
    if page is None:
        return {"vunl_description_curriculum": "", "vunl_firstyear_description_blocks": [], "vunl_subjects": []}

    def int_after(pattern, text):
        # First capture group of ``pattern`` in ``text`` as int, or None.
        hit = re.search(pattern, text, flags=re.I)
        return int(hit.group(1)) if hit else None

    intro = page.select_one(".xlarge-offset-0")
    description = intro.get_text(strip=True) if intro else ""
    year_blocks = [block.get_text(strip=True) for block in page.select(".xxlarge-6")]

    subjects = []
    for card in page.select(".vuw-card-border-left"):
        card_text = card.get_text("\n", strip=True)
        title = card_text.split("\n")[0].strip()
        # skip minors
        if re.search(r"\bminor\b", title, flags=re.I):
            continue
        subjects.append({
            "course_name": title,
            "ects": int_after(r"(\d+)\s*ECTS", card_text),
            "period": int_after(r"Period\s*([0-9]+)", card_text),
            "year": int_after(r"Year\s*([0-9]+)", card_text),
        })

    return {
        "vunl_description_curriculum": description,
        "vunl_firstyear_description_blocks": year_blocks,
        "vunl_subjects": subjects,
    }

def parse_vu_future(base_url):
    """Parse the "future" (career) page of a programme on vu.nl.

    Returns ``vunl_future_description`` (first ``.xlarge-offset-0`` element)
    and ``vunl_future_career`` (first ``.large-6 + .large-6 .vuw-p-6``
    element). Both are empty strings when the page or element is missing.
    """
    result = {"vunl_future_description": "", "vunl_future_career": ""}

    _, page = discover_section_url(base_url, "future")
    if page is None:
        return result

    intro = page.select_one(".xlarge-offset-0")
    if intro:
        result["vunl_future_description"] = intro.get_text(strip=True)

    career_box = page.select_one(".large-6 + .large-6 .vuw-p-6")
    if career_box:
        result["vunl_future_career"] = career_box.get_text(" ", strip=True)

    return result

def parse_vu_admissions(base_url):
    """Parse the admissions page of a programme on vu.nl.

    Returns ``{"vunl_admission_dutch_diploma": text}`` where ``text`` is the
    first ``.vuw-rich-text`` block mentioning one of the keywords below, or
    an empty string when no such block (or no admissions page) exists.
    """
    _, page = discover_section_url(base_url, "admissions")
    if page is None:
        return {"vunl_admission_dutch_diploma": ""}

    keywords = ("Dutch", "VWO", "Dutch diploma")
    block_texts = (
        block.get_text(" ", strip=True) for block in page.select(".vuw-rich-text")
    )
    first_match = next(
        (text for text in block_texts if any(word in text for word in keywords)),
        "",
    )
    return {"vunl_admission_dutch_diploma": first_match}


# find the best matching section url on the base page
def discover_section_url(base_url, want):
    """Find the sub-page of a programme that covers the topic ``want``.

    ``want`` can be 'curriculum', 'future', or 'admissions'.

    Strategy:
      1. Try common slugs appended to ``base_url``.
      2. If none can be fetched, scan the base page for anchors whose text
         or href contains one of the topic's keywords and try those links.

    A candidate that cannot be fetched (404, network error, unsupported
    scheme, ...) is skipped, so discovery always continues with the next one.

    Returns ``(url, soup)`` for the first page that loads, or
    ``(None, None)`` when nothing suitable is found. An unknown ``want``
    also yields ``(None, None)``.
    """
    # Local import keeps this block self-contained; the notebook's import
    # cell only pulls `unquote` from urllib.parse.
    from urllib.parse import urljoin

    def fetch(candidate_url):
        # static_soup may raise or return None on failure; treat both as a miss.
        try:
            return static_soup(candidate_url)
        except requests.RequestException:
            return None

    slug_map = {
        "curriculum": ["curriculum", "study-programme", "programme", "program"],
        "future": ["future", "your-future-career", "career"],
        "admissions": ["admissions", "admission", "how-to-apply", "apply"]
    }
    word_map = {
        "curriculum": ["curriculum", "study programme", "courses"],
        "future": ["future", "career", "after graduation"],
        "admissions": ["admissions", "admission", "apply"]
    }

    root = base_url.rstrip("/")

    # try common slugs first
    for slug in slug_map.get(want, []):
        url = root + "/" + slug
        soup = fetch(url)
        if soup is not None:
            return url, soup

    # fall back to scanning anchors on the base page
    base_soup = fetch(base_url)
    if base_soup is None:
        return None, None

    want_words = word_map.get(want, [])
    tried = set()  # avoid re-requesting a link that appears several times
    for a in base_soup.select("a[href]"):
        label = a.get_text(" ", strip=True).lower()
        href = a.get("href", "")
        href_l = href.lower()
        if not (any(w in label for w in want_words) or any(w in href_l for w in want_words)):
            continue

        # Resolve against the base page with a trailing slash: absolute URLs
        # stay untouched, "/path" resolves against the site root, and plain
        # relative links resolve below base_url.
        u = urljoin(root + "/", href)
        if not u.startswith(("http://", "https://")) or u in tried:
            continue
        tried.add(u)

        soup = fetch(u)
        if soup is not None:
            return u, soup

    return None, None





In [6]:
# Optional test switch: restrict the run to a small sample of programmes.
# Disabled alternative: keep a positional slice instead of matching titles.
#programmes = programmes[12:13]  # keeps only the programme at index 12

# Titles to keep in the sample.
TEST_TITLES = {
    "Biomedical Sciences"
    # Disabled entries; when enabling one, also add a comma after the
    # previous title.
#    "Philosophy, Politics and Economics",
 #   "Computer Science"
}
# Filter programmes to include only those in TEST_TITLES.
# NOTE: this overwrites `programmes`; re-run the listing cell to get the
# full list back.
programmes = [p for p in programmes if p["title"] in TEST_TITLES]

## 6. Main pipeline scrape and enrich

Loops over programmes, captures studiegids description and info links.
Then, it builds tidy rows for programmes and subjects.

In [19]:
# Accumulators: one dict per programme, one dict per (programme, course) row.
rows_prog = []
rows_subj = []

for item in tqdm(programmes, desc="Programmes"):
    url = item["url"]
    title = item["title"]

    prog = {"programme_title": title, "programme_url": url}

    # studiegids tab two for description and info links
    base = url.split("#/tab=")[0]
    driver.get(f"{base}#/tab=2")
    dismiss_cookies()
    time.sleep(0.8)
    # Description is best-effort: a missing element yields an empty string.
    try:
        prog["sg_description"] = driver.find_element(By.CSS_SELECTOR, "#study-description").text.strip()
    except Exception:
        prog["sg_description"] = ""
    # hrefs of all links inside the ".info" block; stays [] when the block is missing
    info_links = []
    try:
        info_block = driver.find_element(By.CSS_SELECTOR, ".info")
        anchors = info_block.find_elements(By.CSS_SELECTOR, "a[href]")
        info_links = [a.get_attribute("href") for a in anchors]
    except Exception:
        pass
    prog["info_links"] = info_links

    # subjects from all tracks and years
    all_rows = parse_studiegids_all_tracks_years(url)
    for r in all_rows:
        rows_subj.append({
            "programme_title": title,
            "programme_url": url,
            "track": r.get("track", ""),
            "year_label": r.get("year_label", ""),
            "course_name": r.get("course_name"),
            "period": r.get("period"),
            "ects": r.get("ects"),
            "code": r.get("code")
        })

    # vu.nl enrichment: each section is parsed independently so that a
    # failure in one of them only blanks that section's fields.
    vu_base = pick_vu_base(info_links)
    prog["vunl_base_url"] = vu_base
    if vu_base:
        try:
            prog.update(parse_vu_base(vu_base))
        except Exception:
            prog["vunl_description"] = ""
        try:
            cur = parse_vu_curriculum(vu_base)
            # the subject cards parsed from vu.nl are not stored on the programme row
            prog.update({k: v for k, v in cur.items() if k != "vunl_subjects"})
        except Exception:
            prog["vunl_description_curriculum"] = ""
            prog["vunl_firstyear_description_blocks"] = []
        try:
            fut = parse_vu_future(vu_base)
            prog.update(fut)
        except Exception:
            prog["vunl_future_description"] = ""
            prog["vunl_future_career"] = ""
        try:
            prog.update(parse_vu_admissions(vu_base))
        except Exception:
            prog["vunl_admission_dutch_diploma"] = ""

    rows_prog.append(prog)

# One row per programme URL; exact duplicate subject rows are removed.
df_prog = pd.DataFrame(rows_prog).drop_duplicates(subset=["programme_url"]).reset_index(drop=True)
df_subj = pd.DataFrame(rows_subj).drop_duplicates().reset_index(drop=True)

print("Programmes shape", df_prog.shape)
print("Subjects shape", df_subj.shape)

# Last expression of the cell: previews both frames as cell output.
df_prog.head(2), df_subj.head(10)

# NOTE(review): presumably the wall-clock duration of an earlier full run - TODO confirm
#time: 44.50 min


Programmes: 100%|██████████| 1/1 [01:13<00:00, 73.52s/it]

Programmes shape (1, 11)
Subjects shape (74, 8)





(       programme_title                                      programme_url  \
 0  Biomedical Sciences  https://studiegids.vu.nl/en/Bachelor/2025-2026...   
 
                                       sg_description  \
 0  The bachelor's programme Biomedical Sciences e...   
 
                                           info_links  \
 0  [https://science.vu.nl/en/index.aspx, https://...   
 
                                        vunl_base_url  \
 0  https://vu.nl/en/education/bachelor/biomedical...   
 
                                     vunl_description  \
 0  How does your body convert food into energy? W...   
 
                          vunl_description_curriculum  \
 0  How can you combat viruses? What happens in th...   
 
   vunl_firstyear_description_blocks  \
 0                                []   
 
                              vunl_future_description  \
 0  As a graduate of Biomedical Sciences, you can ...   
 
                                   vunl_future_career  \
 0  Sta

In [20]:
# Save the scraped programmes (CSV and JSON) and the raw subject rows (CSV)
# into DATA_DIR. The pdf_url column is only added in a later cell, so it is
# not part of these files.
df_prog.to_csv(DATA_DIR / "df_programmes_bronze.csv", index=False, encoding="utf-8-sig")
df_prog.to_json(DATA_DIR / "df_programmes_bronze.json", orient="records", force_ascii=False, indent=2)
df_subj.to_csv(DATA_DIR / "df_subj_temp_bronze.csv", index=False, encoding="utf-8-sig")

## 7. Clean df_subj 
Deduplicates the subjects and generates the derived columns Year and Track.

In [5]:
# Disabled step: drop rows where the track column contains the word Year.
#mask_keep = ~df_subj["track"].fillna("").str.contains(r"\byear\b", case=False, regex=True)
#df_subj = df_subj[mask_keep].copy()

# Concatenate the text columns track and year_label (missing values become
# empty strings, separated by one space) into a helper column called track1;
# parse_track_year is applied to this column in the next cell.
df_subj['track1'] = df_subj["track"].fillna("") + " " + df_subj["year_label"].fillna("")


In [6]:

# parser that extracts Track name and Year number from year_label
def parse_track_year(label):
    """Split a combined label into ``(track name, year number)``.

    Expected shapes look like
      "Bachelor Ancient Studies, Track Ancient Studies Year 1" or
      "Bachelor Ancient Studies, Specialization Archaeology Year 3".

    Returns a two-element ``pd.Series``. When only "Year <n>" can be found,
    the track is whatever follows the last comma before it (with a leading
    "Track"/"Specialization" removed), or None if nothing is left. Labels
    that are not strings, are blank, or contain no year give two missing
    values.
    """

    def pair(track, year):
        return pd.Series([track, year])

    if not isinstance(label, str) or not label.strip():
        return pair(None, None)

    # preferred shape: "<Track|Specialization> <name> Year <n>"
    full = re.search(r"(?:Track|Specialization)\s+(.+?)\s+Year\s+(\d+)", label, flags=re.I)
    if full:
        return pair(full.group(1).strip(), int(full.group(2)))

    # fallback: find the part before Year and strip a leading Track or Specialization
    year_only = re.search(r"Year\s+(\d+)", label, flags=re.I)
    if not year_only:
        return pair(None, None)

    last_segment = label[:year_only.start()].split(",")[-1].strip()
    remainder = re.sub(r"^(Track|Specialization)\s+", "", last_segment, flags=re.I).strip()
    return pair(remainder or None, int(year_only.group(1)))

# Parse every combined label; the two Series elements become the columns
# track_from_label and year_num.
df_subj[["track_from_label", "year_num"]] = df_subj["track1"].apply(parse_track_year)

# optional: fill missing track_from_label with the existing track text
df_subj["track_from_label"] = df_subj["track_from_label"].fillna(df_subj["track"])

# ensure numeric types where possible ("Int64" is pandas' nullable integer
# dtype, so rows without a year stay missing instead of forcing floats)
df_subj["year_num"] = pd.to_numeric(df_subj["year_num"], errors="coerce").astype("Int64")

# quick check
# NOTE: this expression is not the last statement of the cell, so its result
# is not displayed when the cell runs.
df_subj[["programme_title", "track", "year_label", "track_from_label", "year_num"]].head(8)

# remove columns no longer needed
# NOTE(review): track_from_label is dropped here as well, so only year_num
# survives from this step - confirm that this is intended.
df_subj = df_subj.drop(columns=["track", "year_label", "track1", "track_from_label"])



## 8. Download programme PDFs from info_links into data/bachelor_programs_pdfs

In [7]:
# download first pdf link from info_links for each programme
import re, ast, requests
from urllib.parse import urlparse
# Target folder for programme PDFs, created if it does not exist yet.
PDF_DIR = DATA_DIR / "bachelor_programs_pdfs"
PDF_DIR.mkdir(exist_ok=True)

def sanitize_filename(s):
    """Return ``s`` with characters that are unsafe in file names replaced.

    Each of ``\\ / * ? : " < > |`` becomes an underscore; leading and
    trailing whitespace is removed afterwards.
    """
    unsafe = re.compile(r'[\\/*?:"<>|]')
    cleaned = unsafe.sub("_", s)
    return cleaned.strip()

def first_pdf_url(links):
    """Return the first link in ``links`` whose URL path ends in ``.pdf``.

    ``links`` may be a list of URLs, a single URL string, or the string
    representation of a list (as produced when a list column is round-tripped
    through CSV). Non-string entries are ignored. Returns None when ``links``
    is None, is of another type, or contains no PDF link. The check looks at
    the URL path only (case-insensitively), so query strings do not matter.
    """
    if links is None:
        return None

    if isinstance(links, str):
        raw = links
        # try to parse as a list representation; anything else is one URL
        try:
            decoded = ast.literal_eval(raw)
        except Exception:
            decoded = None
        links = decoded if isinstance(decoded, list) else [raw]

    if not isinstance(links, list):
        return None

    pdf_links = (
        candidate
        for candidate in links
        if isinstance(candidate, str)
        and urlparse(candidate).path.lower().endswith(".pdf")
    )
    return next(pdf_links, None)

# Pick the first pdf url per programme (None when a programme has none).
# Requires df_prog from the main pipeline cell; running this cell in a fresh
# kernel without it raises a NameError.
df_prog["pdf_url"] = df_prog["info_links"].apply(first_pdf_url)


NameError: name 'df_prog' is not defined

## 9. Scrape data from courses pages

### Build df_courses seed from df_subj
Creates a unique list of course codes and a resolvable course URL per code.
Uses the first programme URL that contains the code to build the course URL.

In [8]:
# Build df_courses with unique course codes and a resolvable course URL.
# Keep rows with a non-empty code and only the first occurrence of each
# code, so each course is paired with the first programme URL it appears under.
codes = (
    df_subj[["programme_url","code"]]
    .dropna(subset=["code"])
    .query("code.str.strip() != ''", engine="python")
    .drop_duplicates(subset=["code"])
    .copy()
)

# construct a course URL using the first programme base for that code
def make_course_url(row):
    """Build the course page URL for one row with ``programme_url`` and ``code``.

    The programme URL is cut at its first ``#`` and stripped of trailing
    slashes; the course code and ``#/`` are then appended.
    """
    programme_root = row["programme_url"].partition("#")[0].rstrip("/")
    course_code = row["code"]
    return f"{programme_root}/{course_code}#/"
# Compute the course URL row by row (axis=1 hands each row to the function).
codes["course_url"] = codes.apply(make_course_url, axis=1)

# Seed frame for the course scrape: one row per code with its URL.
df_courses = codes[["code","course_url"]].reset_index(drop=True)
#df_courses.head()


## 10. Course scraper helpers

Adds safe text helpers and robust table readers.
Expands any dropdowns, then collects programme names.
Provides a generic label finder inside any table and a pdf finder that scans the page.

In [9]:
# helpers for safe text, label search, and pdf discovery
import re, time, json, requests, os
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

# Target folder for course PDFs, created if it does not exist yet.
COURSE_PDF_DIR = DATA_DIR / "bachelor_courses_pdf"
COURSE_PDF_DIR.mkdir(exist_ok=True)

def txt(el):
    """Return the stripped ``.text`` of ``el``, or "" if it cannot be read."""
    try:
        raw = el.text
        return raw.strip()
    except Exception:
        return ""

def first_text_or_blank(els):
    """Return the text of the first element in ``els``, or "" when it is empty."""
    if not els:
        return ""
    return txt(els[0])

def wait_for_course_dom():
    """Block (up to 20 seconds) until the course page shows some content.

    Content means at least one ``table`` element or one ``.paragraph`` block.
    """

    def has_content(d):
        tables = d.find_elements(By.CSS_SELECTOR, "table")
        return tables or d.find_elements(By.CSS_SELECTOR, ".paragraph")

    WebDriverWait(driver, 20).until(has_content)

def tables_on_page():
    """Return every ``table`` element currently present on the page."""
    table_locator = (By.CSS_SELECTOR, "table")
    return driver.find_elements(*table_locator)

def find_value_by_label(labels):
    """Look up a value in the page's tables by the label of its row.

    All tables are searched row by row. The first row whose combined cell
    text contains any of ``labels`` (case-insensitive substring match) wins:
      * with two or more cells, the text of the last cell is returned
      * with a single cell, that cell's text is returned

    Returns "" when no row matches.

    Note: the single-cell case returns the cell text in its original
    casing. The lower-cased string is used for matching only.
    """
    wanted = [label.lower() for label in labels]
    for tbl in tables_on_page():
        for r in tbl.find_elements(By.CSS_SELECTOR, "tr"):
            cells = r.find_elements(By.CSS_SELECTOR, "td, th")
            if not cells:
                continue
            cell_texts = [txt(c) for c in cells]
            # lower-cased haystack over the whole row, used for matching only
            haystack = " | ".join(cell_texts).lower()
            if not any(label in haystack for label in wanted):
                continue
            if len(cells) >= 2:
                # assume last cell carries the value
                return cell_texts[-1]
            return cell_texts[0]
    return ""

def teaching_method_from_context():
    """Return the teaching method(s) listed on the current course page.

    First choice: the first table that follows (as a sibling) a paragraph
    containing "Teaching method"; its non-empty cells are joined with "; ".
    If that fails or yields nothing, fall back to the label-based lookup in
    any table.
    """
    try:
        heading = driver.find_element(By.XPATH, "//p[contains(., 'Teaching method')]")
        table = heading.find_element(By.XPATH, "following-sibling::table[1]")
        values = [txt(td) for td in table.find_elements(By.CSS_SELECTOR, "td")]
        filled = [value for value in values if value]
    except Exception:
        filled = []

    if filled:
        return "; ".join(filled)

    # fallback to label search inside tables
    return find_value_by_label(["Teaching method", "Teaching methods"])

def click_all_dropdowns():
    """Click every ``.dropdown-header`` on the page to expand its section.

    Each header is scrolled into view and clicked, followed by a short
    pause. Headers that cannot be clicked are skipped.
    """
    for header in driver.find_elements(By.CSS_SELECTOR, ".dropdown-header"):
        try:
            click_el(header)
            time.sleep(0.4)  # let the section unfold before the next click
        except Exception:
            continue

def list_programmes_from_dropdown():
    """Return the distinct texts found inside the ``.dropdown-section`` blocks.

    The texts of all ``a``, ``li``, ``p`` and ``span`` descendants longer
    than two characters are collected. Duplicates are removed while the
    order of first appearance is kept.
    """
    ordered = {}  # dict keys keep insertion order and enforce uniqueness
    for section in driver.find_elements(By.CSS_SELECTOR, ".dropdown-section"):
        for node in section.find_elements(By.CSS_SELECTOR, "a, li, p, span"):
            label = txt(node)
            if len(label) > 2:
                ordered.setdefault(label, None)
    return list(ordered)



## 11. Parse a single course page

In [10]:
def parse_course_page(course_url, code):
    """
    Load one course page in the shared driver and scrape its detail fields.

    Args:
        course_url: address of the course page to open.
        code: course code; copied unchanged into the result.

    Returns:
        dict with the keys code, faculty, course_level, course_coordinator,
        teaching_method, course_programmes ("; "-joined names) and
        course_paragraphs_json (JSON encoded list of `.paragraph` texts).
    """

    def cell_text_or(selector, default):
        # positional fallback: stripped text of the first element matching
        # `selector`, or `default` when the lookup fails for any reason
        try:
            return driver.find_element(By.CSS_SELECTOR, selector).text.strip()
        except Exception:
            return default

    # open the page and wait until the course content is present
    driver.get(course_url)
    dismiss_cookies()
    wait_for_course_dom()

    # label based lookups
    level = find_value_by_label(["Course level", "Level"])
    coordinator = find_value_by_label(["Course coordinator", "Coordinator"])
    faculty = find_value_by_label(["Faculty", "Department", "School"])
    if not faculty:
        # no label hit: fall back to a fixed table row position
        faculty = cell_text_or("tr:nth-child(6) td:last-child", "")

    # teaching method from the nearby table, or by label
    teaching_method = teaching_method_from_context()

    # programme memberships become readable after the dropdowns are clicked
    click_all_dropdowns()
    course_programmes = "; ".join(list_programmes_from_dropdown())

    # non-empty texts of the paragraph blocks, stored as a JSON list
    paragraphs = []
    for block in driver.find_elements(By.CSS_SELECTOR, ".paragraph"):
        if txt(block):
            paragraphs.append(txt(block))
    paragraphs_json = json.dumps(paragraphs, ensure_ascii=False)

    # last-resort positional fallbacks; the current value is kept on failure
    if not level:
        level = cell_text_or("tr:nth-child(4) td:last-child", level)
    if not coordinator:
        coordinator = cell_text_or("tr:nth-child(7) td:last-child", coordinator)

    result = {
        "code": code,
        "faculty": faculty,
        "course_level": level,
        "course_coordinator": coordinator,
        "teaching_method": teaching_method,
        "course_programmes": course_programmes,
        "course_paragraphs_json": paragraphs_json,
    }
    return result



## 12. Run the course scrape and build df_courses_full

In [14]:
# reduce df_courses for testing
#df_courses = df_courses.head(3).copy()

In [12]:
from tqdm import tqdm

# Scrape every course page listed in df_courses.
# One failing page must not abort the whole run, so a failure is reported
# (course code, URL and exception) and recorded as a blank row instead of
# being swallowed silently.
rows = []
for _, row in tqdm(df_courses.iterrows(), total=len(df_courses), desc="Courses"):
    url = row["course_url"]
    code = row["code"]
    try:
        data = parse_course_page(url, code)
    except Exception as exc:
        # tqdm.write prints the message without corrupting the progress bar
        tqdm.write(f"[course scrape failed] {code} ({url}): {exc!r}")
        data = {
            "code": code,
            "faculty": "",
            "course_level": "",
            "course_coordinator": "",
            "teaching_method": "",
            "course_programmes": "",
            "course_paragraphs_json": "[]",
        }
    rows.append(data)

# one row per course code; drop_duplicates keeps the first occurrence
df_courses_full = pd.DataFrame(rows).drop_duplicates(subset=["code"]).reset_index(drop=True)
print(df_courses_full.head())

# Runtime reference: about 87 min for 420 courses (roughly 12.5 s per course)

Courses: 100%|██████████| 205/205 [42:46<00:00, 12.52s/it]

       code             faculty course_level         course_coordinator  \
0   XB_0102  Faculty of Science          200   dr. M. Verano Merino MSc   
1  X_401085  Faculty of Science          100  dr. I.G. Gerostathopoulos   
2   XB_0101  Faculty of Science          200                 dr. J. Pei   
3  X_401008  Faculty of Science          200           dr. J. Endrullis   
4   XB_0040  Faculty of Science          200           dr. T.C. Beinema   

                          teaching_method  \
0                      Practical, Lecture   
1  Seminar, Written partial exam, Lecture   
2  Seminar, Written partial exam, Lecture   
3        Computer lab, Practical, Lecture   
4                    Study Group, Lecture   

                                   course_programmes  \
0        Hide full list (1); Artificial Intelligence   
1        Hide full list (1); Artificial Intelligence   
2        Hide full list (1); Artificial Intelligence   
3  Hide full list (6); Artificial Intelligence; B...  




## 13. Left join and ordering of the dataframes

In [13]:
# keep an independent backup of the scraped course details
copy = df_courses_full.copy(deep=True)

# preview the first five rows
print(df_courses_full.head(n=5))

       code             faculty course_level         course_coordinator  \
0   XB_0102  Faculty of Science          200   dr. M. Verano Merino MSc   
1  X_401085  Faculty of Science          100  dr. I.G. Gerostathopoulos   
2   XB_0101  Faculty of Science          200                 dr. J. Pei   
3  X_401008  Faculty of Science          200           dr. J. Endrullis   
4   XB_0040  Faculty of Science          200           dr. T.C. Beinema   

                          teaching_method  \
0                      Practical, Lecture   
1  Seminar, Written partial exam, Lecture   
2  Seminar, Written partial exam, Lecture   
3        Computer lab, Practical, Lecture   
4                    Study Group, Lecture   

                                   course_programmes  \
0        Hide full list (1); Artificial Intelligence   
1        Hide full list (1); Artificial Intelligence   
2        Hide full list (1); Artificial Intelligence   
3  Hide full list (6); Artificial Intelligence; B...  

In [16]:
# column layout of the merged frame
cols_order = [
    'code',
    'course_name',
    'programme_title',
    'faculty',
    'programme_url',
    'year_num',
    'period',
    'ects',
    'course_level',
    'course_coordinator',
    'teaching_method',
    'course_programmes',
    'course_paragraphs_json',
]

# join subject rows with the scraped course details on `code`;
# how="right" keeps every row of df_courses_full (the right-hand frame)
# NOTE(review): the section heading calls this a left join - confirm that
# how="right" is the intended direction
df_subj_full = df_subj.merge(right=df_courses_full, how="right", on="code")

# reorder the columns for clarity
df_subj_full = df_subj_full[cols_order]

df_subj_full.head()

Unnamed: 0,code,course_name,programme_title,faculty,programme_url,year_num,period,ects,course_level,course_coordinator,teaching_method,course_programmes,course_paragraphs_json
0,XB_0102,Applied Programming for AI,Artificial Intelligence,Faculty of Science,https://studiegids.vu.nl/en/Bachelor/2025-2026...,1,6.0,6,200,dr. M. Verano Merino MSc,"Practical, Lecture",Hide full list (1); Artificial Intelligence,"[""Course Objective\nThe course prepares studen..."
1,XB_0102,Applied Programming for AI,Artificial Intelligence,Faculty of Science,https://studiegids.vu.nl/en/Bachelor/2025-2026...,1,6.0,6,200,dr. M. Verano Merino MSc,"Practical, Lecture",Hide full list (1); Artificial Intelligence,"[""Course Objective\nThe course prepares studen..."
2,X_401085,Information Management,Artificial Intelligence,Faculty of Science,https://studiegids.vu.nl/en/Bachelor/2025-2026...,1,6.0,6,100,dr. I.G. Gerostathopoulos,"Seminar, Written partial exam, Lecture",Hide full list (1); Artificial Intelligence,"[""Course Objective\nThrough this course, stude..."
3,X_401085,Information Management,Artificial Intelligence,Faculty of Science,https://studiegids.vu.nl/en/Bachelor/2025-2026...,1,6.0,6,100,dr. I.G. Gerostathopoulos,"Seminar, Written partial exam, Lecture",Hide full list (1); Artificial Intelligence,"[""Course Objective\nThrough this course, stude..."
4,XB_0101,Project Conversational Agents,Artificial Intelligence,Faculty of Science,https://studiegids.vu.nl/en/Bachelor/2025-2026...,2,3.0,6,200,dr. J. Pei,"Seminar, Written partial exam, Lecture",Hide full list (1); Artificial Intelligence,"[""Course Objective\nDevelop a conversational a..."


In [18]:

# persist the merged table as CSV and JSON so later steps can reload it
# (utf-8-sig puts a byte order mark at the start of the CSV)
df_subj_full.to_csv(
    path_or_buf=DATA_DIR.joinpath("df_courses_bronze.csv"),
    encoding="utf-8-sig",
    index=False,
)
df_subj_full.to_json(
    path_or_buf=DATA_DIR.joinpath("df_courses_bronze.json"),
    indent=2,
    force_ascii=False,
    orient="records",
)