# Scraping of Programmes and Courses Data

Notebook pipeline that

1) Loads both pages in studiegids and collects all programme URLs
2) Clicks through every programme, then every track, then every year, and reads the visible tables
3) Enriches each programme with text from the vu.nl base, curriculum, future, and admissions pages
4) Saves tidy CSV and JSON for programmes and subjects 

## 1) Import modules

In [1]:

import pandas as pd
from pathlib import Path
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

from scraper_modules.studiegids import StudiegidsScraper
from scraper_modules.vu_pages import VuPages


## 2) Set up directories, Selenium, and faculties

In [2]:

# Listing page and faculty names that are passed to the programme listing call.
LISTING_URL = "https://studiegids.vu.nl/en/bachelor/2025-2026#/"
FACULTIES = [
    "School of Business and Economics",
    "Faculty of Science",
    "Faculty of Humanities",
]

# Output locations. Only DATA_DIR is created here; CACHE_DIR is merely handed to
# VuPages below (presumably it creates the folder on demand — TODO confirm).
DATA_DIR = Path("data")
DATA_DIR.mkdir(parents=True, exist_ok=True)
CACHE_DIR = DATA_DIR / "vu_cache"

# Chrome session used by the studiegids scraper. The window starts maximized.
# For CI runs, additionally enable headless mode with:
#   chrome_options.add_argument("--headless=new")
chrome_options = Options()
chrome_options.add_argument("--start-maximized")
driver = webdriver.Chrome(options=chrome_options)
# NOTE(review): no driver.quit() is visible in this part of the notebook — make
# sure the browser is closed once scraping is finished.

# Scraper helpers used by the following cells.
sg = StudiegidsScraper(driver, debug=False)
vu = VuPages(CACHE_DIR)


## 3) Programmes list

In [3]:
# Fetch the programme entries (dicts with 'title' and 'url') from the listing
# page, passing the faculty names defined in the setup cell.
programmes = sg.list_programmes(LISTING_URL, FACULTIES)
# Show how many programmes were found together with a three-item preview.
(len(programmes), programmes[:3])


(17,
 [{'title': 'Ancient Studies',
   'url': 'https://studiegids.vu.nl/en/Bachelor/2025-2026/ancient-studies'},
  {'title': 'Archaeology',
   'url': 'https://studiegids.vu.nl/en/Bachelor/2025-2026/archaeology'},
  {'title': 'Artificial Intelligence',
   'url': 'https://studiegids.vu.nl/en/Bachelor/2025-2026/artificial-intelligence'}])

## 4) Test run on three selected programmes

In [4]:
# Titles of the programmes used for the trial run.
TEST_TITLES = {
    "Biomedical Sciences",
    "Philosophy, Politics and Economics",
    "Computer Science",
}
# Keep the listed programmes whose title is in TEST_TITLES; the order of the
# original listing is preserved.
test_programmes = list(filter(lambda prog: prog["title"] in TEST_TITLES, programmes))
test_programmes


[{'title': 'Biomedical Sciences',
  'url': 'https://studiegids.vu.nl/en/Bachelor/2025-2026/biomedical-sciences'},
 {'title': 'Computer Science',
  'url': 'https://studiegids.vu.nl/en/Bachelor/2025-2026/computer-science'},
 {'title': 'Philosophy, Politics and Economics',
  'url': 'https://studiegids.vu.nl/en/Bachelor/2025-2026/philosophy-politics-and-economics'}]

In [5]:
# Trial run: scrape the subject rows for the selected test programmes only.
rows_subj = []
rows_prog = []  # not filled in this cell

for item in test_programmes:
    title = item["title"]
    url = item["url"]
    rows = sg.parse_programme_studiegids(url, skip_honors=False, include_minors=True)
    print(f"{title} rows captured:", len(rows))
    # Tag every scraped row (in place) with the programme it came from.
    for r in rows:
        r.update(programme_title=title, programme_url=url)
    rows_subj.extend(rows)
    # TODO: add what was already being collected from tab two and the vu pages, if needed

# One DataFrame row per scraped subject row; preview the first ten.
df_subj = pd.DataFrame(rows_subj)
df_subj.head(10)


Biomedical Sciences rows captured: 240
Computer Science rows captured: 145
Philosophy, Politics and Economics rows captured: 37


Unnamed: 0,course_name,period,ects,code,track,year_label,programme_title,programme_url
0,Biochemistry - Research,5.0,6,AB_1164,,Biomedical Sciences year 1 constrained choice,Biomedical Sciences,https://studiegids.vu.nl/en/Bachelor/2025-2026...
1,Biomedical Sciences and Society,5.0,6,AB_1315,,Biomedical Sciences year 1 constrained choice,Biomedical Sciences,https://studiegids.vu.nl/en/Bachelor/2025-2026...
2,Science Communication for Researchers,5.0,6,AB_470185,,Biomedical Sciences year 1 constrained choice,Biomedical Sciences,https://studiegids.vu.nl/en/Bachelor/2025-2026...
3,English Language Test,1.0,0,VU_ELT,,Biomedical Sciences year 1 compulsory courses,Biomedical Sciences,https://studiegids.vu.nl/en/Bachelor/2025-2026...
4,Genetics,1.0,6,AB_1135,,Biomedical Sciences year 1 compulsory courses,Biomedical Sciences,https://studiegids.vu.nl/en/Bachelor/2025-2026...
5,Introduction to Biomedical Sciences,1.0,6,AB_1136,,Biomedical Sciences year 1 compulsory courses,Biomedical Sciences,https://studiegids.vu.nl/en/Bachelor/2025-2026...
6,Lab safety,1.0,0,AB_LABSAFETY,,Biomedical Sciences year 1 compulsory courses,Biomedical Sciences,https://studiegids.vu.nl/en/Bachelor/2025-2026...
7,Biochemistry,2.0,6,AB_1137,,Biomedical Sciences year 1 compulsory courses,Biomedical Sciences,https://studiegids.vu.nl/en/Bachelor/2025-2026...
8,Cell Biology - Histology,2.0,6,AB_1138,,Biomedical Sciences year 1 compulsory courses,Biomedical Sciences,https://studiegids.vu.nl/en/Bachelor/2025-2026...
9,Research in Biomedical Sciences,3.0,6,AB_1142,,Biomedical Sciences year 1 compulsory courses,Biomedical Sciences,https://studiegids.vu.nl/en/Bachelor/2025-2026...


In [6]:
# Full run: scrape the subject rows of every programme found in the listing.
# The loop visits one programme after another and can run for a long time, so a
# progress line is printed after each programme to show how far the scrape has
# come (and where it stopped if the cell is interrupted).
# NOTE(review): the trial-run cell passes include_minors=True, this call does
# not — confirm which setting is intended for the full dataset.
rows_subj = []
n_programmes = len(programmes)
for idx, item in enumerate(programmes, start=1):
    title, url = item["title"], item["url"]
    rows = sg.parse_programme_studiegids(url, skip_honors=False)
    print(f"[{idx}/{n_programmes}] {title} rows captured:", len(rows))
    # Tag every scraped row (in place) with the programme it came from.
    for r in rows:
        r["programme_title"] = title
        r["programme_url"] = url
        rows_subj.append(r)



KeyboardInterrupt: 

In [None]:
# Build the subject table from the collected rows, drop exact duplicate rows and
# renumber the index from 0.
df_subj = pd.DataFrame(rows_subj).drop_duplicates(ignore_index=True)
# Write the intermediate CSV into DATA_DIR, encoded as UTF-8 with a BOM and
# without the index column.
df_subj.to_csv(
    DATA_DIR / "df_subj_temp_bronze.csv",
    encoding="utf-8-sig",
    index=False,
)
len(df_subj)