In [2]:
import os

import pandas as pd
import psycopg2
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


In [8]:
# Set up the WebDriver options
firefox_options = Options()
firefox_options.add_argument('--headless')
# NOTE(review): the next three switches look Chromium-specific; confirm Firefox needs them.
firefox_options.add_argument('--disable-gpu')
firefox_options.add_argument('--no-sandbox')
firefox_options.add_argument('--disable-dev-shm-usage')
url = 'https://kucourses.dk'

browser = webdriver.Firefox(options=firefox_options)
browser.get(url)

# Block (for up to 10 seconds) until at least one `.w-full` element is in the DOM.
wait = WebDriverWait(browser, 10)
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".w-full")))  # Adjust selector

# Infinite scroll loop: keep scrolling to the bottom for as long as the page
# keeps growing, so that every lazily loaded course card ends up in the DOM.
last_height = browser.execute_script("return document.body.scrollHeight")
while True:
    # Scroll to bottom
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    try:
        # Wait until new courses are loaded (based on DOM height change)
        wait.until(lambda d: d.execute_script("return document.body.scrollHeight") > last_height)
    except TimeoutException:
        # The page height did not change within the wait window: treat this as
        # the end of the list. Any other exception (lost session, script error,
        # keyboard interrupt) is a real failure and is allowed to propagate
        # instead of being mistaken for "no more content".
        break
    last_height = browser.execute_script("return document.body.scrollHeight")

In [9]:
# Absolute XPath matching every course card anchor (<a>) in the fully scrolled page.
course_card_xpath = "/html/body/div/div[1]/div/div/main/div[4]/div/a"
boxes = browser.find_elements(By.XPATH, course_card_xpath)

In [10]:
# XPath of each field relative to a single course card, keyed by the name of
# the output column. Insertion order determines the key order of every record.
field_xpaths = {
    'course_id': ".//div[1]/div[1]/div[1]/h2",
    'title': ".//div[1]/div[1]/div[1]/h1",
    'level': ".//div[1]/div[1]/table/tbody/tr[1]/td[1]",
    'length': ".//div[1]/div[1]/table/tbody/tr[1]/td[2]",
    'timeslot': ".//div[1]/div[1]/table/tbody/tr[2]/td[1]",
    'group': ".//div[1]/div[1]/table/tbody/tr[2]/td[2]",
    'description': ".//div[1]/p",
    'exam_type': ".//div[2]/div[1]",
    'pass_pct': ".//div[2]/table/tbody/tr[1]",
    'median': ".//div[2]/table/tbody/tr[2]",
    'mean': ".//div[2]/table/tbody/tr[3]",
}

# One dict of raw display strings per course card.
courses = []
for i, box in enumerate(boxes):
    # Print a progress marker every 100 cards.
    if i % 100 == 0:
        print(i)
    courses.append({
        field: box.find_element(By.XPATH, xpath).text
        for field, xpath in field_xpaths.items()
    })

0
100
200
300
400
500
600
700
800
900
1000


In [11]:
# Turn the scraped records into a DataFrame, show it, and persist it as CSV.
try:
    df = pd.DataFrame(courses)
    display(df)
    df.to_csv('courses.csv', index=False)
finally:
    # All scraped text is already held in `courses`, so the browser is no
    # longer needed. Quit it even if display/export fails so the headless
    # Firefox process is not left running.
    browser.quit()

Unnamed: 0,course_id,title,level,length,timeslot,group,description,exam_type,pass_pct,median,mean
0,NDAA09023U - SCIENCE,Advanced Algorithms and Data Structures (AADS),Master,ECTS: 7.5,Block(s): 2,Group(s): C,Algorithms is about finding scalable solutions...,Written (4h),Pass 83.43%,Median 7,Average 6.55
1,NPLK13004U - SCIENCE,Advanced Analytical Chemistry - Chromatography...,Master,ECTS: 7.5,Block(s): 2,Group(s): A,"Today, chromatography and mass spectometry are...",Oral (20m),Pass 80%,Median 7,Average 5.8
2,NPLK13003U - SCIENCE,Advanced Analytical Chemistry - Sampling and S...,Master,ECTS: 7.5,Block(s): 1,Group(s): A,Sampling and sample preparation is a cornersto...,Assignment (2d)\nOral (30m),Pass 100%,Median 7,Average 7.29
3,NBIK15005U - SCIENCE,Advanced Bacteriology 2,Master,ECTS: 7.5,Block(s): 2,Group(s): B,Theme 1: Bacterial physiologyBacterial metabol...,Oral (20m),Pass 100%,Median 10,Average 9.77
4,NBIK15003U - SCIENCE,Advanced Bacteriology1,Master,ECTS: 7.5,Block(s): 1,Group(s): A,Theme 1: Social bacterial interactionsBacteria...,Oral (25m),Pass 100%,Median 10,Average 9.89
...,...,...,...,...,...,...,...,...,...,...,...
1058,NDAK22003U - SCIENCE,Web Recommender Systems (WRS),Master,ECTS: 7.5,Block(s): 3,Group(s): B,The course objective is to offer an advanced i...,Oral (20m),Pass 75%,Median 7,Average 6.83
1059,NIFK19001U - SCIENCE,Working as a Consultant,Master,ECTS: 7.5,Block(s): 4,Group(s): C,The newspapers are full of examples where expe...,Oral (20m)\nAssignment,Pass 96.88%,Median 10,Average 9.29
1060,NFOK14002U - SCIENCE,Yeast Physiology and Applications,Master,ECTS: 7.5,Block(s): 2,Group(s): A,The course focuses on various aspects of yeast...,Oral (20m),Pass 100%,Median 10,Average 8.55
1061,NBIA04054U - SCIENCE,Zoofysiologi (Zoofys),Bachelor,ECTS: 7.5,Block(s): 4,Group(s): C,Zoofysiologien søger at forstå hvordan dyr fun...,Oral (20m),Pass 100%,Median 10,Average 9.1


In [None]:
# Create the destination table (if it does not exist yet) in the local
# `courses` PostgreSQL database.
conn = psycopg2.connect(
    dbname="courses",
    user="postgres",
    # Take the password from the PGPASSWORD environment variable when it is
    # set; the previous hardcoded value is kept only as a fallback.
    password=os.environ.get("PGPASSWORD", "admin"),
    host="localhost",
    port="5432"
)
try:
    # The cursor context manager closes the cursor when the block exits.
    with conn.cursor() as cur:
        cur.execute("""
        CREATE TABLE IF NOT EXISTS kucourses1 (
            course_id TEXT PRIMARY KEY NOT NULL,
            title VARCHAR(255) NOT NULL,
            nickname VARCHAR(255),
            level VARCHAR(10) NOT NULL,
            length FLOAT NOT NULL,
            timeslot VARCHAR(255) NOT NULL,
            block_group VARCHAR(10) NOT NULL,
            description TEXT NOT NULL,
            pass_pct FLOAT NOT NULL,
            median FLOAT NOT NULL,
            mean FLOAT NOT NULL
        );
        """)
    conn.commit()
finally:
    # Always release the connection, including when the DDL statement fails.
    conn.close()

In [83]:
def _parse_course(record):
    """Convert one scraped record into a tuple matching the INSERT column order.

    `record` is a dict of the raw display strings held in `df` (keys
    'course_id', 'title', 'level', 'length', 'timeslot', 'group',
    'description', 'pass_pct', 'median', 'mean').

    Returns None when the pass percentage, median or mean cannot be parsed.
    Those columns are NOT NULL in `kucourses1`, so such a course cannot be
    stored; returning None also guarantees that values parsed for a previous
    course are never reused for this one.
    """
    # Keep only the first space-separated token of the course id text.
    course_id = record['course_id'].split(' ')[0]

    # The title is the text before the first '('; the nickname, when present,
    # is the text between that '(' and the following ')'.
    head, paren, tail = record['title'].partition('(')
    title = head.strip()
    nickname = tail.split(')')[0].strip() if paren else None

    level = record['level']
    # For these three fields the value is the second space-separated token.
    length = float(record['length'].split(' ')[1])
    timeslot = record['timeslot'].split(' ')[1]
    block_group = record['group'].split(' ')[1]
    description = record['description'].strip()

    try:
        # Second token with its last character (the '%' sign) dropped.
        pass_pct = float(record['pass_pct'].split(' ')[1][:-1])
        median = float(record['median'].split(' ')[1])
        mean = float(record['mean'].split(' ')[1])
    except (IndexError, ValueError):
        return None

    return (
        course_id, title, nickname, level, length, timeslot,
        block_group, description, pass_pct, median, mean
    )


# Parse every scraped row before touching the database.
data = []
unparsed_ids = []
for record in df.to_dict('records'):
    parsed = _parse_course(record)
    if parsed is None:
        unparsed_ids.append(record['course_id'])
    else:
        data.append(parsed)

insert_sql = """
    INSERT INTO kucourses1 (
        course_id, title, nickname, level, length,
        timeslot, block_group, description, pass_pct, median, mean
    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
"""

conn = psycopg2.connect(
    dbname="courses",
    user="postgres",
    # Take the password from the PGPASSWORD environment variable when it is
    # set; the previous hardcoded value is kept only as a fallback.
    password=os.environ.get("PGPASSWORD", "admin"),
    host="localhost",
    port="5432"
)
failed_inserts = []
try:
    with conn.cursor() as cur:
        for row in data:
            # Guard each row with a savepoint: a failing INSERT is undone on
            # its own. A plain conn.rollback() here would also discard every
            # row inserted earlier in this still-uncommitted transaction.
            cur.execute("SAVEPOINT course_row")
            try:
                cur.execute(insert_sql, row)
            except psycopg2.Error as exc:
                cur.execute("ROLLBACK TO SAVEPOINT course_row")
                failed_inserts.append((row[0], str(exc).strip()))
            else:
                cur.execute("RELEASE SAVEPOINT course_row")
    conn.commit()
finally:
    # Release the connection even when something unexpected goes wrong.
    conn.close()

# Report what was skipped instead of dropping rows silently.
print(f"Inserted {len(data) - len(failed_inserts)} of {len(df)} courses")
if unparsed_ids:
    print(f"Skipped {len(unparsed_ids)} courses without parsable grade statistics")
if failed_inserts:
    print(f"Rejected by the database: {len(failed_inserts)} rows")
    for failed_id, reason in failed_inserts:
        print(f"  {failed_id}: {reason}")



