In [1]:
import requests
from bs4 import BeautifulSoup
import re

response = requests.get("https://www.handbook.uts.edu.au/subjects/alpha.html")
soup = BeautifulSoup(response.text, "html.parser")

In [2]:
links = soup.find_all('a', href=True)
pattern = re.compile(r'/\d{5}\.html')
all_subject_links: list[str] = [x["href"] for x in links if pattern.search(x["href"])]
print(all_subject_links[:10])
print(len(all_subject_links))


['https://handbook.uts.edu.au/subjects/32998.html', 'https://handbook.uts.edu.au/subjects/32543.html', 'https://handbook.uts.edu.au/subjects/42145.html', 'https://handbook.uts.edu.au/subjects/42890.html', 'https://handbook.uts.edu.au/subjects/92681.html', 'https://handbook.uts.edu.au/subjects/92360.html', 'https://handbook.uts.edu.au/subjects/92376.html', 'https://handbook.uts.edu.au/subjects/92389.html', 'https://handbook.uts.edu.au/subjects/59720.html', 'https://handbook.uts.edu.au/subjects/59721.html']
3148


In [None]:
from dataclasses import dataclass
from enum import Enum, auto

class SubjectType(Enum):
    Undergraduate = auto()
    Postgraduate = auto()
    NotSpecified = auto()

@dataclass
class SubjectInfo:
    number: int
    name: str
    prereqs_required: list[int]
    prereqs_options: list[str]

In [4]:
import os

# page cache files are named as
# "[subject number]-[retrieved at unix].txt"

import time
now = round(time.time())
retention_period = 60 * 60 * 24 * 365

def read(file):
    with open(os.path.join(cache_dir, file)) as f:
        return f.read()

cache_dir = "pages_cache"
pages_list = os.listdir(cache_dir)
pages = { int(p.split("-")[0]) : read(p) for p in pages_list }
print(pages)

all_subjects: list[SubjectInfo] = []

# Used to cache the requested pages because the uts handbook has a very slow rate limit
# it caches the entire page contents rather than any parsed info to the cache to still be useful when changing how the parsing works
def get_page(url, subject_number, retries = 0):
    if subject_number in pages:
        return pages[subject_number]
    
    response = requests.get(url)

    # retry after a delay if not successfull
    if response.status_code != 200:
        if retries > 3:
            raise Exception("failed :(")
        print(f"Failed to get page, going to try again {retries}: {response.status_code}")
        time.sleep(5)
        return get_page(url, subject_number, retries+1)

    pages[subject_number] = response.text
    with open(os.path.join(cache_dir, f"{subject_number}-{now}.txt"), "w+") as f:
        f.write(response.text)
    return response.text


for i, subject_link in enumerate(all_subject_links[:5]):
    print(f"[{i}] - Extracting from: {subject_link}")

    subject_number = int(subject_link.split("/")[-1][:-5]) # gets the filename, with .html removed
    response = get_page(subject_link, subject_number)
    

    soup = BeautifulSoup(response, "html.parser")


    course_area_el = soup.find("a", class_="coursearea")
    course_area = course_area_el.text if course_area_el else ""

    requisite_el = [x for x in soup.find_all("em") if x.text.strip().startswith("Requisite")][0]
    requisites = [int(x.text) for x in requisite_el[0].find_all("a")] if requisite_el else []

    subject_name = soup.find("h1").text

    subject_type_str = [x.text for x in soup.find_all("p") if x.text == "Undergraduate" or x.text == "Postgraduate"]
    subject_type = SubjectType[subject_type_str[0]] if subject_type_str else SubjectType.NotSpecified
    
    new_subj = SubjectInfo(subject_type, subject_number, subject_name, requisites, [], course_area)
    all_subjects.append(new_subj)
    print("extracted: ", new_subj)


{32998: '<!DOCTYPE html>\n<html>\n<head>\n    <meta http-equiv="content-type" content="text/html; charset=utf-8" />\n    <!-- Generic Metadata -->\n<meta name="description" content="The UTS: Handbook is the authoritative source of information on approved courses and subjects offered at University of Technology Sydney." />\n<meta name="author" content="University of Technology Sydney" />\n\n<!-- No cache Metadata -->\n<meta http-equiv="Cache-Control" content="no-cache" />\n<meta http-equiv="Pragma" content="no-cache" />\n\n<!-- Dublin Core Metadata -->\n<meta name="DC.Creator" content="University of Technology Sydney" />\n<meta name="DC.Title" content="University of Technology Sydney" />\n<meta name="DC.Description" content="The UTS: Handbook is the authoritative source of information on approved courses and subjects offered at University of Technology Sydney." />\n<meta name="DC.Publisher" content="University of Technology Sydney" />\n<meta name="DC.Date.Created" content="2009-10-30" /

KeyError: 0

In [None]:


for x in all_subjects:
    x.subject_type = str(x.subject_type).split(".")[1]


In [None]:
import json

with open("subjects.json", "w+") as f:
    f.write(json.dumps([x.__dict__ for x in all_subjects]))

In [None]:
# came from https://handbook.uts.edu.au/courses/c10476.html
# TODO: do for all courses
subject_groups = [
    "https://handbook.uts.edu.au/directory/stm91764.html",
    "https://handbook.uts.edu.au/directory/stm91765.html",
    "https://handbook.uts.edu.au/directory/maj03445.html",
    "https://handbook.uts.edu.au/directory/maj02901.html",
    "https://handbook.uts.edu.au/directory/maj10053.html",
    "https://handbook.uts.edu.au/directory/maj02080.html",
    "https://handbook.uts.edu.au/directory/maj02900.html",
    "https://handbook.uts.edu.au/directory/maj02092.html",
    "https://handbook.uts.edu.au/directory/maj03519.html",
]
course_subjects = []

for group_url in subject_groups:
    response = requests.get(group_url)
    soup = BeautifulSoup(response.text, "html.parser")
    links = soup.find_all('a', href=True)
    pattern = re.compile(r'/\d{5}\.html')
    course_subjects.extend([x.text for x in links if pattern.search(x["href"])])
print(course_subjects)

In [None]:
print(list(set(course_subjects)))