In [2]:
import requests
from bs4 import BeautifulSoup
import re

response = requests.get("https://www.handbook.uts.edu.au/subjects/alpha.html")
soup = BeautifulSoup(response.text, "html.parser")

In [10]:
links = soup.find_all('a', href=True)
pattern = re.compile(r'/\d{5}\.html')
all_subject_links = [x["href"] for x in links if pattern.search(x["href"])]
print(all_subject_links[:10])


['https://handbook.uts.edu.au/subjects/32998.html', 'https://handbook.uts.edu.au/subjects/32543.html', 'https://handbook.uts.edu.au/subjects/42145.html', 'https://handbook.uts.edu.au/subjects/42890.html', 'https://handbook.uts.edu.au/subjects/92681.html', 'https://handbook.uts.edu.au/subjects/92360.html', 'https://handbook.uts.edu.au/subjects/92376.html', 'https://handbook.uts.edu.au/subjects/92389.html', 'https://handbook.uts.edu.au/subjects/59720.html', 'https://handbook.uts.edu.au/subjects/59721.html']


In [28]:
from dataclasses import dataclass
from enum import Enum, auto

class SubjectType(Enum):
    Undergraduate = auto()
    Postgraduate = auto()
    NotSpecified = auto()

@dataclass
class SubjectInfo:
    subject_type: SubjectType
    number: int
    name: str
    prereqs: list[int]
    area: str

In [29]:
pages = {}

https://handbook.uts.edu.au/subjects/42890.html


In [35]:

all_subjects = []

def get_page(url):
    if url in pages:
        return pages[url]
    
    response = requests.get(subject_link)
    pages[url] = response
    return response

i = 0
for subject_link in all_subject_links:
    print(f"Extracting from [{i}]: {subject_link}")
    i += 1
    subject_number = subject_link.split("/")[-1][:-5]

    response = get_page(subject_link)
    if response.status_code != 200:
        print(f"Unsuccessful: {response.status_code}")
        print(response)
        break

    soup = BeautifulSoup(response.text, "html.parser")


    course_area_el = soup.find("a", class_="coursearea")
    course_area = course_area_el.text if course_area_el else ""

    requisite_el = [x for x in soup.find_all("em") if x.text.strip().startswith("Requisite")]
    requisites = [int(x.text) for x in requisite_el[0].find_all("a")] if requisite_el else []

    subject_name = soup.find("h1").text

    subject_type_str = [x.text for x in soup.find_all("p") if x.text == "Undergraduate" or x.text == "Postgraduate"]
    subject_type = SubjectType[subject_type_str[0]] if subject_type_str else SubjectType.NotSpecified
    
    new_subj = SubjectInfo(subject_type, subject_number, subject_name, requisites, course_area)
    all_subjects.append(new_subj)
    print(new_subj)


Extracting from [0]: https://handbook.uts.edu.au/subjects/32998.html
SubjectInfo(subject_type=<SubjectType.Postgraduate: 2>, number='32998', name='32998 .NET Application Development', prereqs=[32555], area='Information Technology')
Extracting from [1]: https://handbook.uts.edu.au/subjects/32543.html
SubjectInfo(subject_type=<SubjectType.Postgraduate: 2>, number='32543', name='32543 3D Animation', prereqs=[], area='Information Technology')
Extracting from [2]: https://handbook.uts.edu.au/subjects/42145.html
SubjectInfo(subject_type=<SubjectType.NotSpecified: 3>, number='42145', name='42145 3D Scanning Methods for Reverse Engineering', prereqs=[], area='Engineering')
Extracting from [3]: https://handbook.uts.edu.au/subjects/42890.html
SubjectInfo(subject_type=<SubjectType.Postgraduate: 2>, number='42890', name='42890 4G/5G Mobile Technologies', prereqs=[], area='Engineering')
Extracting from [4]: https://handbook.uts.edu.au/subjects/92681.html
SubjectInfo(subject_type=<SubjectType.Underg

In [43]:


for x in all_subjects:
    x.subject_type = str(x.subject_type).split(".")[1]


In [54]:
import json

with open("subjects.json", "w+") as f:
    f.write(json.dumps([x.__dict__ for x in all_subjects]))

In [59]:
# came from https://handbook.uts.edu.au/courses/c10476.html
# TODO: do for all courses
subject_groups = [
    "https://handbook.uts.edu.au/directory/stm91764.html",
    "https://handbook.uts.edu.au/directory/stm91765.html",
    "https://handbook.uts.edu.au/directory/maj03445.html",
    "https://handbook.uts.edu.au/directory/maj02901.html",
    "https://handbook.uts.edu.au/directory/maj10053.html",
    "https://handbook.uts.edu.au/directory/maj02080.html",
    "https://handbook.uts.edu.au/directory/maj02900.html",
    "https://handbook.uts.edu.au/directory/maj02092.html",
    "https://handbook.uts.edu.au/directory/maj03519.html",
]
course_subjects = []

for group_url in subject_groups:
    response = requests.get(group_url)
    soup = BeautifulSoup(response.text, "html.parser")
    links = soup.find_all('a', href=True)
    pattern = re.compile(r'/\d{5}\.html')
    course_subjects.extend([x.text for x in links if pattern.search(x["href"])])
print(course_subjects)

['31265', '41039', '31268', '31269', '41092', '31271', '31272', '43030', '37181', '48024', '31251', '41080', '31016', '33130', '33116', '33230', '31277', '31275', '41900', '48730', '31261', '31338', '48450', '32009', '41890', '41891', '42036', '48436', '41903', '48033', '42037', '32011', '41905', '37233', '43025', '41076', '41173', '41052', '42028', '41175', '31250', '31005', '37262', '37161', '41174', '41171', '41172', '31250', '41004', '41040', '31243', '41077', '32146', '42028', '43023', '31256', '43024', '43025', '31005', '41043', '42050', '42913', '57304', '31257', '31247', '31245', '31258', '31276', '31280', '31266', '48270', '31255', '31097', '31282', '48730', '41180', '41900', '41181', '41182', '41183', '41184', '48436', '41185', '31260', '31777', '31080', '41019', '41021', '41889', '31927', '31262', '41020', '31263', '31250', '31748', '41025', '31260', '41026', '48433', '41052', '31777', '31242', '41889', '31927', '41001', '31253', '41113']


In [60]:
print(list(set(course_subjects)))

['41173', '42913', '41004', '37233', '41182', '31005', '41052', '31748', '41080', '31256', '41043', '31269', '31261', '41175', '31097', '32011', '31338', '41905', '31777', '31266', '41900', '41076', '31277', '57304', '31282', '33130', '32009', '31257', '31280', '43024', '31260', '41184', '42050', '43030', '41180', '43023', '31016', '41181', '31255', '37262', '31080', '41019', '31263', '41113', '31927', '48433', '31276', '41903', '48436', '31250', '31245', '33116', '31272', '41891', '31243', '41889', '42028', '42037', '31265', '48033', '33230', '41025', '41171', '37161', '41077', '41183', '41001', '48024', '31268', '31247', '31258', '31262', '43025', '31275', '32146', '31251', '41174', '41185', '41890', '31253', '48450', '41021', '48730', '41040', '42036', '37181', '41020', '31271', '41092', '31242', '41039', '41172', '41026', '48270']
