In [3]:
import os
import typing
from dataclasses import dataclass
from pathlib import Path
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from selenium import webdriver

In [4]:
from enum import Enum
class Constants(Enum):
    """Fixed configuration values for the scraper; read each one through ``.value``."""

    CMU_SYLLABUS_REGISTRY_URL = 'https://canvas.cmu.edu/courses/sis_course_id:syllabus-registry'
    CMU_CANVAS_URL = 'https://canvas.cmu.edu'
    # SECURITY: the Canvas session token is a credential and must not live in
    # the notebook. It is read from the CANVAS_SESSION environment variable;
    # when the variable is unset the cookie value is an empty string.
    # NOTE(review): a token that was previously committed here should be
    # treated as leaked and invalidated (log out of that Canvas session).
    COOKIE = {
        'canvas_session': os.environ.get('CANVAS_SESSION', ''),
    }
    # aria-label values of the module sections read on a department page.
    COURSE_CATEGORIES = [
        'Available Syllabi',
        'Unavailable Syllabi',
        'Individualized Experiences',
    ]
    
def get_and_unwrap(*args, **kwargs) -> BeautifulSoup:
    """Fetch a URL with ``requests.get`` and parse the response body as HTML.

    All positional and keyword arguments are forwarded to ``requests.get``.
    A ``timeout`` of 30 seconds is applied unless the caller supplies one, so
    an unresponsive server cannot hang the notebook indefinitely.

    Returns:
        A ``BeautifulSoup`` tree built from the response text with the
        ``'html.parser'`` backend.

    Raises:
        requests.HTTPError: if the final response status is not 200.
    """
    kwargs.setdefault('timeout', 30)
    response = requests.get(*args, **kwargs)
    # Explicit check instead of `assert`: assertions are removed under
    # `python -O` and give no hint about which URL failed.
    if response.status_code != 200:
        raise requests.HTTPError(
            f'GET {response.url} returned HTTP {response.status_code}',
            response=response,
        )
    return BeautifulSoup(response.text, 'html.parser')
def select_unique(html, *args, **kwargs):
    """Run ``html.select`` and return its single match.

    Arguments after ``html`` are passed straight to ``html.select``.

    Returns:
        ``None`` when the selection is empty, otherwise the only element.

    Raises:
        AssertionError: when more than one element matches.
    """
    found = html.select(*args, **kwargs)
    if found != []:
        assert len(found) == 1, 'Selection is not unique!'
        return found[0]
    return None

@dataclass
class Immediate:
    """Archive record pairing a syllabus link with its displayed file name."""

    # href of the syllabus link
    syllabus_url: str
    # text shown as the file's title
    file_name: str
    
@dataclass
class Webpage:
    """Archive record for a syllabus that is itself a web page."""

    # URL of the page holding the syllabus
    webpage_url: str

In [5]:
class Course:
    """One course link taken from a department's module list.

    Attributes:
        name: label given by the caller (the link's ``title`` attribute).
        href: absolute URL, resolved against ``Constants.CMU_CANVAS_URL``.
        cat: category label the course was listed under.
        result: parsed course page; ``None`` until ``get()`` has run.
        archive: classification from ``analyze()``; ``None`` until then or
            when the page matches no known layout.
    """

    def __init__(self, name, href, cat: str = 'Available Syllabi'):
        self.name = name
        self.href = urljoin(Constants.CMU_CANVAS_URL.value, href)
        self.cat = cat
        # Defined up front so the attributes exist before get()/analyze().
        self.result = None
        self.archive = None

    def __repr__(self):
        # str() keeps repr() usable when the link carried no title (name is None).
        return str(self.name)

    def get(self):
        """Download and parse the course page into ``self.result``."""
        self.result = get_and_unwrap(self.href, cookies=Constants.COOKIE.value)

    def analyze(self, html):
        """Classify a parsed course page.

        Args:
            html: parsed page (normally ``self.result``); ``None`` is accepted
                and yields ``None``.

        Returns:
            ``Webpage`` when the page contains a single ``div#wiki_page_show``;
            otherwise ``Immediate`` when ``div#content`` holds exactly one link
            and exactly one ``h2``; otherwise ``None``. The result is also
            stored on ``self.archive``.
        """
        archive = None
        if html is None:
            self.archive = archive
            return archive

        # Immediate: a lone link plus a lone heading inside div#content.
        # select_unique signals "more than one match" with AssertionError,
        # which is treated as "this layout does not apply".
        try:
            link = select_unique(html, 'div#content a')
            title = select_unique(html, 'div#content h2')
        except AssertionError:
            link = title = None
        if link is not None and title is not None:
            archive = Immediate(link.get('href'), title.getText())

        # Webpage: select_unique returns None (it does not raise) when the
        # marker is absent, so the match must be tested explicitly; otherwise
        # every page would be classified as a Webpage.
        try:
            wiki = select_unique(html, 'div#wiki_page_show')
        except AssertionError:
            wiki = None
        if wiki is not None:
            archive = Webpage(self.href)

        self.archive = archive
        return archive
    
class Department:
    """A department entry of a semester, holding its courses per category.

    Attributes:
        name: label given by the caller.
        href: URL of the department page; relative links are resolved against
            ``Constants.CMU_CANVAS_URL``.
        courses: mapping of category label to a list of ``Course``; empty
            until ``get()`` has run.
    """

    def __init__(self, name, href):
        self.name = name
        # Resolve relative links like Course and ArchivedSemester do; an
        # already-absolute URL keeps its own host. A missing href is stored
        # as-is rather than being replaced by the base URL.
        self.href = urljoin(Constants.CMU_CANVAS_URL.value, href) if href else href
        self.courses = {}

    def __repr__(self):
        # str() keeps repr() usable when the link carried no title (name is None).
        return str(self.name)

    @staticmethod
    def get_category(html, cat: str):
        """Collect the courses listed under the section labelled ``cat``.

        Args:
            html: parsed department page.
            cat: ``aria-label`` of the section to read.

        Returns:
            A list of ``Course`` objects, each tagged with ``cat``.
        """
        courses = html.select(f'div[aria-label="{cat}"] > div.content > ul.context_module_items > li[id^="context_module_item_"] > div.ig-row > div.ig-info > div.module-item-title > span.item_name > a.ig-title')
        # Pass the category through; otherwise every course falls back to the
        # constructor default regardless of the section it was found in.
        return [Course(c.get('title'), c.get('href'), cat) for c in courses]

    def get(self):
        """Download the department page and fill ``self.courses`` for every configured category."""
        html = get_and_unwrap(self.href, cookies=Constants.COOKIE.value)
        self.courses = {cat: self.get_category(html, cat) for cat in Constants.COURSE_CATEGORIES.value}

class Semester:
    """A semester section of the registry page together with its department links."""

    def __init__(self, html):
        # `html` is the element for this semester's section; it is kept so the
        # label can be read lazily through `name`.
        self.html = html
        department_links = html.select('div.content > ul.context_module_items > li[id^="context_module_item_"] > div.ig-row > div.ig-info > div.module-item-title > span.item_name > a.external_url_link')
        collected = []
        for link in department_links:
            collected.append(Department(link.get('title'), link.get('href')))
        self.departments = collected

    @property
    def name(self):
        """The section's ``aria-label`` attribute."""
        label = self.html.get('aria-label')
        return label

    def __repr__(self):
        return self.name
    
class WebDriver:
    """Selenium Chrome session pre-loaded with the cookies from ``Constants.COOKIE``.

    Can be used as a context manager (``with WebDriver(url) as d: ...``), which
    shuts the browser down on exit; ``close()`` does the same explicitly.
    """

    def __init__(self, url=None, headless=True):
        """Start Chrome, install the Canvas cookies and optionally open ``url``.

        Args:
            url: page to open once the cookies are installed; skipped if None.
            headless: run Chrome without a visible window.
        """
        options = webdriver.ChromeOptions()
        if headless:
            # The `options.headless` attribute was deprecated and later removed
            # in Selenium 4; on newer versions assigning it is silently ignored.
            options.add_argument('--headless=new')
        driver = webdriver.Chrome(options=options)
        self.driver = driver
        try:
            # A cookie added without an explicit domain is bound to the domain
            # of the page currently loaded, so a Canvas page must be open
            # first. The registry URL is used because this notebook also
            # fetches it without authentication.
            driver.get(Constants.CMU_SYLLABUS_REGISTRY_URL.value)
            for cookie_name, cookie_value in Constants.COOKIE.value.items():
                if not cookie_value:
                    continue  # nothing to send for an unset cookie
                driver.add_cookie({'name': cookie_name, 'value': cookie_value})
            if url is not None:
                driver.get(url)
        except BaseException:
            # Do not leave a browser/chromedriver process behind on failure.
            driver.quit()
            raise

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def get(self, url):
        """Navigate the browser to ``url``."""
        self.driver.get(url)

    @property
    def html(self):
        """The current page source parsed with BeautifulSoup (``'html.parser'``)."""
        return BeautifulSoup(self.driver.page_source, 'html.parser')

    def close(self):
        """Shut down the browser and its driver process."""
        # quit() ends the whole session; driver.close() only closes the
        # current window and can leave the chromedriver process running.
        self.driver.quit()

class ArchivedSemester:
    """A semester listed under the registry's "Archive" section.

    The page is loaded through ``WebDriver`` (a real browser) and its
    department links are read from the rendered wiki content.

    Attributes:
        name: label given by the caller.
        href: absolute URL, resolved against ``Constants.CMU_CANVAS_URL``.
        departments: list of ``Department`` built from the links on the page.
    """

    def __init__(self, name, href):
        self.name = name
        self.href = urljoin(Constants.CMU_CANVAS_URL.value, href)
        driver = WebDriver(url=self.href)
        try:
            html = driver.html
        finally:
            # Always release the browser, even if reading the page fails.
            driver.close()
        departments = html.select('div#wiki_page_show > div.show-content > p > a')
        self.departments = [Department(d.getText(), d.get('href')) for d in departments]

    def __repr__(self):
        # str() keeps repr() usable when the link carried no title (name is None).
        return str(self.name)

class SyllabusRegistry:
    """Entry point: the registry landing page and every semester found on it.

    Attributes:
        html: parsed registry page.
        semesters: ``Semester`` objects for the Fall/Spring/Summer sections,
            followed by ``ArchivedSemester`` objects unless
            ``ignore_archived`` is true.
    """

    def __init__(self, html=None, ignore_archived=False):
        """Build the registry.

        Args:
            html: already-parsed registry page; downloaded when None.
            ignore_archived: skip the "Archive" section. Each archived
                semester is loaded through ``ArchivedSemester``, which opens a
                browser session.
        """
        if html is None:
            html = get_and_unwrap(Constants.CMU_SYLLABUS_REGISTRY_URL.value)
        self.html = html
        # The middle prefix was misspelled "String", so Spring sections were
        # never matched.
        semesters = html.select('div[aria-label^="Fall"], div[aria-label^="Spring"], div[aria-label^="Summer"]')
        self.semesters = [Semester(s) for s in semesters]

        if not ignore_archived:
            archived_semesters = html.select('div[aria-label="Archive"] > div.content > ul > li > div.ig-row > div.ig-info > div.module-item-title > span.item_name > a')
            self.semesters += [ArchivedSemester(s.get('title'), s.get('href')) for s in archived_semesters]

In [6]:
# Download the registry landing page once so later cells can reuse the parsed tree.
registry_url = Constants.CMU_SYLLABUS_REGISTRY_URL.value
html = get_and_unwrap(registry_url)

In [7]:
# Build the registry from the page fetched in the previous cell.
sr = SyllabusRegistry(html=html)

# First semester and its first department.
# NOTE(review): the names suggest Fall 2024 / Architecture, but the indices are
# positional and depend on the page's current ordering; confirm before relying on them.
f24 = sr.semesters[0]
f24_archi = f24.departments[0]
f24_archi.get()
f24_archi_courses = f24_archi.courses

# Spot-check a single course: fetch its page and display the parsed result.
sample_course = f24_archi_courses['Available Syllabi'][3]
sample_course.href
sample_course.get()
sample_course.result

<!DOCTYPE html>

<html dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<meta content="#333333" name="theme-color"/>
<meta content="noindex,nofollow" name="robots">
<meta content="app-id=480883488" name="apple-itunes-app"/>
<link href="/web-app-manifest/manifest.json" rel="manifest"/>
<meta content="524ee4609c494b51801af96629a874c3-893939ebf57a4f3a-0" name="sentry-trace">
<title>48634-A: Architecture (48XXX)</title>
<link as="font" crossorigin="anonymous" href="https://du11hjcvx0uqb.cloudfront.net/dist/fonts/lato/extended/Lato-Regular-bd03a2cc27.woff2" rel="preload" type="font/woff2"/>
<link as="font" crossorigin="anonymous" href="https://du11hjcvx0uqb.cloudfront.net/dist/fonts/lato/extended/Lato-Bold-cccb897485.woff2" rel="preload" type="font/woff2"/>
<link as="font" crossorigin="anonymous" href="https://du11hjcvx0uqb.cloudfront.net/dist/fonts/lato/extended/Lato-Italic-4eb103b4d1.woff2" rel="preload" type="font/w

In [None]:
# Department at position 49 of the first semester.
# NOTE(review): presumably Computer Science, going by the variable name; the
# index is positional, so confirm against the live page.
f24_cs = f24.departments[49]
f24_cs.get()

# Fetch and classify every available syllabus; pages that match no known
# layout are reported as 'abnormal'.
for course in f24_cs.courses['Available Syllabi']:
    course.get()
    analysis = course.analyze(course.result)
    if analysis is not None:
        print(analysis)
    else:
        print('abnormal:', course.href)

In [None]:
# Full crawl: every semester -> every department -> every available syllabus.
# Each course page is fetched and classified; unclassifiable pages are
# reported as 'abnormal' together with their URL.
for semester in sr.semesters:
    print(semester.name)
    for department in semester.departments:
        print(department.name)
        department.get()
        for course in department.courses['Available Syllabi']:
            course.get()
            archive = course.analyze(course.result)
            if archive is None:
                print('abnormal:', course.href)
                continue
            print(archive)