In [1]:
import requests
from bs4 import BeautifulSoup
import typing
from pathlib import Path

In [31]:
from enum import Enum
class Constants(Enum):
    CMU_SYLLABUS_REGISTRY_URL = 'https://canvas.cmu.edu/courses/sis_course_id:syllabus-registry' 
    CMU_CANVAS_URL = 'https://canvas.cmu.edu' 
    COOKIE = {
        'canvas_session': '0Xjy2uBxLZaKS9SOOt5d0w+7Q0vngHvo-ASQV8NUKFYboSHhDJ6FPiLcxqeN7RcL4PFfokmYQMku0GKaFXh4kul85-nF8VHxmVCbTPa0RYypbiIgex2L8AB3o2ltb-ScqWiHmeg43FMDBvCLgkr9C3UUF7xlbnu4-aOrqvQ2Ml0x7lElCk0w1DEqadI_CPhEQKX0r2V-iJSeAW21NrjRNe62wcSoziWd8elcQxfn1GmiXoLjYPX4qyOheXKKn5nHJokAwtpWEqI5KZJBhhmEc0fhzOYx2tYYM4rUZZ2_SmOPojvYbyXjd2KUEt_9Le-dnuGihmgExAxsDcBDaLW5bIQh8ipljug5jUqzAu55LmCQjN7mc6QjOHskWV8qm_IbYMmeqgqPOrBIC0Mlxxb8w7IMLdqfKopDl5B0yCKirbukZ6Sb3h1QxhjobtgeNg_n5ONwwTIXlgFiYJiwBT9rVvS28F_2UOiNOjWYcrFESgFBN5tZqCpeJatk9ibPMW7ire19kQ-9WsPxJ_Q6mYoJ8kCqa0duQ59Y1EUbbTuu1s9D4AjGZ4d08IGm4_CRiUhssx_JxF_ijC4Gj_eL8r4RAT7YZm1NLf1c5INRm1BnOTN30-llz7CgjTEV3uITfmGTO3-hXURIgvHQAyc3kg7Q_md-ycit687FF_-bR6rS96FyNoVgs6YNBeiNw12wV2gIN1J-q9NbRynFedzpJ4MJGTxpzIpxGNXZR9EMVd7jC2SLo5nTdiV3awWPhEJ69z1FHv0GEQ063lU_HjGZWRpEgdXoqO2jnhb1Ol7D7PxdFGwtdJHfRqIgfaXK1SHvYN7WTEMia-GLlYEy-5IzdmDij_PLZymh2DqanBK8WjjQi65xw.tu12CybyCaTWXVkDEBqhmLb2d4Y.Z1_Vqw'
    }
    COURSE_CATEGORIES = [
        'Available Syllabi',
        'Unavailable Syllabi',
        'Individualized Experiences',
    ]

In [3]:
def get_and_unwrap(*args, **kwargs) -> BeautifulSoup:
    response = requests.get(*args, **kwargs)
    assert response.status_code == 200
    return BeautifulSoup(response.text, 'html.parser')
def select_unique(html, *args, **kwargs):
    result = html.select(*args, **kwargs)
    assert len(result) == 1, 'Selection is not unique!'
    return result[0]

In [97]:
from dataclasses import dataclass

@dataclass
class Immediate:
    syllabus_url: str
    file_name: str
    
@dataclass
class Webpage:
    webpage_url: str

In [119]:
class Course:
    def __init__(self, html, cat : str='Available Syllabi'):
        self.html = html
        self.a = select_unique(self.html, 'a.ig-title')
        self.cat = cat
    @property 
    def name(self):
        return self.a.get('title')
    def __repr__(self):
        return self.name
    @property
    def href(self):
        return Constants.CMU_CANVAS_URL.value + self.a.get('href')
    def get(self):
        self.result = get_and_unwrap(self.href, cookies=Constants.COOKIE.value)
    def analyze(self, html):
        archive = None
        try:
            # Immediate
            syllabus_url = select_unique(html, 'div#content a').get('href')
            file_name = select_unique(html, 'div#content h2').getText()
            archive = Immediate(syllabus_url, file_name)
        except: 
            pass
        
        try:
            select_unique(html, 'div#wiki_page_show')
            archive = Webpage(self.href)
        except:
            pass
        self.archive = archive
        return archive
    
class Department:
    def __init__(self, html):
        self.html = html
        self.a = select_unique(self.html, 'a.external_url_link')
    @property 
    def name(self):
        return self.a.get('title')
    def __repr__(self):
        return self.name
    @property
    def href(self):
        return self.a.get('href')
    @staticmethod
    def get_category(html, cat: str):
        category = select_unique(html, f'div[aria-label="{cat}"]')
        courses = category.select('div.content > ul.context_module_items > li[id^="context_module_item_"] > div.ig-row > div.ig-info > div.module-item-title > span.item_name')
        return courses
    def get(self):
        html = get_and_unwrap(self.href, cookies=Constants.COOKIE.value)
        self.courses = {cat: [Course(c, cat) for c in self.get_category(html, cat)] for cat in Constants.COURSE_CATEGORIES.value}

class Semester:
    def __init__(self, html):
        self.html = html
        departments = html.select('div.content > ul.context_module_items > li[id^="context_module_item_"] > div.ig-row > div.ig-info > div.module-item-title > span.item_name')
        self.departments = [Department(d) for d in departments]
    @property 
    def name(self):
        return self.html.get('aria-label')
    def __repr__(self):
        return self.name

class SyllabusRegistry:
    def __init__(self, html=None):
        if html is None:
            html = get_and_unwrap(Constants.CMU_SYLLABUS_REGISTRY_URL.value)
        self.html = html
        semesters = html.select('div[aria-label^="Fall"], div[aria-label^="String"], div[aria-label^="Summer"]')
        self.semesters = [Semester(s) for s in semesters]

In [None]:
html = get_and_unwrap(Constants.CMU_SYLLABUS_REGISTRY_URL.value)

In [None]:
sr = SyllabusRegistry(html=html)
f24 = sr.semesters[0]
f24_archi = f24.departments[0]
f24_archi.get()
f24_archi_courses = f24_archi.courses

f24_archi_courses['Available Syllabi'][3].href
f24_archi_courses['Available Syllabi'][3].get()
f24_archi_courses['Available Syllabi'][3].result

In [122]:
f24_cs = f24.departments[49]
f24_cs.get()
f24_cs_courses = f24_cs.courses

for course in f24_cs_courses['Available Syllabi']:
    course.get()
    analysis = course.analyze(course.result)
    if analysis == None:
        print('abnormal:', course.href)
    else:
        print(analysis)

Webpage(webpage_url='https://canvas.cmu.edu/courses/40981/modules/items/5809456')
Webpage(webpage_url='https://canvas.cmu.edu/courses/40981/modules/items/5809457')
Webpage(webpage_url='https://canvas.cmu.edu/courses/40981/modules/items/5809458')
Webpage(webpage_url='https://canvas.cmu.edu/courses/40981/modules/items/5809459')
Webpage(webpage_url='https://canvas.cmu.edu/courses/40981/modules/items/5809461')
Immediate(syllabus_url='/courses/40981/files/11446369/download?download_frd=1', file_name='15121-2.pdf')
Immediate(syllabus_url='/courses/40981/files/11446370/download?download_frd=1', file_name='15121-W.pdf')
Immediate(syllabus_url='/courses/40981/files/11426468/download?download_frd=1', file_name='15150-3.pdf')
Immediate(syllabus_url='/courses/40981/files/11426469/download?download_frd=1', file_name='15150-W.pdf')


KeyboardInterrupt: 

In [121]:
for semester in sr.semesters:
    print(semester.name)
    for department in semester.departments:
        print(department.name)
        department.get()
        for course in department.courses['Available Syllabi']:
            course.get()
            archive = course.analyze(course.result)
            if archive == None:
                print('abnormal:', course.href)
            else:
                print(archive)

Fall 2024 (F24)
Architecture (48XXX)
Webpage(webpage_url='https://canvas.cmu.edu/courses/40932/modules/items/5803322')
Immediate(syllabus_url='/courses/40932/files/11458633/download?download_frd=1', file_name='48531-A.pdf')
Webpage(webpage_url='https://canvas.cmu.edu/courses/40932/modules/items/5803465')
Webpage(webpage_url='https://canvas.cmu.edu/courses/40932/modules/items/5803464')
Webpage(webpage_url='https://canvas.cmu.edu/courses/40932/modules/items/5803360')
Webpage(webpage_url='https://canvas.cmu.edu/courses/40932/modules/items/5803380')
Webpage(webpage_url='https://canvas.cmu.edu/courses/40932/modules/items/5803387')
Immediate(syllabus_url='/courses/40932/files/11458634/download?download_frd=1', file_name='48771-A.pdf')
Art (60XXX)
Immediate(syllabus_url='/courses/40933/files/11496475/download?download_frd=1', file_name='60105-A.docx')
Immediate(syllabus_url='/courses/40933/files/11496476/download?download_frd=1', file_name='60105-B.docx')
Immediate(syllabus_url='/courses/4093

AssertionError: 

In [130]:
sr.semesters

[Fall 2024 (F24),
 Summer 2024 (N24),
 Summer 2024 (M24),
 Fall 2023 (F23),
 Summer 2023 (N23),
 Summer 2023 (M23),
 Fall 2022 (F22),
 Summer 2022 (N22),
 Summer 2022 (M22),
 Fall 2021 (F21),
 Summer 2021 (N21),
 Summer 2021 (M21),
 Fall 2020 (F20),
 Summer 2020 (N20),
 Summer 2020 (M20),
 Fall 2019 (F19),
 Summer 2019 (N19),
 Summer 2019 (M19)]

In [141]:
sr.semesters[0].html

<div aria-label="Fall 2024 (F24)" class="item-group-condensed context_module" data-module-id="275343" data-module-url="/courses/3769/modules/275343" data-workflow-state="active" id="context_module_275343" style="">
<a id="module_275343"></a>
<div class="ig-header header" id="275343">
<span class="sortable-handle reorder_module_link" style="display: none;" title="Drag to reorder modules">
<i aria-hidden="true" class="icon-drag-handle"></i>
</span>
<h2 class="screenreader-only">Fall 2024 (F24)</h2>
<span aria-controls="context_module_content_275343" aria-expanded="true" aria-label="Fall 2024 (F24) toggle module visibility" class="ig-header-title collapse_module_link ellipsis" href="/courses/3769/modules/275343/collapse" role="button" tabindex="0" title="Fall 2024 (F24)">
<i class="icon-mini-arrow-down"></i>
<span class="name">Fall 2024 (F24)</span>
</span>
<span aria-controls="context_module_content_275343" aria-expanded="false" aria-label="Fall 2024 (F24) toggle module visibility" class

In [143]:
archived_semesters = html.select('div[aria-label="Archive"] > div.content > ul > li > div.ig-row > div.ig-info > div.module-item-title > span.item_name > a')

In [152]:
class ArchivedSemester:
    def __init__(self, html):
        self.html = html
        page = get_and_unwrap(self.href)
        self.departments = page.select('div#wiki_page_show > div.show-content > p > a')
    @property
    def href(self):
        from urllib.parse import urljoin
        return urljoin(Constants.CMU_CANVAS_URL.value, self.html.get('href'))
    @property 
    def name(self):
        return self.html.getText()
    def __repr__(self):
        return self.name

In [153]:
s = ArchivedSemester(archived_semesters[0])

In [156]:
page = get_and_unwrap(s.href)

In [None]:
page.select('div#wiki_page_show > div.show-content > p > a')

[]

In [194]:
class WebDriver:
    def __init__(self, url=None):
        from selenium import webdriver
        driver = webdriver.Chrome()
        self.driver = driver
        driver.get('https://www.google.com')
        driver.add_cookie({'name': list(Constants.COOKIE.value.keys())[0], 'value': list(Constants.COOKIE.value.values())[0]})
        if url is not None:
            driver.get(url)
    def get(self, url):
        self.driver.get(url)
    @property
    def html(self):
        return BeautifulSoup(self.driver.page_source, 'html.parser')


In [195]:
driver = WebDriver(url=s.href)

In [196]:
bs = driver.html

In [197]:
bs.select('div#wiki_page_show > div.show-content > p > a')

[<a href="https://canvas.cmu.edu/courses/8303">Architecture (48XXX)</a>,
 <a href="https://canvas.cmu.edu/courses/8304">Art (60XXX)</a>,
 <a href="https://canvas.cmu.edu/courses/8305">BXA Intercollege Degree Programs (52XXX)</a>,
 <a href="https://canvas.cmu.edu/courses/8306">Biological Sciences (03XXX)</a>,
 <a href="https://canvas.cmu.edu/courses/8307">Biomedical Engineering (42XXX)</a>,
 <a href="https://canvas.cmu.edu/courses/8308">Business Administration (70XXX)</a>,
 <a href="https://canvas.cmu.edu/courses/8309">CFA Interdisciplinary (62XXX)</a>,
 <a href="https://canvas.cmu.edu/courses/8310">CIT Interdisciplinary (39XXX)</a>,
 <a href="https://canvas.cmu.edu/courses/8311">Carnegie Mellon University-Wide Studies (99XXX)</a>,
 <a href="https://canvas.cmu.edu/courses/8312">Center for the Arts in Society (64XXX)</a>,
 <a href="https://canvas.cmu.edu/courses/8313">Center for the Neural Basis of Cognition (86XXX)</a>,
 <a href="https://canvas.cmu.edu/courses/8314">Chemical Engineering