In [3]:
import io
import pandas as pd
import requests
from bs4 import BeautifulSoup
import pickle
from datetime import datetime
from tqdm import tqdm
import os
from datetime import timezone
import json
import time

## Part 1. Obtaining Course Eval URLs

In [None]:
import io
import pandas as pd

# Parse the header-less CSV text held in SUBJECT_COOKIES into a DataFrame.
# NOTE(review): SUBJECT_COOKIES is not defined in any visible cell; it must be
# set before this cell runs.
subject_cookie_csv = io.StringIO(SUBJECT_COOKIES)
sub = pd.read_csv(subject_cookie_csv, header=None)

In [19]:
# Collapse the two-column cookie table into a dict mapping column 0 to column 1.
# The isinstance guard keeps this cell re-runnable: after the first run `sub`
# is already a dict, which has no `set_index`.
if not isinstance(sub, dict):
    sub = sub.set_index(0)[1].to_dict()

# Session that carries these cookies on every request it sends.
server = requests.Session()
server.cookies.update(sub)

In [None]:
def get_pages_by_subject(server, urls, path = 'data/raw/subject_pages.jsonl', timeout = 30, delay = 2):
    """
    Fetch every URL in `urls` with `server` and write one JSON record per line to `path`.

    Each record holds "url", "fetched_at" (UTC, ISO-8601), "ok" and "status_code".
      * Body contains "Academic Year": "ok" is True and the body is stored under "html".
      * Body lacks the marker: "ok" is False, with an "error" message and the first
        500 characters of the body under "preview".
      * The request raised a requests exception: "ok" is False, "status_code" is None
        and "error" carries the exception text.
    Records are flushed one at a time, so an interrupted run keeps what was fetched.

    Parameters
    ----------
    server : object exposing `get(url, timeout=...)`, e.g. a requests.Session.
    urls : iterable of URL strings.
    path : output JSONL file. It is overwritten on every call and its parent
        directory is created when missing.
    timeout : seconds handed to `server.get`, so a stalled connection cannot hang the run.
    delay : seconds to wait between two consecutive requests.

    Returns
    -------
    The `path` that was written.
    """
    out_dir = os.path.dirname(path)
    if out_dir:  # dirname is "" for a bare filename; makedirs("") would raise
        os.makedirs(out_dir, exist_ok=True)

    with open(path, "w", encoding="utf-8") as f:
        for i, url in enumerate(tqdm(urls)):
            # Pause between requests only; no wait before the first or after the last.
            if i:
                time.sleep(delay)

            rec = {"url": url, "fetched_at": datetime.now(timezone.utc).isoformat()}
            try:
                resp = server.get(url, timeout=timeout)
                text = resp.text or ""
                status_code = getattr(resp, "status_code", None)
                if "Academic Year" not in text:
                    rec.update({
                        "ok": False,
                        "status_code": status_code,
                        "error": "Missing 'Academic Year' marker", # Assume valid page should have this
                        "preview": text[:500]
                    })
                else:
                    rec.update({
                        "ok": True,
                        "status_code": status_code,
                        "html": text
                    })
            except requests.exceptions.RequestException as e:
                # Timeouts are RequestExceptions too, so they are logged here as well.
                rec.update({
                    "ok": False,
                    "status_code": None,
                    "error": f"RequestException: {e}"
                })

            f.write(json.dumps(rec, ensure_ascii=False) + "\n")
            f.flush()

    return path


In [None]:
# One department code per line. Whitespace is stripped and blank lines are dropped
# so that an empty line cannot yield a URL with an empty Department parameter.
with open('data/raw/course_codes.txt', 'r', encoding='utf-8') as f:
    codes = [line.strip() for line in f if line.strip()]

# One listing URL per department code (AcademicYear=2024, AcademicTerm=All).
urls = [f'https://coursefeedback.uchicago.edu/?Department={code}&AcademicYear=2024&AcademicTerm=All' for code in codes]

In [None]:
# Fetch only the first two subject URLs; the cell displays the returned output path.
get_pages_by_subject(server, urls[:2])

100%|██████████| 2/2 [00:23<00:00, 11.71s/it]


'data/course_urls.jsonl'

In [None]:
from bs4 import BeautifulSoup

# Collect the first link inside every <td class="course"> cell of each subject page
# that was fetched successfully ("ok" is true). The input path matches the default
# output path of get_pages_by_subject, so this cell reads what the scraping cell wrote.
links = []

with open('data/raw/subject_pages.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # a blank line would make json.loads raise
        page = json.loads(line)
        if not page.get("ok"):
            continue
        soup = BeautifulSoup(page.get("html", ""), "html.parser")
        for td in soup.find_all("td", class_="course"):
            a = td.find("a", href=True)
            if a:
                links.append(a["href"])

# Persist the collected links, one per line, for the course-page scraper.
with open('data/raw/course_urls.txt', 'w', encoding='utf-8') as f:
    for link in links:
        f.write(link + '\n')


## Part 2. Obtaining Course Eval HTMLs

In [None]:
import io
import pandas as pd

# Parse the header-less CSV text held in COOKIES into a DataFrame.
# NOTE(review): COOKIES is not defined in any visible cell; it must be set
# before this cell runs.
cookie_csv = io.StringIO(COOKIES)
cookies = pd.read_csv(cookie_csv, header=None)


In [18]:
# Collapse the two-column cookie table into a dict mapping column 0 to column 1.
# The isinstance guard keeps this cell re-runnable: after the first run `cookies`
# is already a dict, which has no `set_index`.
if not isinstance(cookies, dict):
    cookies = cookies.set_index(0)[1].to_dict()

# Session that carries these cookies on every request it sends.
s = requests.Session()
s.cookies.update(cookies)

In [None]:
import os, json, requests
from datetime import datetime
from tqdm import tqdm

def scrape_responses_jsonl(urls, session, out, timeout=30, delay=0.3):
    """
    Fetch every URL in `urls` with `session` and append one JSON record per line to `out`.

    Each record holds "timestamp" (UTC, "YYYY-MM-DDTHH:MM:SSZ"), "url", "status_code"
    and "ok".
      * Body contains "Number Enrolled": "ok" is True and the body is stored under "text".
      * Body lacks the marker: "ok" is False with an "error" message.
      * The request raised a requests exception: "ok" is False, "status_code" is None
        and "error" holds "<ExceptionType>: <message>".

    Parameters
    ----------
    urls : iterable of URL strings.
    session : object exposing `get(url, timeout=...)`, e.g. a requests.Session.
    out : output JSONL file. It is opened in append mode, so repeated calls add
        records rather than replacing them. Its parent directory is created when missing.
    timeout : seconds handed to `session.get`.
    delay : seconds to sleep after each request.

    Returns
    -------
    dict with "out_path", "successes" and "failures".
    """
    out_dir = os.path.dirname(out)
    if out_dir:  # dirname is "" for a bare filename; makedirs("") would raise
        os.makedirs(out_dir, exist_ok=True)

    successes = failures = 0

    with open(out, "a", encoding="utf-8") as out_f:
        for url in tqdm(urls):
            # Timezone-aware replacement for the deprecated datetime.utcnow();
            # produces the same second-resolution "...Z" string.
            ts = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
            rec = {"timestamp": ts, "url": url}
            try:
                r = session.get(url, timeout=timeout)
                if "Number Enrolled" in r.text:
                    rec.update({
                        "status_code": r.status_code,
                        "ok": True,
                        "text": r.text
                    })
                    successes += 1
                else:
                    rec.update({
                        "status_code": r.status_code,
                        "ok": False,
                        "error": '"Number Enrolled" not found'
                    })
                    failures += 1

            except requests.exceptions.RequestException as e:
                rec.update({
                    "status_code": None,
                    "ok": False,
                    "error": f"{type(e).__name__}: {e}"
                })
                failures += 1

            # Flush per record so an interrupted run keeps everything fetched so far.
            out_f.write(json.dumps(rec, ensure_ascii=False) + "\n")
            out_f.flush()

            time.sleep(delay)

    return {"out_path": out, "successes": successes, "failures": failures}


In [None]:
# Load the course URLs that were saved one per line.
with open('data/raw/course_urls.txt', mode='r', encoding='utf-8') as f:
    url_text = f.read()
course_urls = url_text.splitlines()

In [None]:
# Fetch every course page with session `s`; the cell displays the returned summary dict.
scrape_responses_jsonl(urls=course_urls, session=s, out="data/raw/course_pages.jsonl")

100%|██████████| 17/17 [00:06<00:00,  2.57it/s]


{'out_path': 'raw_data/course_pages.jsonl', 'successes': 17, 'failures': 0}

## Part 3. Extracting Images

In [None]:
import os
import json
import base64
import hashlib
import mimetypes
from datetime import datetime
from typing import List, Dict, Any

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from tqdm import tqdm


def fetch_course_image_urls(course_url, course_html):
    """
    List the images found in the `.FrequencyBlockRow` elements of a course page.

    Returns a list of dicts {"question": <str>, "src": <str>}:
      * "question" is the stripped text of the row's `.FrequencyQuestionTitle span`,
        or "No Question Text" when that element is absent;
      * "src" is the <img> src joined onto `course_url` (an empty base is used
        when `course_url` is falsy).
    Rows without an <img>, or whose <img> has a missing or empty src, are skipped.
    """
    base_url = course_url or ""
    parsed = BeautifulSoup(course_html, "html.parser")
    results = []

    for row in parsed.select(".FrequencyBlockRow"):
        image_tag = row.select_one("img")
        raw_src = image_tag.attrs.get("src") if image_tag else None
        if not raw_src:
            # No usable image in this row.
            continue

        title_el = row.select_one(".FrequencyQuestionTitle span")
        if title_el:
            question_text = title_el.get_text(strip=True)
        else:
            question_text = "No Question Text"

        results.append({
            "question": question_text,
            "src": urljoin(base_url, raw_src),
        })

    return results


def _download_image(session, question, src_url):
    """Fetch one image; return its payload record, or an error record on a requests failure."""
    try:
        r = session.get(src_url, timeout=30)
        r.raise_for_status()
        data = r.content

        # Prefer the server-declared type, then a guess from the URL, then a generic fallback.
        mime = (r.headers.get("Content-Type")
                or mimetypes.guess_type(src_url)[0]
                or "application/octet-stream")
        mime = mime.split(";")[0].strip()  # drop parameters such as "; charset=..."

        return {
            "question": question,
            "mime": mime,
            "sha256": hashlib.sha256(data).hexdigest(),
            "n_bytes": len(data),
            "data_base64": base64.b64encode(data).decode("ascii")
        }
    except requests.RequestException as e:
        return {
            "question": question,
            "src": src_url,
            "error": f"{type(e).__name__}: {e}"
        }


def build_image_bytes_jsonl(
    session,
    course_pages_path = "raw_data/course_pages.jsonl",
    out_path = "raw_data/image_bytes.jsonl",
):
    """
    Reads course_pages.jsonl lines of the form:
      {"url": "<page_url>", "ok": true, "text": "<html>..."}
    For each page, extracts images and writes one JSON object per line to `out_path`:
      {
        "url": "<page_url>",
        "images": [
          {
            "question": "...",
            "mime": "image/png",
            "sha256": "<hex>",
            "n_bytes": <int>,
            "data_base64": "<...>"
          },
          # or, on error:
          {
            "question": "...",
            "src": "https://...",
            "error": "RequestException: ..."
          }
        ]
      }

    Only images whose question text contains "Now that" or "Prior to" are downloaded,
    and each distinct image URL is fetched at most once per page. Blank lines,
    lines that are not valid JSON, and records lacking a truthy "ok", "text" or
    "url" are skipped without being counted.

    Parameters
    ----------
    session : object exposing `get(url, timeout=...)`, e.g. a requests.Session.
    course_pages_path : input JSONL file of course pages.
    out_path : output JSONL file. It is overwritten, and its parent directory is
        created when missing.

    Returns
    -------
    dict with "out_path" and "pages_processed" (number of lines written).
    """
    out_dir = os.path.dirname(out_path)
    if out_dir:  # dirname is "" for a bare filename; makedirs("") would raise
        os.makedirs(out_dir, exist_ok=True)
    pages = 0

    with open(course_pages_path, "r", encoding="utf-8") as src, \
         open(out_path, "w", encoding="utf-8") as dst:

        for line in tqdm(src, desc="Processing pages"):
            if not line.strip():
                continue

            try:
                rec = json.loads(line)
            except json.JSONDecodeError:
                # Best effort: a corrupt line must not abort the whole run.
                continue

            if not (rec.get("ok") and rec.get("text") and rec.get("url")):
                continue

            page_url = rec["url"]
            items = fetch_course_image_urls(page_url, rec["text"])

            out_images: List[Dict[str, Any]] = []
            seen_src = set()

            for it in items:
                # Fall back to "" so a missing question is filtered out instead of raising.
                q = it.get("question") or ""
                if "Now that" not in q and "Prior to" not in q:
                    continue

                src_url = it.get("src")
                if not src_url or src_url in seen_src:
                    continue
                seen_src.add(src_url)

                out_images.append(_download_image(session, q, src_url))

            dst.write(json.dumps({"url": page_url, "images": out_images}, ensure_ascii=False) + "\n")
            pages += 1

    return {"out_path": out_path, "pages_processed": pages}


In [None]:
# Run the image extraction over the saved course pages with session `s` and keep the summary dict.
result = build_image_bytes_jsonl(
    s,
    course_pages_path="data/raw/course_pages.jsonl",
    out_path="data/raw/image_bytes.jsonl",
)

Processing pages: 17it [00:00, 23.94it/s]
