In [1]:
%%writefile scrape_emeritus_modules.py

import asyncio
from playwright.async_api import async_playwright
import pandas as pd

async def scrape_emeritus_program(
    url="https://classroom.emeritus.org/courses/12959/pages/program-outline-professional-certificate-in-machine-learning-and-artificial-intelligence?module_item_id=2328817",
    output_csv="modules_scraped.csv",
    headless=False,
):
    """Scrape a program-outline page and write one CSV row per module.

    The page's ``h3`` and ``h4`` elements are visited in the order returned by
    ``query_selector_all``. Every ``h3`` starts a new module. An ``h4`` whose
    text contains "Learning Outcomes" or "Key Activities" contributes the text
    of the element immediately following it to the current module.

    Args:
        url: Page to open. Defaults to the outline URL this script was
            written for.
        output_csv: Path of the CSV file to write.
        headless: Forwarded to ``chromium.launch``. Defaults to ``False``
            (visible browser window).

    Returns:
        None. The result is written to ``output_csv`` and a confirmation
        line is printed.
    """
    columns = ["Module", "Learning Outcomes", "Key Activities"]

    def _blank_module(title=None):
        # One output row; section texts stay empty until a heading fills them.
        return {"Module": title, "Learning Outcomes": "", "Key Activities": ""}

    async def _next_sibling_text(heading):
        """Return the stripped text of the element after ``heading``, or None."""
        handle = await heading.evaluate_handle("el => el.nextElementSibling")
        # evaluate_handle always returns a handle object, even when the
        # JavaScript value is null, so a truthiness check cannot detect a
        # missing sibling. as_element() returns None in that case.
        sibling = handle.as_element()
        if sibling is None:
            return None
        return (await sibling.inner_text()).strip()

    extracted = []
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=headless)
        try:
            page = await browser.new_page()

            await page.goto(url)
            # Fixed pause to give the page time to render its content.
            await page.wait_for_timeout(5000)

            # TODO: Update selectors after inspecting the actual page
            # Only headings are queried: the loop below has no branch for any
            # other tag, so fetching lists here would just cost round trips.
            headings = await page.query_selector_all("h3, h4")

            current_module = _blank_module()

            for el in headings:
                tag = await el.evaluate("el => el.tagName")
                text = (await el.inner_text()).strip()

                if tag == "H3":
                    # A new module heading closes the previous module, if any.
                    if current_module["Module"]:
                        extracted.append(current_module)
                    current_module = _blank_module(text)
                    continue

                if tag != "H4":
                    continue

                # "Learning Outcomes" takes precedence when both labels appear.
                if "Learning Outcomes" in text:
                    section = "Learning Outcomes"
                elif "Key Activities" in text:
                    section = "Key Activities"
                else:
                    continue

                sibling_text = await _next_sibling_text(el)
                if sibling_text is not None:
                    current_module[section] = sibling_text

            if current_module["Module"]:
                extracted.append(current_module)
        finally:
            # Close the browser even when navigation or scraping fails.
            await browser.close()

    # Explicit columns keep the CSV header present when nothing was scraped.
    df = pd.DataFrame(extracted, columns=columns)
    df.to_csv(output_csv, index=False)
    print(f"Scraping complete. Data saved to {output_csv}")

if __name__ == "__main__":
    # Script entry point: build the scraper coroutine and drive it to
    # completion on a fresh event loop.
    scrape_job = scrape_emeritus_program()
    asyncio.run(scrape_job)

Writing scrape_emeritus_modules.py


In [18]:
from bs4 import BeautifulSoup
import pandas as pd

def _items_of_list_within(nodes, start, list_tags, reach=4):
    """Return the <li> texts of the first list tag found just after nodes[start].

    Only the ``reach`` entries that follow position ``start`` are inspected.
    Returns None when none of them is a tag whose name is in ``list_tags``.
    """
    for candidate in nodes[start + 1:start + 1 + reach]:
        if candidate.name in list_tags:
            return [item.get_text(strip=True) for item in candidate.find_all("li")]
    return None


def _as_bullets(items):
    """Join items into one string, each item prefixed by a newline and "• "."""
    return "".join("\n• " + item for item in items)


# Parse the saved copy of the outline page.
with open("module_outline.html", "r", encoding="utf-8") as html_file:
    soup = BeautifulSoup(html_file, "html.parser")

modules = []

for summary_tag in soup.find_all("summary"):
    outcomes = []
    activities = []

    # The module body is the first <div> sibling after the <summary>.
    body = summary_tag.find_next_sibling("div")
    # Flatten every descendant so a heading's list can be found by position.
    descendants = body.find_all(recursive=True) if body else []

    for position, node in enumerate(descendants):
        lowered = node.get_text(strip=True).lower()

        if "learning outcomes" in lowered:
            found = _items_of_list_within(descendants, position, ("ul", "ol"))
            if found is not None:
                outcomes = found

        if "key activities" in lowered:
            found = _items_of_list_within(descendants, position, ("ul",))
            if found is not None:
                activities = found

    modules.append({
        "Module": summary_tag.get_text(strip=True),
        "Learning Outcomes": _as_bullets(outcomes),
        "Key Activities": _as_bullets(activities),
    })

df = pd.DataFrame(modules)
df.to_csv("modules_scraped_clean.csv", index=False)
print("✅ Done. Output saved to modules_scraped_clean.csv")

✅ Done. Output saved to modules_scraped_clean.csv


In [19]:
from bs4 import BeautifulSoup
import pandas as pd

# Parse the saved copy of the outline page.
with open("module_outline.html", "r", encoding="utf-8") as html_file:
    soup = BeautifulSoup(html_file, "html.parser")

modules = []

for details_tag in soup.find_all("details"):
    summary_tag = details_tag.find("summary")
    if summary_tag:
        # The title is the first <strong> in the summary, else the first <span>.
        title_tag = summary_tag.find("strong") or summary_tag.find("span")
        if title_tag:
            title = title_tag.get_text(strip=True)
        else:
            title = "Untitled Module"

        outcome_items = []
        activity_items = []

        # Section headings are <strong> tags; each section's list is looked up
        # among the following siblings of the heading's parent element.
        for label_tag in details_tag.find_all("strong"):
            label = label_tag.get_text(strip=True).lower()

            if "learning outcomes" in label:
                outcomes_list = label_tag.find_parent().find_next_sibling("ol")
                if outcomes_list:
                    outcome_items = [
                        item.get_text(strip=True)
                        for item in outcomes_list.find_all("li")
                    ]

            if "key activities" in label:
                # Key activities are collected from a sibling <div>, not a list tag.
                activities_box = label_tag.find_parent().find_next_sibling("div")
                if activities_box:
                    activity_items = [
                        item.get_text(strip=True)
                        for item in activities_box.find_all("li")
                    ]

        # Each item goes on its own line, prefixed by "• "; no items gives "".
        row = {"Module": title}
        row["Learning Outcomes"] = "".join("\n• " + item for item in outcome_items)
        row["Key Activities"] = "".join("\n• " + item for item in activity_items)
        modules.append(row)

df = pd.DataFrame(modules)
df.to_csv("modules_scraped_fixed.csv", index=False)
print("✅ Clean export complete → modules_scraped_fixed.csv")

✅ Clean export complete → modules_scraped_fixed.csv
