In [24]:
# ────────────────────────────────────────────────────────────────
# 📦  Cell 1 – one-time setup
#     (installs Crawl4AI, Playwright + its browsers, and helpers)
# ────────────────────────────────────────────────────────────────
%pip install --quiet "crawl4ai[playwright]" requests tqdm nest_asyncio

import subprocess, sys, nest_asyncio
nest_asyncio.apply()

# install headless Chromium etc. (only the first time on a machine)
subprocess.run(
    [sys.executable, "-m", "playwright", "install", "--with-deps"],
    check=True,
)


Note: you may need to restart the kernel to use updated packages.


CompletedProcess(args=['/opt/anaconda3/bin/python', '-m', 'playwright', 'install', '--with-deps'], returncode=0)

In [26]:
# 🔧 install the missing converter
%pip install --quiet markdownify beautifulsoup4


Note: you may need to restart the kernel to use updated packages.


In [28]:
# ────────────────────────────────────────────────────────────────
# 🏃  Cell 2 – scrape an Essex course page to Markdown (cookie-free)
# ────────────────────────────────────────────────────────────────
import asyncio, os
from pathlib import Path
from urllib.parse import urljoin, urlsplit

import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md
from tqdm.auto import tqdm

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
# → If you prefer pure-HTTP (no browser) crawling, swap these lines:
# from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy

# ─────────── helpers ────────────────────────────────────────────
def _safe_name(u: str, idx: int) -> str:
    """Return a numbered filename preserving the URL’s extension."""
    ext = os.path.splitext(urlsplit(u).path)[1] or ".jpg"
    return f"{idx:03d}{ext}"

COOKIE_SELECTORS = [
    "#onetrust-banner-sdk",
    "#onetrust-consent-sdk",
    ".cookie-bar",
    ".ot-sdk-container",
]

async def scrape_to_markdown(
    url: str,
    out_md: Path = Path("output.md"),
    assets_dir: Path = Path("assets"),
):
    """Fetch `url`, clean it, download images, write Markdown."""
    run_cfg = CrawlerRunConfig(
        css_selector="#content",                 # Essex main wrapper
        wait_for_images=True,
        remove_overlay_elements=True,
        js_code="""(() => {
            const btn = document.querySelector('#onetrust-accept-btn-handler');
            if (btn) btn.click();
        })();""",                               # accept cookies
        markdown_generator=DefaultMarkdownGenerator(
            options={"ignore_links": False, "ignore_images": False}
        ),
        verbose=True,
    )

    # async with AsyncWebCrawler(crawler_strategy=AsyncHTTPCrawlerStrategy()) as crawler:
    async with AsyncWebCrawler() as crawler:    # Playwright default
        result = await crawler.arun(url, config=run_cfg)

    if not result.success:
        raise RuntimeError(result.error_message)

    # —— clean HTML ——————————————————————————
    html_content = result.html                 # str in crawl4ai 0.6.2
    soup = BeautifulSoup(html_content, "html.parser")

    for sel in COOKIE_SELECTORS:
        for node in soup.select(sel):
            node.decompose()                   # remove banner nodes

    # —— download images & relink ——————————
    assets_dir.mkdir(exist_ok=True)
    session = requests.Session()

    for idx, img in enumerate(
        tqdm(soup.find_all("img"), desc="images", unit="img")
    ):
        src_attr = img.get("data-src") or img.get("src")
        if not src_attr:
            continue
        full_url = urljoin(url, src_attr)      # make absolute
        try:
            r = session.get(full_url, timeout=30)
            r.raise_for_status()
            fname = _safe_name(full_url, idx)
            local_path = assets_dir / fname
            local_path.write_bytes(r.content)
            img["src"] = f"{assets_dir}/{fname}"   # point Markdown here
        except Exception as e:
            print(f"⚠ cannot fetch {full_url}: {e}")

    # —— HTML → Markdown ————————————————————
    markdown = md(str(soup), heading_style="ATX", strip=["script", "style"])
    out_md.write_text(markdown, encoding="utf-8")
    print(f"\n✅  Wrote {out_md} and saved {len(list(assets_dir.glob('*')))} image(s).")
    return out_md


# ————————————————————————————————————————————————
# CHANGE ONLY THIS URL BETWEEN RUNS
url = "https://www.essex.ac.uk/courses/PG00742/2/MSc-Data-Science"
# ————————————————————————————————————————————————

await scrape_to_markdown(url)


[INIT].... → Crawl4AI 0.6.2
[FETCH]... ↓ https://www.essex.ac.uk/courses/PG00742/2/MSc-Data-Science                                           | ✓ | ⏱: 5.11s
[SCRAPE].. ◆ https://www.essex.ac.uk/courses/PG00742/2/MSc-Data-Science                                           | ✓ | ⏱: 0.10s
[COMPLETE] ● https://www.essex.ac.uk/courses/PG00742/2/MSc-Data-Science                                           | ✓ | ⏱: 5.23s


images:   0%|          | 0/10 [00:00<?, ?img/s]


✅  Wrote output.md and saved 11 image(s).


PosixPath('output.md')

In [14]:
# 📄  Trim everything above "## Overview" in output.md
from pathlib import Path, PurePosixPath

src  = Path("output.md")          # original file
dest = Path("output_clean.md")    # cleaned file

if not src.exists():
    raise FileNotFoundError("output.md not found – run the scraper first.")

lines = src.read_text(encoding="utf-8").splitlines()
try:
    start = next(i for i, ln in enumerate(lines) if ln.lstrip().startswith("## Overview"))
except StopIteration:
    raise ValueError("'## Overview' header not found in output.md")

dest.write_text("\n".join(lines[start:]), encoding="utf-8")

print(f"✅  Wrote {dest} ({len(lines) - start} lines). Preview ↓\n")
print("\n".join(lines[start:start+30]))   # first 30 lines of the cleaned file


✅  Wrote output_clean.md (440 lines). Preview ↓

## Overview

The details

Course:  Data Science with Professional Placement

Start date:  October 2025

Study mode:  Full-time

Duration:  2 years

Location:  Colchester Campus

Based in:  [Mathematics, Statistics and Actuarial Science (School of)](https://www.essex.ac.uk/departments/mathematics-statistics-and-actuarial-science)

The techniques we use to model and manipulate data guide the political, financial, and social decisions that shape our modern society and are the basis of economy growth and business success. Technology is growing and evolving at an incredible speed, and the growth rate of both the data we generate and the devices we use to process it can only increase.

Data science is a growing and important field of study with an increasing number of jobs and opportunities within both the private and public sectors. The application of theory and methods to real-world problems is at the core of data science, which aims especia

In [19]:
#!/usr/bin/env python3
"""
trim_overview.py
~~~~~~~~~~~~~~~~
Cut everything ABOVE the first "## Overview" heading in a Markdown file.

• Jupyter use:
      # just run the cell; it uses output.md → output_clean.md
• CLI use:
      python trim_overview.py input.md output_clean.md
"""

from pathlib import Path
import argparse
import sys

# ---------- defaults for notebook use ---------------------------------
DEFAULT_SRC  = Path("output.md")
DEFAULT_DEST = Path("output_clean.md")
ANCHOR       = "## Overview"
# ----------------------------------------------------------------------

def trim_above_anchor(src: Path, dest: Path, anchor: str = ANCHOR) -> None:
    if not src.exists():
        raise FileNotFoundError(f"{src} not found – did the scraper run?")
    lines = src.read_text(encoding="utf-8").splitlines()
    try:
        start_idx = next(i for i, ln in enumerate(lines)
                         if ln.lstrip().startswith(anchor))
    except StopIteration:
        raise ValueError(f"Heading '{anchor}' not found in {src}")
    dest.write_text("\n".join(lines[start_idx:]), encoding="utf-8")
    print(f"✅  Wrote {dest} "
          f"({len(lines) - start_idx} lines kept, {start_idx} removed).")

# ---------- run --------------------------------------------------------
if __name__ == "__main__":
    parser = argparse.ArgumentParser(add_help=False)   # ignore -f etc.
    parser.add_argument("src",  nargs="?", default=DEFAULT_SRC)
    parser.add_argument("dest", nargs="?", default=DEFAULT_DEST)
    args, _ = parser.parse_known_args()                # eat unknown opts
    trim_above_anchor(Path(args.src), Path(args.dest), ANCHOR)
else:
    # running inside an imported cell (no CLI args) – do the default trim
    trim_above_anchor(DEFAULT_SRC, DEFAULT_DEST, ANCHOR)


ValueError: Heading '## Overview' not found in /Users/syedakash/Library/Jupyter/runtime/kernel-db93c9b4-4552-4dd5-98d1-dfed05b4298a.json