# Example Generator
Parses the scikit-learn examples gallery and downloads the Jupyter notebook for each example.

In [90]:
import os
import sys
from urllib.parse import urljoin

from bs4 import BeautifulSoup
import requests


# Index page of the scikit-learn examples gallery; used as the default `url` of generate_notebooks().
base_url = 'https://scikit-learn.org/stable/auto_examples/index.html'

def error(message):
    """
    Print a message on the standard error stream.
    """
    print(message, file=sys.stderr)

def fetch(url, timeout = 5) -> str:
    """
    Download a page with HTTP GET and return its body as text.

    Parameters:
        url: address to request.
        timeout: value passed to requests.get as its timeout.

    Returns: the response body as text.

    Raises:
        ValueError: if the response status code is anything other than 200.
        Exceptions raised by requests.get itself are not caught here.
    """
    response = requests.get(url, timeout = timeout)
    if response.status_code != 200:
        # Report the URL that was actually requested rather than the module-level index URL.
        message = f'error fetching {url}: {response.status_code} ({response.reason})'
        raise ValueError(message)
    return response.text

def save_contents(contents: str, path: str):
    """
    Write text to a file, creating the parent directories when needed.

    Parameters:
        contents: text to write.
        path: destination file; an existing file is overwritten.
    """
    directory = os.path.dirname(path)
    # A bare file name has no directory part, and os.makedirs('') raises FileNotFoundError.
    if directory:
        os.makedirs(directory, exist_ok = True)
    # Write UTF-8 explicitly so the output does not depend on the platform's default encoding.
    with open(path, 'w', encoding = 'utf-8') as file:
        file.write(contents)

def parse_examples_page(url, verbosity = 0) -> list:
    """
    Parse the examples page HTML to extract sections with examples.

    Parameters:
        url: address of the examples index page; example links are resolved against it.
        verbosity: when greater than 0, print each section and example title as it is found.

    Returns: list of section dicts. Each section has a 'title', an 'id', and a list of
    'examples'; each example is a dict with a 'title' and a 'url'.

    Raises:
        ValueError: if the page cannot be fetched or has no <section id="examples"> element.
    """
    html = fetch(url)
    # Name the parser explicitly: without it BeautifulSoup picks whichever parser happens
    # to be installed (and warns), so results could differ between machines.
    element = BeautifulSoup(html, 'html.parser')

    examples_section = element.find('section', id='examples')
    if examples_section is None:
        raise ValueError("No examples section found on the page.")

    sections = []
    for section in examples_section.find_all('section', id=True):
        section_id = section['id']
        heading = section.find('h2')
        if heading is None:
            # Fall back to the id instead of failing on a section without an <h2>.
            section_title = section_id
        else:
            # Drop any '#' characters (presumably the heading's permalink marker).
            section_title = heading.text.replace('#', '').strip()
        if verbosity > 0:
            print(f"Section: '{section_title}'")
        examples = []
        # href=True skips anchors that have no target and so cannot be resolved to a URL.
        example_links = section.find_all('a', class_ = 'reference', href = True)
        for link in example_links:
            example_url = urljoin(url, link['href'])
            example_title = link.text.strip()
            if verbosity > 0:
                print(f'\tExample: {example_title}')
            examples.append({'title': example_title, 'url': example_url})
        sections.append({
            'title': section_title,
            'id': section_id,
            'examples': examples
        })
    return sections

def download_notebook_for_example(example_url, notebook_path, verbosity = 0):
    """
    Download the Jupyter notebook linked from an example page and save it to disk.

    Parameters:
        example_url: address of the example page to inspect.
        notebook_path: file path the notebook is written to.
        verbosity: greater than 0 reports skipped pages and saved notebooks;
            greater than 1 also reports each download before it starts.

    Returns: None. A page without a notebook link is skipped and nothing is saved.

    Raises:
        ValueError: if the page or the notebook cannot be fetched, or the page has more
        than one notebook download link.
    """
    html = fetch(example_url)
    # Name the parser explicitly so parsing does not depend on which parsers are installed.
    element = BeautifulSoup(html, 'html.parser')

    # Leverage the fact that each example page with source code contains a link to download a notebook.
    # href=True ignores anchors without a target, which could not be downloaded anyway.
    notebook_links = [
        link for link in element.find_all('a', href = True)
        if link.get_text().strip().lower() == 'download jupyter notebook'
    ]
    if not notebook_links:
        # Some documentation pages are merely for listing examples and do not contain a notebook.
        # So it is typically safe to skip over pages that lack notebooks.
        if verbosity > 0:
            print(f'Missing Jupyter Labs notebook for {example_url}')
        return
    if len(notebook_links) > 1:
        raise ValueError(f'Found multiple Jupyter Labs notebooks for {example_url}')

    # The href may be relative, so resolve it against the page it was found on.
    url = urljoin(example_url, notebook_links[0]['href'])
    if verbosity > 1:
        # Show the resolved URL rather than the raw <a> element.
        print(f'Downloading notebook {url}...')
    notebook_contents = fetch(url)

    # Save Results
    save_contents(notebook_contents, notebook_path)
    if verbosity > 0:
        print(f'Saved notebook {notebook_path}')

def _safe_path_component(name):
    """
    Return `name` with path separators replaced by '_' so it stays a single path component.
    """
    for separator in ('/', '\\'):
        name = name.replace(separator, '_')
    return name

def generate_notebooks(url = base_url, verbosity = 0):
    """
    Downloads all notebooks for the scikit-learn examples page.

    Notebooks are saved as 'examples/<section title>/<example title>.ipynb'. Path
    separators inside a title are replaced with '_' so that each title maps to exactly
    one directory or file name.

    Parameters:
        url: address of the examples index page.
        verbosity: passed to download_notebook_for_example as is, and to
            parse_examples_page reduced by one.

    A failure on one example is reported on standard error and does not stop the
    remaining downloads.
    """
    sections = parse_examples_page(url, verbosity - 1)
    for section in sections:
        section_name = _safe_path_component(section['title'])
        for example in section['examples']:
            example_name, example_url = example['title'], example['url']
            # A title such as 'A / B' would otherwise create an unintended sub-directory.
            example_path = f'examples/{section_name}/{_safe_path_component(example_name)}.ipynb'
            try:
                download_notebook_for_example(example_url, example_path, verbosity = verbosity)
            except Exception as e:
                # Deliberately best-effort: log the failure and carry on with the next example.
                error(f'Error creating notebook for {example_url}: {e}')


In [None]:
# Download the notebooks for every example, printing which pages were skipped and which notebooks were saved.
generate_notebooks(verbosity = 1)

In [None]:
# debug: fetch html examples for inspection
# Set `debug` to True to dump the index page and the first example page to local files
# and to download that first example's notebook.
debug = False
if debug:
    html = fetch(base_url)
    # Write UTF-8 explicitly so non-ASCII page content does not depend on the platform's default encoding.
    with open('examples.html', 'w', encoding = 'utf-8') as file:
        file.write(html)
    examples = parse_examples_page(base_url)
    first_example = examples[0]['examples'][0]
    example_url = first_example['url']
    html = fetch(example_url)
    with open('example.html', 'w', encoding = 'utf-8') as file:
        file.write(html)
    download_notebook_for_example(example_url, 'example.ipynb')