In [None]:
# Imports
from bs4 import BeautifulSoup
from requests import Session
from urllib.parse import urlparse

In [10]:
import io
import os
import posixpath
import re
import zlib

# Directly taken and modified from Rapptz/RoboDanny
# https://github.com/Rapptz/RoboDanny/blob/715a5cf8545b94d61823f62db484be4fac1c95b1/cogs/api.py
# This code is under the Mozilla Public License 2.0


class SphinxObjectFileReader:
    """Reader for a version 2 Sphinx ``objects.inv`` inventory held in memory.

    The file is a four-line plain-text header followed by a zlib-compressed
    body with one inventory entry per line.

    Inspired by Sphinx's InventoryFileReader.
    """

    # Number of compressed bytes handed to the decompressor per read.
    BUFSIZE = 16 * 1024

    def __init__(self, buffer):
        """Wrap the raw bytes of an inventory file in an in-memory stream."""
        self.stream = io.BytesIO(buffer)

    def readline(self):
        """Return the next raw line decoded as UTF-8 (newline included)."""
        return self.stream.readline().decode("utf-8")

    def skipline(self):
        """Consume the next raw line without decoding or returning it."""
        self.stream.readline()

    def read_compressed_chunks(self):
        """Yield decompressed byte chunks from the remainder of the stream."""
        decompressor = zlib.decompressobj()
        while True:
            chunk = self.stream.read(self.BUFSIZE)
            if len(chunk) == 0:
                break
            yield decompressor.decompress(chunk)
        # Emit whatever the decompressor is still holding back.
        yield decompressor.flush()

    def read_compressed_lines(self):
        """Yield every decompressed line as text, without its newline.

        A last line that is not newline-terminated is yielded too, so the
        final inventory entry is never silently dropped.
        """
        buf = b""
        for chunk in self.read_compressed_chunks():
            buf += chunk
            # Everything before the last "\n" is complete; the tail (possibly
            # empty) is carried over to be joined with the next chunk.
            *complete, buf = buf.split(b"\n")
            for raw_line in complete:
                yield raw_line.decode("utf-8")
        if buf:
            yield buf.decode("utf-8")

    def parse_object_inv(self, url):
        """Parse the inventory into a ``{name: absolute URL}`` dictionary.

        Keys are the entry's display name, or its plain name when the display
        name is ``-``. Entries of the ``std`` domain are prefixed with their
        role (``label:``, ``term:``, ...); ``std:doc`` entries are stored
        under ``label:`` as well.

        Args:
            url: Base URL that every entry location is joined onto.

        Returns:
            dict mapping each key to ``url`` joined with the entry location.

        Raises:
            RuntimeError: if the header is not a version 2 inventory or does
                not announce a zlib-compressed body.
        """
        result = {}

        # first line is version info
        inv_version = self.readline().rstrip()

        if inv_version != "# Sphinx inventory version 2":
            raise RuntimeError("Invalid objects.inv file version.")

        # next line is "# Project: <name>"
        # then after that is "# Version: <version>"
        # neither value is used, so both lines are simply skipped
        self.skipline()
        self.skipline()

        # next line says if it's a zlib header
        line = self.readline()
        if "zlib" not in line:
            raise RuntimeError("Invalid objects.inv file, not z-lib compatible.")

        # This code mostly comes from the Sphinx repository.
        entry_regex = re.compile(r"(?x)(.+?)\s+(\S*:\S*)\s+(-?\d+)\s+(\S+)\s+(.*)")
        for line in self.read_compressed_lines():
            match = entry_regex.match(line.rstrip())
            if not match:
                continue

            name, directive, _prio, location, dispname = match.groups()
            domain, _, subdirective = directive.partition(":")
            if directive == "py:module" and name in result:
                # From the Sphinx Repository:
                # due to a bug in 1.1 and below,
                # two inventory entries are created
                # for Python modules, and the first
                # one is correct
                continue

            # Most documentation pages have a label
            if directive == "std:doc":
                subdirective = "label"

            # A trailing "$" is shorthand for "append the entry name here".
            if location.endswith("$"):
                location = location[:-1] + name

            key = name if dispname == "-" else dispname
            prefix = f"{subdirective}:" if domain == "std" else ""

            # URLs always use "/", so join with posixpath rather than os.path,
            # which would insert backslashes on Windows.
            result[f"{prefix}{key}"] = posixpath.join(url, location)

        return result


In [11]:
# Root URL of the documentation site; "/objects.inv" is appended to it to
# fetch the inventory, and entry locations are joined onto it.
base_url = "https://docs.python.org/3"

In [12]:
# A single requests Session reused for the HTTP requests made below.
session = Session()

In [13]:
# Download the inventory file that sits next to the documentation root.
res = session.get(base_url + "/objects.inv")
# Fail here on an HTTP error status instead of handing an error page to the
# parser, which would only report a misleading "invalid file version".
res.raise_for_status()
sph = SphinxObjectFileReader(res.content)

In [14]:
# Parse the downloaded inventory into a dict of entry key -> URL under base_url.
data = sph.parse_object_inv(base_url)

In [15]:
def is_valid(key: str) -> bool:
    """Return True when an inventory key looks like a class-style name.

    Only the part after the last "." is inspected. It must be non-empty,
    start with an uppercase letter, not be all-uppercase once underscores
    are removed, and not start with "Py". Keys carrying the "label:" prefix
    are always rejected.
    """
    # Test the prefix on the full key: it sits at the very start, so checking
    # it after splitting on "." would miss labels whose text contains a dot.
    if key.startswith("label:"):
        return False

    name = key.split(".")[-1]
    return bool(
        name
        and name[0].isupper()
        and not name.replace("_", "").isupper()
        and not name.startswith("Py")
    )


# The filter is defined in the same cell that uses it, so this runs on a
# fresh kernel (Restart & Run All) without a NameError.
keys = [key for key in data if is_valid(key)]

In [122]:
import random

In [141]:
# Pick one of the filtered entries at random and split its URL into the page
# address and the fragment identifier.
# The two-name unpacking requires exactly one "#" in the URL; any other count
# makes this line raise ValueError.
fullurl = data[random.choice(keys)]
url, id_ = fullurl.split("#")

In [142]:
# Round-trip the page URL through urlparse, then turn any backslashes in it
# into forward slashes.
url = urlparse(url).geturl().replace("\\", "/")

In [143]:
# Fetch the documentation page that contains the chosen entry.
res2 = session.get(url)
# Stop on an HTTP error status rather than parsing an error page below.
res2.raise_for_status()

In [147]:
# Parse the fetched page's HTML using the "html.parser" backend.
soup = BeautifulSoup(res2.text, "html.parser")

'Raised when indentation contains an inconsistent use of tabs and spaces.\nThis is a subclass of'

In [148]:
# Look up the <dt> element whose id equals the URL fragment, then step up to
# its parent element.
# NOTE(review): find() presumably returns None when no element matches, in
# which case `.parent` would raise AttributeError — confirm.
base_element = soup.find("dt", {'id' : id_})
base_parent = base_element.parent

'TabError'

In [149]:
# Text content of the first <dd> found under that parent (presumably the
# entry's description — see the sample output below).
para = base_parent.find("dd").text

'Raised when indentation contains an inconsistent use of tabs and spaces.\nThis is a subclass of IndentationError.'

In [None]:
# print() renders the text's line breaks instead of showing the "\n" escapes
# that the bare string repr displays.
print(para)