Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions bot/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,6 @@
bot.load_extension("bot.cogs.information")
bot.load_extension("bot.cogs.jams")
bot.load_extension("bot.cogs.moderation")
bot.load_extension("bot.cogs.python_news")
bot.load_extension("bot.cogs.off_topic_names")
bot.load_extension("bot.cogs.reddit")
bot.load_extension("bot.cogs.reminders")
bot.load_extension("bot.cogs.site")
bot.load_extension("bot.cogs.snekbox")
Expand Down
122 changes: 109 additions & 13 deletions bot/cogs/doc/cog.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,13 @@
import logging
Copy link
Copy Markdown
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The functionality the changes here implement is that instead of caching beautifulsoup objects like in the current implementation python-discord#1014 (which can bubble up and potentially become a memory issue) we store the lookup ids of each symbol, and when one symbol on a page is requested all others on that page are parsed alongside it so we can throw out the BS object when that's done.

import re
import sys
from collections import OrderedDict
from collections import defaultdict
from contextlib import suppress
from types import SimpleNamespace
from typing import Dict, NamedTuple, Optional
from typing import Dict, NamedTuple, Optional, List

import discord
from bs4 import BeautifulSoup
from discord.ext import commands
from requests import ConnectTimeout, ConnectionError, HTTPError
from sphinx.ext import intersphinx
Expand All @@ -20,7 +21,6 @@
from bot.decorators import with_role
from bot.pagination import LinePaginator
from bot.utils.messages import wait_for_deletion
from .cache import async_cache
from .parsing import get_symbol_markdown

log = logging.getLogger(__name__)
Expand Down Expand Up @@ -55,15 +55,103 @@
class DocItem(NamedTuple):
"""Holds inventory symbol information."""

base_url: str
relative_url: str
package: str
group: str
base_url: str
relative_url_path: str
symbol_id: str

@property
def url(self) -> str:
"""Return the absolute url to the symbol."""
return self.base_url + self.relative_url
return "".join((self.base_url, self.relative_url_path, "#", self.symbol_id))


class QueueItem(NamedTuple):
"""TODO"""

symbol: DocItem
soup: BeautifulSoup

def __eq__(self, other):
if isinstance(other, DocItem):
return self.symbol == other
return NamedTuple.__eq__(self, other)


class TODO_PLACEHOLDER:
QUEUED = object()

def __init__(self):
self._queue: List[QueueItem] = list()
self._results = {}
self._urls = defaultdict(list) # TODO
self._item_events: Dict[DocItem, asyncio.Event] = {}
self._parse_task = None

async def get_item(self, client_session, doc_item: DocItem):
"""todo"""
if (symbol := self._results.get(doc_item)) is not None:
return symbol

page_url = doc_item.base_url + doc_item.relative_url_path
if (symbols_to_queue := self._urls[page_url]) is not self.QUEUED:
async with client_session.get(page_url) as response:
soup = BeautifulSoup(await response.text(encoding="utf8"), "lxml")

is_parsing = bool(self._queue)
self._queue.extend(QueueItem(symbol, soup) for symbol in symbols_to_queue)
self._urls[page_url] = self.QUEUED

if not is_parsing:
self._parse_task = asyncio.create_task(self._parse_queue())

self._move_to_front(doc_item)
self._item_events[doc_item] = item_event = asyncio.Event()
await item_event.wait()
Comment thread
Numerlor marked this conversation as resolved.
return self._results[doc_item]

async def _parse_queue(self):
"""Parse all item from `soup` and assign their result todo"""
log.trace(f"Starting queue parsing.")
while self._queue:
await asyncio.sleep(0.1)
Copy link
Copy Markdown
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need to give the event loop a bit of control so the bot can stay alive, 0 as the sleep period maxes out the core the process is running in. Locally 0.1 halved that utilization (with around a 3x increase in the total time to parse some larger pages) but I don't know how this would reflect in the production bot

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have no idea. Seems kind of hacky but I've seen this sort of thing done elsewhere before.

Copy link
Copy Markdown
Owner Author

@Numerlor Numerlor Aug 15, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Another option would be to run this in a thread executor, but then we'd need to introduce some locks to prevent potential race conditions from the queue item movement. Giving the event loop control in this way works fine since the individual parsing doesn't take that long but I agree that it feels a bit hacky and the sleep period may influence things differently on different machines

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is where my concurrency knowledge fails me. I thought the separate thread would still eat all the cycles. At what point would the context switch occur if one thread is giving no breathing room and is maxing out the core?

Copy link
Copy Markdown
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The main thread that's running asyncio is doing lots of IO so that's getting it a lot of opportunities for the executors to acquire the GIL; for a full utilization in the executors, the first thread should send a forced request after a certain amount of time to the thread that's holding the GIL which has to release it.
The behaviour you're describing only happens when the thread is doing something during which python can't handle the GIL like huge calculations where time is spent on a single instruction.
As an example

import threading


def cpu_and_memory_consumer():
    a = []
    while True:
        a.append(0)


def cpu_consumer():
    while True:
        pass


threads = [
    threading.Thread(target=cpu_consumer),
    threading.Thread(target=cpu_and_memory_consumer),
    #threading.Thread(target=lambda: 1234567890**1234567890),
]
for thread in threads:
    thread.start()
    print("started", thread)

Both of the functions take all that's available to them and shouldn't voluntarily release the GIL, but when you look at the memory usage it's clearly increasing even with the cpu_consumer started first (main thread and then the memory consumer both got the GIL to do their own thing), but then if we uncomment the computational thread everything will stop once it gets the GIL (No big memory usage from the appends and the started print never happens).

I'm of course not familiar with the devops of the bot and don't know if maxing out the available cpu is a good thing or not, in the threads without any sleep periods or through a 0 asyncio sleep

item, soup = self._queue.pop()
self._results[item] = get_symbol_markdown(soup, item)
if (event := self._item_events.get(item)) is not None:
event.set()

self._parse_task = None
log.trace("Finished parsing queue.")

def _move_to_front(self, item):
"""Move `item` to the front of the parse queue."""
# the parse queue stores soups along with the doc symbols,
# so we first get the queue item that contains both the symbol and soup, and then move it
item_index = self._queue.index(item)
queue_item = self._queue[item_index]

del self._queue[item_index]
self._queue.append(queue_item)

def add_item(self, doc_item: DocItem):
"""TODO"""
self._urls[doc_item.base_url + doc_item.relative_url_path].append(doc_item)

async def clear(self):
"""
Clear all internal symbol data.

All currently requested items are waited to be parsed before clearing.
"""
for event in self._item_events.values():
await event.wait()
if self._parse_task is not None:
self._parse_task.cancel()
self._queue.clear()
self._results.clear()
self._urls.clear()
self._item_events.clear()


class InventoryURL(commands.Converter):
Expand Down Expand Up @@ -106,8 +194,9 @@ def __init__(self, bot: Bot):
self.bot = bot
self.doc_symbols: Dict[str, DocItem] = {}
self.renamed_symbols = set()

self.placeholder = TODO_PLACEHOLDER()
self.bot.loop.create_task(self.init_refresh_inventory())
# TODO decide on objects/doc_symbols merging and what needs to be kept in doc_symbols when urls exist

async def init_refresh_inventory(self) -> None:
"""Refresh documentation inventory on cog initialization."""
Expand Down Expand Up @@ -163,8 +252,11 @@ async def update_single(
# Split `package_name` because of packages like Pillow that have spaces in them.
symbol = f"{api_package_name}.{symbol}"
self.renamed_symbols.add(symbol)

self.doc_symbols[symbol] = DocItem(base_url, relative_doc_url, api_package_name, group_name)
# TODO remove comment above, clean up
relative_url_path, _, symbol_id = relative_doc_url.partition("#")
symbol_item = DocItem(api_package_name, group_name, base_url, relative_url_path, symbol_id)
self.doc_symbols[symbol] = symbol_item
self.placeholder.add_item(symbol_item)

log.trace(f"Fetched inventory for {api_package_name}.")

Expand All @@ -178,7 +270,7 @@ async def refresh_inventory(self) -> None:
self.base_urls.clear()
self.doc_symbols.clear()
self.renamed_symbols.clear()
async_cache.cache = OrderedDict()
await self.placeholder.clear()

# Run all coroutines concurrently - since each of them performs a HTTP
# request, this speeds up fetching the inventory data heavily.
Expand All @@ -198,13 +290,12 @@ async def get_symbol_embed(self, symbol: str) -> Optional[discord.Embed]:
symbol_info = self.doc_symbols.get(symbol)
if symbol_info is None:
return None
self.bot.stats.incr(f"doc_fetches.{symbol_info.package.lower()}")
embed_description = await get_symbol_markdown(self.bot.http_session, symbol_info)

self.bot.stats.incr(f"doc_fetches.{symbol_info.package.lower()}")
embed = discord.Embed(
title=discord.utils.escape_markdown(symbol),
url=symbol_info.url,
description=embed_description
description=await self.placeholder.get_item(self.bot.http_session, symbol_info)
)
# Show all symbols with the same name that were renamed in the footer.
embed.set_footer(
Expand All @@ -217,6 +308,11 @@ async def docs_group(self, ctx: commands.Context, *, symbol: Optional[str]) -> N
"""Lookup documentation for Python symbols."""
await ctx.invoke(self.get_command, symbol=symbol)

@commands.command()
async def command(self, ctx):
for symbol in ["arcade", "arcade"]:
await self.get_command(ctx, symbol=symbol)

@docs_group.command(name='getdoc', aliases=('g',))
async def get_command(self, ctx: commands.Context, *, symbol: Optional[str]) -> None:
"""
Expand Down
12 changes: 11 additions & 1 deletion bot/cogs/doc/markdown.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,26 @@
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from bs4.element import PageElement
from markdownify import MarkdownConverter


FRAGMENT_ID = '__MARKDOWNIFY_WRAPPER__'
wrapped = '<div id="%s">%%s</div>' % FRAGMENT_ID
class _DocMarkdownConverter(MarkdownConverter):
"""Subclass markdownify's MarkdownCoverter to provide custom conversion methods."""

def __init__(self, *, page_url: str, **options):
super().__init__(**options)
self.page_url = page_url

def convert(self, html):
# We want to take advantage of the html5 parsing, but we don't actually
# want a full document. Therefore, we'll mark our fragment with an id,
# create the document, and extract the element with the id.
html = wrapped % html
soup = BeautifulSoup(html, 'lxml')
return self.process_tag(soup.find(id=FRAGMENT_ID), children_only=True)

def convert_li(self, el: PageElement, text: str) -> str:
"""Fix markdownify's erroneous indexing in ol tags."""
parent = el.parent
Expand Down
39 changes: 17 additions & 22 deletions bot/cogs/doc/parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,9 +89,9 @@ def _get_general_description(start_element: PageElement) -> Optional[str]:
"""
header = start_element.find_next("a", attrs={"class": "headerlink"})
start_tag = header.parent if header is not None else start_element
description = "".join(
str(tag) for tag in _find_next_siblings_until_tag(start_tag, _match_end_tag, include_strings=True)
)
result_tags = _find_next_siblings_until_tag(start_tag, _match_end_tag, include_strings=True)
description = "".join(str(tag) for tag in result_tags)


return description

Expand All @@ -100,7 +100,9 @@ def _get_dd_description(symbol: PageElement) -> str:
"""Get the string contents of the next dd tag, up to a dt or a dl tag."""
description_tag = symbol.find_next("dd")
description_contents = _find_next_children_until_tag(description_tag, ("dt", "dl"), include_strings=True)
return "".join(str(tag) for tag in description_contents)
description = "".join(str(tag) for tag in description_contents)

return description


def _get_signatures(start_signature: PageElement) -> List[str]:
Expand Down Expand Up @@ -190,45 +192,38 @@ def _match_end_tag(tag: Tag) -> bool:
return tag.name == "table"


async def get_symbol_markdown(http_session: ClientSession, symbol_data: "DocItem") -> str:
def get_symbol_markdown(soup: BeautifulSoup, symbol_data: "DocItem") -> str:
"""
Return parsed markdown of the passed symbol, truncated to 1000 characters.

A request through `http_session` is made to the url associated with `symbol_data` for the html contents;
the contents are then parsed depending on what group the symbol belongs to.
"""
log.trace(f"Parsing symbol from url {symbol_data.url}.")
if "#" in symbol_data.url:
request_url, symbol_id = symbol_data.url.rsplit('#')
else:
request_url = symbol_data.url
symbol_id = None

soup = await _get_soup_from_url(http_session, request_url)
symbol_heading = soup.find(id=symbol_id)
# log.trace(f"Parsing symbol from url {symbol_data.url}.")
symbol_heading = soup.find(id=symbol_data.symbol_id)
signature = None
# Modules, doc pages and labels don't point to description list tags but to tags like divs,
# no special parsing can be done so we only try to include what's under them.
if symbol_data.group in {"module", "doc", "label"}:
log.trace("Symbol is a module, doc or a label; using general description parsing.")
# log.trace("Symbol is a module, doc or a label; using general description parsing.")
description = _get_general_description(symbol_heading)

elif symbol_heading.name != "dt":
# Use the general parsing for symbols that aren't modules, docs or labels and aren't dt tags,
# log info the tag can be looked at.
log.info(
f"Symbol heading at url {symbol_data.url} was not a dt tag or from known groups that lack it,"
f"handling as general description."
)
# log.info(
# f"Symbol heading at url {symbol_data.url} was not a dt tag or from known groups that lack it,"
# f"handling as general description."
# )
description = _get_general_description(symbol_heading)

elif symbol_data.group in _NO_SIGNATURE_GROUPS:
log.trace("Symbol's group is in the group signature blacklist, skipping parsing of signature.")
# log.trace("Symbol's group is in the group signature blacklist, skipping parsing of signature.")
description = _get_dd_description(symbol_heading)

else:
log.trace("Parsing both signature and description of symbol.")
signature = _get_signatures(symbol_heading)
# log.trace("Parsing both signature and description of symbol.")
description = _get_dd_description(symbol_heading)
signature = _get_signatures(symbol_heading)

return _parse_into_markdown(signature, description.replace('¶', ''), symbol_data.url)