diff --git a/bot/__main__.py b/bot/__main__.py index aa1d1aee87..74cfb2a95f 100644 --- a/bot/__main__.py +++ b/bot/__main__.py @@ -51,9 +51,6 @@ bot.load_extension("bot.cogs.information") bot.load_extension("bot.cogs.jams") bot.load_extension("bot.cogs.moderation") -bot.load_extension("bot.cogs.python_news") -bot.load_extension("bot.cogs.off_topic_names") -bot.load_extension("bot.cogs.reddit") bot.load_extension("bot.cogs.reminders") bot.load_extension("bot.cogs.site") bot.load_extension("bot.cogs.snekbox") diff --git a/bot/cogs/doc/cog.py b/bot/cogs/doc/cog.py index e52ee95c13..79e2f97bc8 100644 --- a/bot/cogs/doc/cog.py +++ b/bot/cogs/doc/cog.py @@ -3,12 +3,13 @@ import logging import re import sys -from collections import OrderedDict +from collections import defaultdict from contextlib import suppress from types import SimpleNamespace -from typing import Dict, NamedTuple, Optional +from typing import Dict, NamedTuple, Optional, List import discord +from bs4 import BeautifulSoup from discord.ext import commands from requests import ConnectTimeout, ConnectionError, HTTPError from sphinx.ext import intersphinx @@ -20,7 +21,6 @@ from bot.decorators import with_role from bot.pagination import LinePaginator from bot.utils.messages import wait_for_deletion -from .cache import async_cache from .parsing import get_symbol_markdown log = logging.getLogger(__name__) @@ -55,15 +55,103 @@ class DocItem(NamedTuple): """Holds inventory symbol information.""" - base_url: str - relative_url: str package: str group: str + base_url: str + relative_url_path: str + symbol_id: str @property def url(self) -> str: """Return the absolute url to the symbol.""" - return self.base_url + self.relative_url + return "".join((self.base_url, self.relative_url_path, "#", self.symbol_id)) + + +class QueueItem(NamedTuple): + """TODO""" + + symbol: DocItem + soup: BeautifulSoup + + def __eq__(self, other): + if isinstance(other, DocItem): + return self.symbol == other + return NamedTuple.__eq__(self, other) + + +class TODO_PLACEHOLDER: + QUEUED = object() + + def __init__(self): + self._queue: List[QueueItem] = list() + self._results = {} + self._urls = defaultdict(list) # TODO + self._item_events: Dict[DocItem, asyncio.Event] = {} + self._parse_task = None + + async def get_item(self, client_session, doc_item: DocItem): + """todo""" + if (symbol := self._results.get(doc_item)) is not None: + return symbol + + page_url = doc_item.base_url + doc_item.relative_url_path + if (symbols_to_queue := self._urls[page_url]) is not self.QUEUED: + async with client_session.get(page_url) as response: + soup = BeautifulSoup(await response.text(encoding="utf8"), "lxml") + + is_parsing = bool(self._queue) + self._queue.extend(QueueItem(symbol, soup) for symbol in symbols_to_queue) + self._urls[page_url] = self.QUEUED + + if not is_parsing: + self._parse_task = asyncio.create_task(self._parse_queue()) + + self._move_to_front(doc_item) + self._item_events[doc_item] = item_event = asyncio.Event() + await item_event.wait() + return self._results[doc_item] + + async def _parse_queue(self): + """Parse all item from `soup` and assign their result todo""" + log.trace(f"Starting queue parsing.") + while self._queue: + await asyncio.sleep(0.1) + item, soup = self._queue.pop() + self._results[item] = get_symbol_markdown(soup, item) + if (event := self._item_events.get(item)) is not None: + event.set() + + self._parse_task = None + log.trace("Finished parsing queue.") + + def _move_to_front(self, item): + """Move `item` to the front of the parse queue.""" + # the parse queue stores soups along with the doc symbols, + # so we first get the queue item that contains both the symbol and soup, and then move it + item_index = self._queue.index(item) + queue_item = self._queue[item_index] + + del self._queue[item_index] + self._queue.append(queue_item) + + def add_item(self, doc_item: DocItem): + """TODO""" + self._urls[doc_item.base_url + doc_item.relative_url_path].append(doc_item) + + async def clear(self): + """ + Clear all internal symbol data. + + All currently requested items are waited to be parsed before clearing. + """ + for event in self._item_events.values(): + await event.wait() + if self._parse_task is not None: + self._parse_task.cancel() + self._queue.clear() + self._results.clear() + self._urls.clear() + self._item_events.clear() class InventoryURL(commands.Converter): @@ -106,8 +194,9 @@ def __init__(self, bot: Bot): self.bot = bot self.doc_symbols: Dict[str, DocItem] = {} self.renamed_symbols = set() - + self.placeholder = TODO_PLACEHOLDER() self.bot.loop.create_task(self.init_refresh_inventory()) + # TODO decide on objects/doc_symbols merging and what needs to be kept in doc_symbols when urls exist async def init_refresh_inventory(self) -> None: """Refresh documentation inventory on cog initialization.""" @@ -163,8 +252,11 @@ async def update_single( # Split `package_name` because of packages like Pillow that have spaces in them. symbol = f"{api_package_name}.{symbol}" self.renamed_symbols.add(symbol) - - self.doc_symbols[symbol] = DocItem(base_url, relative_doc_url, api_package_name, group_name) + # TODO remove comment above, clean up + relative_url_path, _, symbol_id = relative_doc_url.partition("#") + symbol_item = DocItem(api_package_name, group_name, base_url, relative_url_path, symbol_id) + self.doc_symbols[symbol] = symbol_item + self.placeholder.add_item(symbol_item) log.trace(f"Fetched inventory for {api_package_name}.") @@ -178,7 +270,7 @@ async def refresh_inventory(self) -> None: self.base_urls.clear() self.doc_symbols.clear() self.renamed_symbols.clear() - async_cache.cache = OrderedDict() + await self.placeholder.clear() # Run all coroutines concurrently - since each of them performs a HTTP # request, this speeds up fetching the inventory data heavily. @@ -198,13 +290,12 @@ async def get_symbol_embed(self, symbol: str) -> Optional[discord.Embed]: symbol_info = self.doc_symbols.get(symbol) if symbol_info is None: return None - self.bot.stats.incr(f"doc_fetches.{symbol_info.package.lower()}") - embed_description = await get_symbol_markdown(self.bot.http_session, symbol_info) + self.bot.stats.incr(f"doc_fetches.{symbol_info.package.lower()}") embed = discord.Embed( title=discord.utils.escape_markdown(symbol), url=symbol_info.url, - description=embed_description + description=await self.placeholder.get_item(self.bot.http_session, symbol_info) ) # Show all symbols with the same name that were renamed in the footer. embed.set_footer( @@ -217,6 +308,11 @@ async def docs_group(self, ctx: commands.Context, *, symbol: Optional[str]) -> N """Lookup documentation for Python symbols.""" await ctx.invoke(self.get_command, symbol=symbol) + @commands.command() + async def command(self, ctx): + for symbol in ["arcade", "arcade"]: + await self.get_command(ctx, symbol=symbol) + @docs_group.command(name='getdoc', aliases=('g',)) async def get_command(self, ctx: commands.Context, *, symbol: Optional[str]) -> None: """ diff --git a/bot/cogs/doc/markdown.py b/bot/cogs/doc/markdown.py index dca477d351..e365df2fdc 100644 --- a/bot/cogs/doc/markdown.py +++ b/bot/cogs/doc/markdown.py @@ -1,9 +1,11 @@ from urllib.parse import urljoin +from bs4 import BeautifulSoup from bs4.element import PageElement from markdownify import MarkdownConverter - +FRAGMENT_ID = '__MARKDOWNIFY_WRAPPER__' +wrapped = '
%%s
' % FRAGMENT_ID class _DocMarkdownConverter(MarkdownConverter): """Subclass markdownify's MarkdownCoverter to provide custom conversion methods.""" @@ -11,6 +13,14 @@ def __init__(self, *, page_url: str, **options): super().__init__(**options) self.page_url = page_url + def convert(self, html): + # We want to take advantage of the html5 parsing, but we don't actually + # want a full document. Therefore, we'll mark our fragment with an id, + # create the document, and extract the element with the id. + html = wrapped % html + soup = BeautifulSoup(html, 'lxml') + return self.process_tag(soup.find(id=FRAGMENT_ID), children_only=True) + def convert_li(self, el: PageElement, text: str) -> str: """Fix markdownify's erroneous indexing in ol tags.""" parent = el.parent diff --git a/bot/cogs/doc/parsing.py b/bot/cogs/doc/parsing.py index 1271953d47..d552dfaba7 100644 --- a/bot/cogs/doc/parsing.py +++ b/bot/cogs/doc/parsing.py @@ -89,9 +89,9 @@ def _get_general_description(start_element: PageElement) -> Optional[str]: """ header = start_element.find_next("a", attrs={"class": "headerlink"}) start_tag = header.parent if header is not None else start_element - description = "".join( - str(tag) for tag in _find_next_siblings_until_tag(start_tag, _match_end_tag, include_strings=True) - ) + result_tags = _find_next_siblings_until_tag(start_tag, _match_end_tag, include_strings=True) + description = "".join(str(tag) for tag in result_tags) + return description @@ -100,7 +100,9 @@ def _get_dd_description(symbol: PageElement) -> str: """Get the string contents of the next dd tag, up to a dt or a dl tag.""" description_tag = symbol.find_next("dd") description_contents = _find_next_children_until_tag(description_tag, ("dt", "dl"), include_strings=True) - return "".join(str(tag) for tag in description_contents) + description = "".join(str(tag) for tag in description_contents) + + return description def _get_signatures(start_signature: PageElement) -> List[str]: @@ -190,45 +192,38 @@ def _match_end_tag(tag: Tag) -> bool: return tag.name == "table" -async def get_symbol_markdown(http_session: ClientSession, symbol_data: "DocItem") -> str: +def get_symbol_markdown(soup: BeautifulSoup, symbol_data: "DocItem") -> str: """ Return parsed markdown of the passed symbol, truncated to 1000 characters. A request through `http_session` is made to the url associated with `symbol_data` for the html contents; the contents are then parsed depending on what group the symbol belongs to. """ - log.trace(f"Parsing symbol from url {symbol_data.url}.") - if "#" in symbol_data.url: - request_url, symbol_id = symbol_data.url.rsplit('#') - else: - request_url = symbol_data.url - symbol_id = None - - soup = await _get_soup_from_url(http_session, request_url) - symbol_heading = soup.find(id=symbol_id) + # log.trace(f"Parsing symbol from url {symbol_data.url}.") + symbol_heading = soup.find(id=symbol_data.symbol_id) signature = None # Modules, doc pages and labels don't point to description list tags but to tags like divs, # no special parsing can be done so we only try to include what's under them. if symbol_data.group in {"module", "doc", "label"}: - log.trace("Symbol is a module, doc or a label; using general description parsing.") + # log.trace("Symbol is a module, doc or a label; using general description parsing.") description = _get_general_description(symbol_heading) elif symbol_heading.name != "dt": # Use the general parsing for symbols that aren't modules, docs or labels and aren't dt tags, # log info the tag can be looked at. - log.info( - f"Symbol heading at url {symbol_data.url} was not a dt tag or from known groups that lack it," - f"handling as general description." - ) + # log.info( + # f"Symbol heading at url {symbol_data.url} was not a dt tag or from known groups that lack it," + # f"handling as general description." + # ) description = _get_general_description(symbol_heading) elif symbol_data.group in _NO_SIGNATURE_GROUPS: - log.trace("Symbol's group is in the group signature blacklist, skipping parsing of signature.") + # log.trace("Symbol's group is in the group signature blacklist, skipping parsing of signature.") description = _get_dd_description(symbol_heading) else: - log.trace("Parsing both signature and description of symbol.") - signature = _get_signatures(symbol_heading) + # log.trace("Parsing both signature and description of symbol.") description = _get_dd_description(symbol_heading) + signature = _get_signatures(symbol_heading) return _parse_into_markdown(signature, description.replace('ΒΆ', ''), symbol_data.url)