Numerlor · Numerlor · Jul 17, 2020 · Aug 8, 2020 · Aug 8, 2020 · Aug 18, 2020
diff --git a/bot/__main__.py b/bot/__main__.py
@@ -51,9 +51,6 @@
 bot.load_extension("bot.cogs.information")
 bot.load_extension("bot.cogs.jams")
 bot.load_extension("bot.cogs.moderation")
-bot.load_extension("bot.cogs.python_news")
-bot.load_extension("bot.cogs.off_topic_names")
-bot.load_extension("bot.cogs.reddit")
 bot.load_extension("bot.cogs.reminders")
 bot.load_extension("bot.cogs.site")
 bot.load_extension("bot.cogs.snekbox")

diff --git a/bot/cogs/doc/cog.py b/bot/cogs/doc/cog.py
@@ -3,12 +3,13 @@
 import logging
 import re
 import sys
-from collections import OrderedDict
+from collections import defaultdict
 from contextlib import suppress
 from types import SimpleNamespace
-from typing import Dict, NamedTuple, Optional
+from typing import Dict, NamedTuple, Optional, List
 
 import discord
+from bs4 import BeautifulSoup
 from discord.ext import commands
 from requests import ConnectTimeout, ConnectionError, HTTPError
 from sphinx.ext import intersphinx
@@ -20,7 +21,6 @@
 from bot.decorators import with_role
 from bot.pagination import LinePaginator
 from bot.utils.messages import wait_for_deletion
-from .cache import async_cache
 from .parsing import get_symbol_markdown
 
 log = logging.getLogger(__name__)
@@ -55,15 +55,103 @@
 class DocItem(NamedTuple):
     """Holds inventory symbol information."""
 
-    base_url: str
-    relative_url: str
     package: str
     group: str
+    base_url: str
+    relative_url_path: str
+    symbol_id: str
 
     @property
     def url(self) -> str:
         """Return the absolute url to the symbol."""
-        return self.base_url + self.relative_url
+        return "".join((self.base_url, self.relative_url_path, "#", self.symbol_id))
+
+
+class QueueItem(NamedTuple):
+    """TODO"""
+
+    symbol: DocItem
+    soup: BeautifulSoup
+
+    def __eq__(self, other):
+        if isinstance(other, DocItem):
+            return self.symbol == other
+        return NamedTuple.__eq__(self, other)
+
+
+class TODO_PLACEHOLDER:
+    QUEUED = object()
+
+    def __init__(self):
+        self._queue: List[QueueItem] = list()
+        self._results = {}
+        self._urls = defaultdict(list)  # TODO
+        self._item_events: Dict[DocItem, asyncio.Event] = {}
+        self._parse_task = None
+
+    async def get_item(self, client_session, doc_item: DocItem):
+        """todo"""
+        if (symbol := self._results.get(doc_item)) is not None:
+            return symbol
+
+        page_url = doc_item.base_url + doc_item.relative_url_path
+        if (symbols_to_queue := self._urls[page_url]) is not self.QUEUED:
+            async with client_session.get(page_url) as response:
+                soup = BeautifulSoup(await response.text(encoding="utf8"), "lxml")
+
+            is_parsing = bool(self._queue)
+            self._queue.extend(QueueItem(symbol, soup) for symbol in symbols_to_queue)
+            self._urls[page_url] = self.QUEUED
+
+            if not is_parsing:
+                self._parse_task = asyncio.create_task(self._parse_queue())
+
+        self._move_to_front(doc_item)
+        self._item_events[doc_item] = item_event = asyncio.Event()
+        await item_event.wait()
+        return self._results[doc_item]
+
+    async def _parse_queue(self):
+        """Parse all item from `soup` and assign their result todo"""
+        log.trace(f"Starting queue parsing.")
+        while self._queue:
+            await asyncio.sleep(0.1)
+            item, soup = self._queue.pop()
+            self._results[item] = get_symbol_markdown(soup, item)
+            if (event := self._item_events.get(item)) is not None:
+                event.set()
+
+        self._parse_task = None
+        log.trace("Finished parsing queue.")
+
+    def _move_to_front(self, item):
+        """Move `item` to the front of the parse queue."""
+        # the parse queue stores soups along with the doc symbols,
+        # so we first get the queue item that contains both the symbol and soup, and then move it
+        item_index = self._queue.index(item)
+        queue_item = self._queue[item_index]
+
+        del self._queue[item_index]
+        self._queue.append(queue_item)
+
+    def add_item(self, doc_item: DocItem):
+        """TODO"""
+        self._urls[doc_item.base_url + doc_item.relative_url_path].append(doc_item)
+
+    async def clear(self):
+        """
+        Clear all internal symbol data.
+
+        All currently requested items are waited to be parsed before clearing.
+        """
+        for event in self._item_events.values():
+            await event.wait()
+        if self._parse_task is not None:
+            self._parse_task.cancel()
+        self._queue.clear()
+        self._results.clear()
+        self._urls.clear()
+        self._item_events.clear()
 
 
 class InventoryURL(commands.Converter):
@@ -106,8 +194,9 @@ def __init__(self, bot: Bot):
         self.bot = bot
         self.doc_symbols: Dict[str, DocItem] = {}
         self.renamed_symbols = set()
-
+        self.placeholder = TODO_PLACEHOLDER()
         self.bot.loop.create_task(self.init_refresh_inventory())
+        # TODO decide on objects/doc_symbols merging and what needs to be kept in doc_symbols when urls exist
 
     async def init_refresh_inventory(self) -> None:
         """Refresh documentation inventory on cog initialization."""
@@ -163,8 +252,11 @@ async def update_single(
                         # Split `package_name` because of packages like Pillow that have spaces in them.
                         symbol = f"{api_package_name}.{symbol}"
                         self.renamed_symbols.add(symbol)
-
-                self.doc_symbols[symbol] = DocItem(base_url, relative_doc_url, api_package_name, group_name)
+                # TODO remove comment above, clean up
+                relative_url_path, _, symbol_id = relative_doc_url.partition("#")
+                symbol_item = DocItem(api_package_name, group_name, base_url, relative_url_path, symbol_id)
+                self.doc_symbols[symbol] = symbol_item
+                self.placeholder.add_item(symbol_item)
 
         log.trace(f"Fetched inventory for {api_package_name}.")
 
@@ -178,7 +270,7 @@ async def refresh_inventory(self) -> None:
         self.base_urls.clear()
         self.doc_symbols.clear()
         self.renamed_symbols.clear()
-        async_cache.cache = OrderedDict()
+        await self.placeholder.clear()
 
         # Run all coroutines concurrently - since each of them performs a HTTP
         # request, this speeds up fetching the inventory data heavily.
@@ -198,13 +290,12 @@ async def get_symbol_embed(self, symbol: str) -> Optional[discord.Embed]:
         symbol_info = self.doc_symbols.get(symbol)
         if symbol_info is None:
             return None
-        self.bot.stats.incr(f"doc_fetches.{symbol_info.package.lower()}")
-        embed_description = await get_symbol_markdown(self.bot.http_session, symbol_info)
 
+        self.bot.stats.incr(f"doc_fetches.{symbol_info.package.lower()}")
         embed = discord.Embed(
             title=discord.utils.escape_markdown(symbol),
             url=symbol_info.url,
-            description=embed_description
+            description=await self.placeholder.get_item(self.bot.http_session, symbol_info)
         )
         # Show all symbols with the same name that were renamed in the footer.
         embed.set_footer(
@@ -217,6 +308,11 @@ async def docs_group(self, ctx: commands.Context, *, symbol: Optional[str]) -> N
         """Lookup documentation for Python symbols."""
         await ctx.invoke(self.get_command, symbol=symbol)
 
+    @commands.command()
+    async def command(self, ctx):
+        for symbol in ["arcade", "arcade"]:
+                               await self.get_command(ctx, symbol=symbol)
+
     @docs_group.command(name='getdoc', aliases=('g',))
     async def get_command(self, ctx: commands.Context, *, symbol: Optional[str]) -> None:
         """

diff --git a/bot/cogs/doc/markdown.py b/bot/cogs/doc/markdown.py
@@ -1,16 +1,26 @@
 from urllib.parse import urljoin
 
+from bs4 import BeautifulSoup
 from bs4.element import PageElement
 from markdownify import MarkdownConverter
 
-
+FRAGMENT_ID = '__MARKDOWNIFY_WRAPPER__'
+wrapped = '<div id="%s">%%s</div>' % FRAGMENT_ID
 class _DocMarkdownConverter(MarkdownConverter):
     """Subclass markdownify's MarkdownCoverter to provide custom conversion methods."""
 
     def __init__(self, *, page_url: str, **options):
         super().__init__(**options)
         self.page_url = page_url
 
+    def convert(self, html):
+        # We want to take advantage of the html5 parsing, but we don't actually
+        # want a full document. Therefore, we'll mark our fragment with an id,
+        # create the document, and extract the element with the id.
+        html = wrapped % html
+        soup = BeautifulSoup(html, 'lxml')
+        return self.process_tag(soup.find(id=FRAGMENT_ID), children_only=True)
+
     def convert_li(self, el: PageElement, text: str) -> str:
         """Fix markdownify's erroneous indexing in ol tags."""
         parent = el.parent

diff --git a/bot/cogs/doc/parsing.py b/bot/cogs/doc/parsing.py
@@ -89,9 +89,9 @@ def _get_general_description(start_element: PageElement) -> Optional[str]:
     """
     header = start_element.find_next("a", attrs={"class": "headerlink"})
     start_tag = header.parent if header is not None else start_element
-    description = "".join(
-        str(tag) for tag in _find_next_siblings_until_tag(start_tag, _match_end_tag, include_strings=True)
-    )
+    result_tags = _find_next_siblings_until_tag(start_tag, _match_end_tag, include_strings=True)
+    description = "".join(str(tag) for tag in result_tags)
+
 
     return description
 
@@ -100,7 +100,9 @@ def _get_dd_description(symbol: PageElement) -> str:
     """Get the string contents of the next dd tag, up to a dt or a dl tag."""
     description_tag = symbol.find_next("dd")
     description_contents = _find_next_children_until_tag(description_tag, ("dt", "dl"), include_strings=True)
-    return "".join(str(tag) for tag in description_contents)
+    description = "".join(str(tag) for tag in description_contents)
+
+    return description
 
 
 def _get_signatures(start_signature: PageElement) -> List[str]:
@@ -190,45 +192,38 @@ def _match_end_tag(tag: Tag) -> bool:
     return tag.name == "table"
 
 
-async def get_symbol_markdown(http_session: ClientSession, symbol_data: "DocItem") -> str:
+def get_symbol_markdown(soup: BeautifulSoup, symbol_data: "DocItem") -> str:
     """
     Return parsed markdown of the passed symbol, truncated to 1000 characters.
 
     A request through `http_session` is made to the url associated with `symbol_data` for the html contents;
     the contents are then parsed depending on what group the symbol belongs to.
     """
-    log.trace(f"Parsing symbol from url {symbol_data.url}.")
-    if "#" in symbol_data.url:
-        request_url, symbol_id = symbol_data.url.rsplit('#')
-    else:
-        request_url = symbol_data.url
-        symbol_id = None
-
-    soup = await _get_soup_from_url(http_session, request_url)
-    symbol_heading = soup.find(id=symbol_id)
+    # log.trace(f"Parsing symbol from url {symbol_data.url}.")
+    symbol_heading = soup.find(id=symbol_data.symbol_id)
     signature = None
     # Modules, doc pages and labels don't point to description list tags but to tags like divs,
     # no special parsing can be done so we only try to include what's under them.
     if symbol_data.group in {"module", "doc", "label"}:
-        log.trace("Symbol is a module, doc or a label; using general description parsing.")
+        # log.trace("Symbol is a module, doc or a label; using general description parsing.")
         description = _get_general_description(symbol_heading)
 
     elif symbol_heading.name != "dt":
         # Use the general parsing for symbols that aren't modules, docs or labels and aren't dt tags,
         # log info the tag can be looked at.
-        log.info(
-            f"Symbol heading at url {symbol_data.url} was not a dt tag or from known groups that lack it,"
-            f"handling as general description."
-        )
+        # log.info(
+        #     f"Symbol heading at url {symbol_data.url} was not a dt tag or from known groups that lack it,"
+        #     f"handling as general description."
+        # )
         description = _get_general_description(symbol_heading)
 
     elif symbol_data.group in _NO_SIGNATURE_GROUPS:
-        log.trace("Symbol's group is in the group signature blacklist, skipping parsing of signature.")
+        # log.trace("Symbol's group is in the group signature blacklist, skipping parsing of signature.")
         description = _get_dd_description(symbol_heading)
 
     else:
-        log.trace("Parsing both signature and description of symbol.")
-        signature = _get_signatures(symbol_heading)
+        # log.trace("Parsing both signature and description of symbol.")
         description = _get_dd_description(symbol_heading)
+        signature = _get_signatures(symbol_heading)
 
     return _parse_into_markdown(signature, description.replace('¶', ''), symbol_data.url)