-
Notifications
You must be signed in to change notification settings - Fork 0
Doc imp cop #1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Doc imp cop #1
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,12 +3,13 @@ | |
| import logging | ||
| import re | ||
| import sys | ||
| from collections import OrderedDict | ||
| from collections import defaultdict | ||
| from contextlib import suppress | ||
| from types import SimpleNamespace | ||
| from typing import Dict, NamedTuple, Optional | ||
| from typing import Dict, NamedTuple, Optional, List | ||
|
|
||
| import discord | ||
| from bs4 import BeautifulSoup | ||
| from discord.ext import commands | ||
| from requests import ConnectTimeout, ConnectionError, HTTPError | ||
| from sphinx.ext import intersphinx | ||
|
|
@@ -20,7 +21,6 @@ | |
| from bot.decorators import with_role | ||
| from bot.pagination import LinePaginator | ||
| from bot.utils.messages import wait_for_deletion | ||
| from .cache import async_cache | ||
| from .parsing import get_symbol_markdown | ||
|
|
||
| log = logging.getLogger(__name__) | ||
|
|
@@ -55,15 +55,103 @@ | |
| class DocItem(NamedTuple): | ||
| """Holds inventory symbol information.""" | ||
|
|
||
| base_url: str | ||
| relative_url: str | ||
| package: str | ||
| group: str | ||
| base_url: str | ||
| relative_url_path: str | ||
| symbol_id: str | ||
|
|
||
| @property | ||
| def url(self) -> str: | ||
| """Return the absolute url to the symbol.""" | ||
| return self.base_url + self.relative_url | ||
| return "".join((self.base_url, self.relative_url_path, "#", self.symbol_id)) | ||
|
|
||
|
|
||
| class QueueItem(NamedTuple): | ||
| """TODO""" | ||
|
|
||
| symbol: DocItem | ||
| soup: BeautifulSoup | ||
|
|
||
| def __eq__(self, other): | ||
| if isinstance(other, DocItem): | ||
| return self.symbol == other | ||
| return NamedTuple.__eq__(self, other) | ||
|
|
||
|
|
||
| class TODO_PLACEHOLDER: | ||
| QUEUED = object() | ||
|
|
||
| def __init__(self): | ||
| self._queue: List[QueueItem] = list() | ||
| self._results = {} | ||
| self._urls = defaultdict(list) # TODO | ||
| self._item_events: Dict[DocItem, asyncio.Event] = {} | ||
| self._parse_task = None | ||
|
|
||
| async def get_item(self, client_session, doc_item: DocItem): | ||
| """todo""" | ||
| if (symbol := self._results.get(doc_item)) is not None: | ||
| return symbol | ||
|
|
||
| page_url = doc_item.base_url + doc_item.relative_url_path | ||
| if (symbols_to_queue := self._urls[page_url]) is not self.QUEUED: | ||
| async with client_session.get(page_url) as response: | ||
| soup = BeautifulSoup(await response.text(encoding="utf8"), "lxml") | ||
|
|
||
| is_parsing = bool(self._queue) | ||
| self._queue.extend(QueueItem(symbol, soup) for symbol in symbols_to_queue) | ||
| self._urls[page_url] = self.QUEUED | ||
|
|
||
| if not is_parsing: | ||
| self._parse_task = asyncio.create_task(self._parse_queue()) | ||
|
|
||
| self._move_to_front(doc_item) | ||
| self._item_events[doc_item] = item_event = asyncio.Event() | ||
| await item_event.wait() | ||
|
Numerlor marked this conversation as resolved.
|
||
| return self._results[doc_item] | ||
|
|
||
| async def _parse_queue(self): | ||
| """Parse all item from `soup` and assign their result todo""" | ||
| log.trace(f"Starting queue parsing.") | ||
| while self._queue: | ||
| await asyncio.sleep(0.1) | ||
|
Owner
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We need to give the event loop a bit of control so the bot can stay alive, […]
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I have no idea. It seems kind of hacky, but I've seen this sort of thing done elsewhere before.
Owner
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Another option would be to run this in a thread executor, but then we'd need to introduce some locks to prevent potential race conditions from the queue item movement. Giving the event loop control in this way works fine, since the individual parsing doesn't take that long, but I agree that it feels a bit hacky, and the sleep period may affect things differently on different machines.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is where my concurrency knowledge fails me. I thought the separate thread would still eat all the cycles. At what point would the context switch occur if one thread is giving no breathing room and is maxing out the core?
Owner
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The main thread that's running asyncio is doing lots of IO, which gives the executors plenty of opportunities to acquire the GIL. Even when the executors are fully utilizing the CPU, a thread waiting for the GIL sends a forced request, after a certain amount of time, to the thread that's holding the GIL, which then has to release it.

```python
import threading

def cpu_and_memory_consumer():
    a = []
    while True:
        a.append(0)

def cpu_consumer():
    while True:
        pass

threads = [
    threading.Thread(target=cpu_consumer),
    threading.Thread(target=cpu_and_memory_consumer),
    #threading.Thread(target=lambda: 1234567890**1234567890),
]
for thread in threads:
    thread.start()
    print("started", thread)
```

Both of the functions take all that's available to them and shouldn't voluntarily release the GIL, but when you look at the memory usage it's clearly increasing even with the […]. I'm of course not familiar with the devops of the bot and don't know whether maxing out the available CPU is a good thing or not, in the threads without any sleep periods or through a […] |
||
| item, soup = self._queue.pop() | ||
| self._results[item] = get_symbol_markdown(soup, item) | ||
| if (event := self._item_events.get(item)) is not None: | ||
| event.set() | ||
|
|
||
| self._parse_task = None | ||
| log.trace("Finished parsing queue.") | ||
|
|
||
| def _move_to_front(self, item): | ||
| """Move `item` to the front of the parse queue.""" | ||
| # the parse queue stores soups along with the doc symbols, | ||
| # so we first get the queue item that contains both the symbol and soup, and then move it | ||
| item_index = self._queue.index(item) | ||
| queue_item = self._queue[item_index] | ||
|
|
||
| del self._queue[item_index] | ||
| self._queue.append(queue_item) | ||
|
|
||
| def add_item(self, doc_item: DocItem): | ||
| """TODO""" | ||
| self._urls[doc_item.base_url + doc_item.relative_url_path].append(doc_item) | ||
|
|
||
| async def clear(self): | ||
| """ | ||
| Clear all internal symbol data. | ||
|
|
||
| All currently requested items are waited to be parsed before clearing. | ||
| """ | ||
| for event in self._item_events.values(): | ||
| await event.wait() | ||
| if self._parse_task is not None: | ||
| self._parse_task.cancel() | ||
| self._queue.clear() | ||
| self._results.clear() | ||
| self._urls.clear() | ||
| self._item_events.clear() | ||
|
|
||
|
|
||
| class InventoryURL(commands.Converter): | ||
|
|
@@ -106,8 +194,9 @@ def __init__(self, bot: Bot): | |
| self.bot = bot | ||
| self.doc_symbols: Dict[str, DocItem] = {} | ||
| self.renamed_symbols = set() | ||
|
|
||
| self.placeholder = TODO_PLACEHOLDER() | ||
| self.bot.loop.create_task(self.init_refresh_inventory()) | ||
| # TODO decide on objects/doc_symbols merging and what needs to be kept in doc_symbols when urls exist | ||
|
|
||
| async def init_refresh_inventory(self) -> None: | ||
| """Refresh documentation inventory on cog initialization.""" | ||
|
|
@@ -163,8 +252,11 @@ async def update_single( | |
| # Split `package_name` because of packages like Pillow that have spaces in them. | ||
| symbol = f"{api_package_name}.{symbol}" | ||
| self.renamed_symbols.add(symbol) | ||
|
|
||
| self.doc_symbols[symbol] = DocItem(base_url, relative_doc_url, api_package_name, group_name) | ||
| # TODO remove comment above, clean up | ||
| relative_url_path, _, symbol_id = relative_doc_url.partition("#") | ||
| symbol_item = DocItem(api_package_name, group_name, base_url, relative_url_path, symbol_id) | ||
| self.doc_symbols[symbol] = symbol_item | ||
| self.placeholder.add_item(symbol_item) | ||
|
|
||
| log.trace(f"Fetched inventory for {api_package_name}.") | ||
|
|
||
|
|
@@ -178,7 +270,7 @@ async def refresh_inventory(self) -> None: | |
| self.base_urls.clear() | ||
| self.doc_symbols.clear() | ||
| self.renamed_symbols.clear() | ||
| async_cache.cache = OrderedDict() | ||
| await self.placeholder.clear() | ||
|
|
||
| # Run all coroutines concurrently - since each of them performs a HTTP | ||
| # request, this speeds up fetching the inventory data heavily. | ||
|
|
@@ -198,13 +290,12 @@ async def get_symbol_embed(self, symbol: str) -> Optional[discord.Embed]: | |
| symbol_info = self.doc_symbols.get(symbol) | ||
| if symbol_info is None: | ||
| return None | ||
| self.bot.stats.incr(f"doc_fetches.{symbol_info.package.lower()}") | ||
| embed_description = await get_symbol_markdown(self.bot.http_session, symbol_info) | ||
|
|
||
| self.bot.stats.incr(f"doc_fetches.{symbol_info.package.lower()}") | ||
| embed = discord.Embed( | ||
| title=discord.utils.escape_markdown(symbol), | ||
| url=symbol_info.url, | ||
| description=embed_description | ||
| description=await self.placeholder.get_item(self.bot.http_session, symbol_info) | ||
| ) | ||
| # Show all symbols with the same name that were renamed in the footer. | ||
| embed.set_footer( | ||
|
|
@@ -217,6 +308,11 @@ async def docs_group(self, ctx: commands.Context, *, symbol: Optional[str]) -> N | |
| """Lookup documentation for Python symbols.""" | ||
| await ctx.invoke(self.get_command, symbol=symbol) | ||
|
|
||
| @commands.command() | ||
| async def command(self, ctx): | ||
| for symbol in ["arcade", "arcade"]: | ||
| await self.get_command(ctx, symbol=symbol) | ||
|
|
||
| @docs_group.command(name='getdoc', aliases=('g',)) | ||
| async def get_command(self, ctx: commands.Context, *, symbol: Optional[str]) -> None: | ||
| """ | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These changes replace the current implementation's approach of caching BeautifulSoup objects (python-discord#1014), which can pile up and potentially become a memory issue. Instead, we store the lookup ID of each symbol; when one symbol on a page is requested, all other symbols on that page are parsed alongside it, so the BeautifulSoup object can be thrown out once that's done.