Add spider for gnucash.org
Florian Preinstorfer committed Jul 23, 2018
1 parent c2fe14d commit 0af56d0
Showing 2 changed files with 64 additions and 0 deletions.
16 changes: 16 additions & 0 deletions docs/spiders/gnucash.org.rst
@@ -0,0 +1,16 @@
.. _spider_gnucash.org:

gnucash.org
-----------
News from the `GnuCash <https://www.gnucash.org>`_ project.

Configuration
~~~~~~~~~~~~~
Add ``gnucash.org`` to the list of spiders:

.. code-block:: ini

   # List of spiders to run by default, one per line.
   spiders =
     gnucash.org
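
Once the spider is enabled, a crawl can be started from the command line.
A minimal sketch (not part of this commit), assuming the ``feeds`` CLI with
a ``--config`` option and a ``feeds.cfg`` file as described in the project
README:

.. code-block:: bash

   $ feeds --config feeds.cfg crawl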
48 changes: 48 additions & 0 deletions feeds/spiders/gnucash_org.py
@@ -0,0 +1,48 @@
import scrapy

from feeds.loaders import FeedEntryItemLoader
from feeds.spiders import FeedsXMLFeedSpider


class GnucashOrgSpider(FeedsXMLFeedSpider):
    name = "gnucash.org"
    allowed_domains = [name]
    start_urls = ["https://www.{}/atom.php".format(name)]

    namespaces = [("atom", "http://www.w3.org/2005/Atom")]
    iterator = "xml"
    itertag = "atom:entry"

    _title = "GnuCash News"
    _subtitle = "GnuCash is personal and small-business financial-accounting software."
    _link = "https://www.{}".format(name)
    _icon = "https://www.{}/images/icons/gnc-icon-129x129.png".format(name)
    _logo = "https://www.{}/externals/logo_w120.png".format(name)

    def parse_node(self, response, node):
        # Reuse most of the existing fields
        il = FeedEntryItemLoader(selector=node, base_url=self._link)
        il.add_xpath("title", "atom:title/text()")
        il.add_xpath("link", "atom:link/@href")
        il.add_xpath("author_name", "atom:author/atom:name/text()")
        il.add_xpath("author_email", "atom:author/atom:email/text()")
        il.add_xpath("updated", "atom:updated/text()")

        # All news items are stored on a single page and may be referred to via
        # an ID. Extract an item's id and use it to subsequently extract the
        # corresponding news text.
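        # Hypothetical example of the split (the real path and fragment
        # format are assumptions): "https://www.gnucash.org/news.phtml#n-123"
        # becomes ("https://www.gnucash.org/news.phtml", "n-123").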
        url, news_id = node.xpath("atom:link/@href").extract_first().split("#")
        yield scrapy.Request(
            url, self._parse_news, dont_filter=True, meta={"news_id": news_id, "il": il}
        )

    def _parse_news(self, response):
        il = FeedEntryItemLoader(response=response, parent=response.meta["il"])
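        # The news page is assumed to contain blocks shaped like
        #   <div class="newsheader">... <a id="{news_id}"></a> ...</div>
        #   <div class="newsinner">... full news text ...</div>
        # (inferred from the XPath below): the entry body is the "newsinner"
        # div following the header that carries the entry's anchor ID.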
        il.add_xpath(
            "content_html",
            '//div[@class="newsheader" and .//a[@id="{}"]]'
            '/following-sibling::div[@class="newsinner"]'.format(
                response.meta["news_id"]
            ),
        )
        yield il.load_item()
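
For reference, a minimal standalone sketch of the content-extraction XPath,
run against hypothetical markup (element structure and IDs are assumptions
inferred from the spider above, not taken from gnucash.org):

from scrapy.selector import Selector

# Hypothetical page markup mirroring the structure the spider expects.
HTML = """
<div class="newsheader"><a id="n-1"></a><b>News item one</b></div>
<div class="newsinner"><p>Full text of news item one ...</p></div>
<div class="newsheader"><a id="n-2"></a><b>News item two</b></div>
<div class="newsinner"><p>Full text of news item two ...</p></div>
"""

def extract_news(html, news_id):
    # Same expression as in _parse_news: locate the header containing the
    # anchor with the given ID, then take the news body that follows it.
    # extract_first() keeps the nearest following "newsinner" sibling.
    return Selector(text=html).xpath(
        '//div[@class="newsheader" and .//a[@id="{}"]]'
        '/following-sibling::div[@class="newsinner"]'.format(news_id)
    ).extract_first()

print(extract_news(HTML, "n-2"))
# -> <div class="newsinner"><p>Full text of news item two ...</p></div>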
