Add spider for gnucash.org
Florian Preinstorfer committed Jul 23, 2018
1 parent c2fe14d commit 0af56d0
Showing 2 changed files with 64 additions and 0 deletions.
16 changes: 16 additions & 0 deletions docs/spiders/gnucash.org.rst
@@ -0,0 +1,16 @@
.. _spider_gnucash.org:

gnucash.org
-----------
News from the `GnuCash <https://www.gnucash.org>`_ project.

Configuration
~~~~~~~~~~~~~
Add ``gnucash.org`` to the list of spiders:

.. code-block:: ini

   # List of spiders to run by default, one per line.
   spiders =
     gnucash.org
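
Once the spider is enabled, a crawl can be started from the command line.
A minimal sketch (not part of this commit), assuming the ``feeds`` CLI with
a ``--config`` option and a ``feeds.cfg`` file as described in the project
README:

.. code-block:: bash

   $ feeds --config feeds.cfg crawl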
48 changes: 48 additions & 0 deletions feeds/spiders/gnucash_org.py
@@ -0,0 +1,48 @@
import scrapy

from feeds.loaders import FeedEntryItemLoader
from feeds.spiders import FeedsXMLFeedSpider


class GnucashOrgSpider(FeedsXMLFeedSpider):
    name = "gnucash.org"
    allowed_domains = [name]
    start_urls = ["https://www.{}/atom.php".format(name)]

    namespaces = [("atom", "http://www.w3.org/2005/Atom")]
    iterator = "xml"
    itertag = "atom:entry"

    _title = "GnuCash News"
    _subtitle = "GnuCash is personal and small-business financial-accounting software."
    _link = "https://www.{}".format(name)
    _icon = "https://www.{}/images/icons/gnc-icon-129x129.png".format(name)
    _logo = "https://www.{}/externals/logo_w120.png".format(name)

    def parse_node(self, response, node):
        # Reuse most of the existing fields
        il = FeedEntryItemLoader(selector=node, base_url=self._link)
        il.add_xpath("title", "atom:title/text()")
        il.add_xpath("link", "atom:link/@href")
        il.add_xpath("author_name", "atom:author/atom:name/text()")
        il.add_xpath("author_email", "atom:author/atom:email/text()")
        il.add_xpath("updated", "atom:updated/text()")

        # All news items are stored on a single page and may be referred to via
        # an ID. Extract an item's id and use it to subsequently extract the
        # corresponding news text.
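        # Hypothetical example of the split (the real path and fragment
        # format are assumptions): "https://www.gnucash.org/news.phtml#n-123"
        # becomes ("https://www.gnucash.org/news.phtml", "n-123").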
        url, news_id = node.xpath("atom:link/@href").extract_first().split("#")
        yield scrapy.Request(
            url, self._parse_news, dont_filter=True, meta={"news_id": news_id, "il": il}
        )

    def _parse_news(self, response):
        il = FeedEntryItemLoader(response=response, parent=response.meta["il"])
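        # The news page is assumed to contain blocks shaped like
        #   <div class="newsheader">... <a id="{news_id}"></a> ...</div>
        #   <div class="newsinner">... full news text ...</div>
        # (inferred from the XPath below): the entry body is the "newsinner"
        # div following the header that carries the entry's anchor ID.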
        il.add_xpath(
            "content_html",
            '//div[@class="newsheader" and .//a[@id="{}"]]'
            '/following-sibling::div[@class="newsinner"]'.format(
                response.meta["news_id"]
            ),
        )
        yield il.load_item()
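
For reference, a minimal standalone sketch of the content-extraction XPath,
run against hypothetical markup (element structure and IDs are assumptions
inferred from the spider above, not taken from gnucash.org):

from scrapy.selector import Selector

# Hypothetical page markup mirroring the structure the spider expects.
HTML = """
<div class="newsheader"><a id="n-1"></a><b>News item one</b></div>
<div class="newsinner"><p>Full text of news item one ...</p></div>
<div class="newsheader"><a id="n-2"></a><b>News item two</b></div>
<div class="newsinner"><p>Full text of news item two ...</p></div>
"""

def extract_news(html, news_id):
    # Same expression as in _parse_news: locate the header containing the
    # anchor with the given ID, then take the news body that follows it.
    # extract_first() keeps the nearest following "newsinner" sibling.
    return Selector(text=html).xpath(
        '//div[@class="newsheader" and .//a[@id="{}"]]'
        '/following-sibling::div[@class="newsinner"]'.format(news_id)
    ).extract_first()

print(extract_news(HTML, "n-2"))
# -> <div class="newsinner"><p>Full text of news item two ...</p></div>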
