Merge pull request #106 from Lukas0907/dev/next
Dev/next
nblock committed Sep 30, 2017
2 parents 48d5562 + 39a4cb0 commit d5b58d2
Showing 4 changed files with 99 additions and 9 deletions.
12 changes: 6 additions & 6 deletions docs/spiders.rst
@@ -11,12 +11,12 @@ Feeds is currently able to create Atom feeds for the following websites:
    spiders/*

 Some sites (:ref:`Falter <spider_falter.at>`, :ref:`Konsument
-<spider_konsument.at>`, :ref:`LWN <spider_lwn.net>`, :ref:`Übermedien
-<spider_uebermedien.com>`) offer articles only behind a paywall. If you have a
-paid subscription, you can configure your username and password in
-``feeds.cfg`` and also read paywalled articles from within your feed reader.
-For the less fortunate who don't have a subscription, paywalled articles are
-tagged with ``paywalled`` so they can be filtered, if desired.
+<spider_konsument.at>`, :ref:`LWN <spider_lwn.net>`) offer articles only
+behind a paywall. If you have a paid subscription, you can configure your
+username and password in ``feeds.cfg`` and also read paywalled articles from
+within your feed reader. For the less fortunate who don't have a
+subscription, paywalled articles are tagged with ``paywalled`` so they can be
+filtered, if desired.

 All feeds contain the articles in full text so you never have to leave your
 feed reader while reading.
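
For reference, enabling one of these paywalled spiders in ``feeds.cfg`` might look like the sketch below. The per-spider section with ``username`` and ``password`` options follows the project's README; the values are placeholders, not part of this commit:

```ini
[feeds]
spiders =
    falter.at

# Credentials for the paywalled site (placeholders).
[falter.at]
username = your-username
password = your-password
```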
16 changes: 16 additions & 0 deletions docs/spiders/addendum.org.rst
@@ -0,0 +1,16 @@
+.. _spider_addendum.org:
+
+addendum.org
+------------
+Newest articles from `Addendum <https://www.addendum.org>`_.
+
+Configuration
+~~~~~~~~~~~~~
+Add ``addendum.org`` to the list of spiders:
+
+.. code-block:: ini
+
+   # List of spiders to run by default, one per line.
+   spiders =
+     addendum.org
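
In a complete ``feeds.cfg`` the option sits in the general section; assuming the ``[feeds]`` section name used elsewhere in the project's documentation, a minimal configuration would be:

```ini
[feeds]
# List of spiders to run by default, one per line.
spiders =
    addendum.org
```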
19 changes: 16 additions & 3 deletions feeds/loaders.py
@@ -1,4 +1,5 @@
 import os
+import re

 from lxml import etree
 from lxml.cssselect import CSSSelector
@@ -33,6 +34,13 @@ def parse_datetime(text, loader_context):
                           timezone=loader_context.get('timezone', 'UTC'))


+def replace_regex(text, loader_context):
+    for pattern, repl in loader_context.get('replace_regex', {}).items():
+        text = re.sub(pattern, repl, text)
+
+    return text
+
+
 def build_tree(text, loader_context):
     base_url = loader_context.get('base_url', None)
     tree = lxml.html.fragment_fromstring(text, create_parent='div',
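
A quick sketch of what the new ``replace_regex`` input processor does; Scrapy's ``MapCompose`` passes the loader context in automatically, but the function can also be called directly. The pattern below is invented for illustration:

```python
from feeds.loaders import replace_regex

# Hypothetical context; spiders supply 'replace_regex' via the item loader.
context = {'replace_regex': {r'\s+': ' '}}
print(replace_regex('hello   world', context))  # -> 'hello world'
```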
@@ -85,6 +93,10 @@ def cleanup_html(tree, loader_context):
         # in the feed.
         elem.attrib.pop('class', None)
         elem.attrib.pop('id', None)
+        # Delete data- attributes that have no general meaning.
+        for attrib in list(elem.attrib.keys()):
+            if attrib.startswith('data-'):
+                elem.attrib.pop(attrib)

     return [tree]
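The added loop drops every ``data-*`` attribute from an element. A standalone sketch of the same idiom, assuming nothing beyond lxml:

```python
import lxml.html

elem = lxml.html.fromstring('<p data-track="1" class="x">Hi</p>')
# list() copies the keys so attrib can be mutated while iterating.
for attrib in list(elem.attrib.keys()):
    if attrib.startswith('data-'):
        elem.attrib.pop(attrib)
print(lxml.html.tostring(elem))  # b'<p class="x">Hi</p>'
```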
@@ -167,9 +179,10 @@ class FeedEntryItemLoader(BaseItemLoader):
     content_text_in = MapCompose(skip_false, str.strip, remove_tags)
     content_text_out = Join('\n')

-    content_html_in = MapCompose(skip_false, build_tree, convert_footnotes,
-                                 cleanup_html, skip_empty_tree,
-                                 make_links_absolute, serialize_tree)
+    content_html_in = MapCompose(skip_false, replace_regex, build_tree,
+                                 convert_footnotes, cleanup_html,
+                                 skip_empty_tree, make_links_absolute,
+                                 serialize_tree)
     content_html_out = Join()

     category_out = Identity()
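``replace_regex`` is placed first in the chain so the regular expressions run on the raw HTML string before ``build_tree`` parses it into a tree. ``MapCompose`` applies its functions left to right to each value, as this small sketch shows:

```python
from scrapy.loader.processors import MapCompose

proc = MapCompose(str.strip, str.upper)
print(proc(['  a  ', '  b  ']))  # -> ['A', 'B']
```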
61 changes: 61 additions & 0 deletions feeds/spiders/addendum_org.py
@@ -0,0 +1,61 @@
+import scrapy
+
+from feeds.loaders import FeedEntryItemLoader
+from feeds.spiders import FeedsSpider
+
+
+class AddendumOrgSpider(FeedsSpider):
+    name = 'addendum.org'
+    allowed_domains = [name]
+    start_urls = ['https://www.addendum.org/projekte-ubersicht/']
+
+    _title = 'Addendum'
+    _subtitle = 'das, was fehlt'
+    _link = 'https://www.{}'.format(name)
+    _icon = ('https://www.{}/resources/dist/favicons/'
+             'android-chrome-192x192.png').format(name)
+    _timezone = 'Europe/Vienna'
+
+    def parse(self, response):
+        url = response.css('section::attr(data-url-project)').extract_first()
+        yield scrapy.Request(url, self.parse_item, meta={'dont_cache': True})
+
+    def parse_item(self, response):
+        # First URL is the overview page.
+        for url in (
+                response.css('.projectNav__meta a::attr(href)').extract()[1:]):
+            yield scrapy.Request(url, self._parse_article)
+
+    def _parse_article(self, response):
+        remove_elems = [
+            '.projectNav', 'h1', '.socialMedia__headline', '.whyRead',
+            '.overlayCTA', '.authors', '.socialMedia', '.sidebar',
+            '.sectionBackground--colorTheme1', '.heroStage__copyright',
+            '.heroStage__downLink', 'script', 'iframe', '.image__zoom ',
+            '.image__copyrightWrapper'
+        ]
+        change_tags = {
+            'div.heroStage__introText': 'strong',
+            'figcaption': 'i',
+            'figure': 'div'
+        }
+        replace_regex = {
+            r'<span data-src="([^"]+)"></span>.*?' +
+            r'<span data-src="([^"]+)" data-min-width="1000">':
+            r'<a href="\2"><img src="\1"></a>',
+            r'<div style=".*?"><video.*?></video>.*?</div></div>':
+            '<em>Das eingebettete Video ist nur im Artikel verfügbar.</em>',
+        }
+        il = FeedEntryItemLoader(response=response,
+                                 timezone=self._timezone,
+                                 base_url='https://www.{}'.format(self.name),
+                                 remove_elems=remove_elems,
+                                 change_tags=change_tags,
+                                 replace_regex=replace_regex)
+        il.add_value('link', response.url)
+        il.add_value('author_name', 'Addendum')
+        il.add_css('title', 'meta[property="og:title"]::attr(content)')
+        il.add_css('updated',
+                   'meta[property="og:updated_time"]::attr(content)')
+        il.add_css('content_html', '.content')
+        yield il.load_item()
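To see what the first ``replace_regex`` rule achieves, here it is applied to invented markup mimicking Addendum's lazy-loaded responsive images (the file names are made up). The low-resolution placeholder becomes an ``<img>`` linked to the high-resolution variant; the stray ``</span>`` left behind is tolerated when ``build_tree`` later parses the fragment:

```python
import re

html = ('<span data-src="small.jpg"></span>'
        '<span data-src="medium.jpg" data-min-width="600"></span>'
        '<span data-src="large.jpg" data-min-width="1000"></span>')
pattern = (r'<span data-src="([^"]+)"></span>.*?'
           r'<span data-src="([^"]+)" data-min-width="1000">')
print(re.sub(pattern, r'<a href="\2"><img src="\1"></a>', html))
# -> <a href="large.jpg"><img src="small.jpg"></a></span>
```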
