Merge pull request #106 from Lukas0907/dev/next
Dev/next
nblock committed Sep 30, 2017
2 parents 48d5562 + 39a4cb0 commit d5b58d2
Showing 4 changed files with 99 additions and 9 deletions.
12 changes: 6 additions & 6 deletions docs/spiders.rst
@@ -11,12 +11,12 @@ Feeds is currently able to create Atom feeds for the following websites:
    spiders/*

 Some sites (:ref:`Falter <spider_falter.at>`, :ref:`Konsument
-<spider_konsument.at>`, :ref:`LWN <spider_lwn.net>`, :ref:`Übermedien
-<spider_uebermedien.com>`) offer articles only behind a paywall. If you have a
-paid subscription, you can configure your username and password in
-``feeds.cfg`` and also read paywalled articles from within your feed reader.
-For the less fortunate who don't have a subscription, paywalled articles are
-tagged with ``paywalled`` so they can be filtered, if desired.
+<spider_konsument.at>`, :ref:`LWN <spider_lwn.net>`) offer articles only
+behind a paywall. If you have a paid subscription, you can configure your
+username and password in ``feeds.cfg`` and also read paywalled articles from
+within your feed reader. For the less fortunate who don't have a
+subscription, paywalled articles are tagged with ``paywalled`` so they can be
+filtered, if desired.

 All feeds contain the articles in full text so you never have to leave your
 feed reader while reading.
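
For reference, enabling one of these paywalled spiders in ``feeds.cfg`` might look like the sketch below. The per-spider section with ``username`` and ``password`` options follows the project's README; the values are placeholders, not part of this commit:

```ini
[feeds]
spiders =
    falter.at

# Credentials for the paywalled site (placeholders).
[falter.at]
username = your-username
password = your-password
```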
16 changes: 16 additions & 0 deletions docs/spiders/addendum.org.rst
@@ -0,0 +1,16 @@
+.. _spider_addendum.org:
+
+addendum.org
+------------
+Newest articles from `Addendum <https://www.addendum.org>`_.
+
+Configuration
+~~~~~~~~~~~~~
+Add ``addendum.org`` to the list of spiders:
+
+.. code-block:: ini
+
+   # List of spiders to run by default, one per line.
+   spiders =
+     addendum.org
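
In a complete ``feeds.cfg`` the option sits in the general section; assuming the ``[feeds]`` section name used elsewhere in the project's documentation, a minimal configuration would be:

```ini
[feeds]
# List of spiders to run by default, one per line.
spiders =
    addendum.org
```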
19 changes: 16 additions & 3 deletions feeds/loaders.py
@@ -1,4 +1,5 @@
 import os
+import re

 from lxml import etree
 from lxml.cssselect import CSSSelector
@@ -33,6 +34,13 @@ def parse_datetime(text, loader_context):
                           timezone=loader_context.get('timezone', 'UTC'))


+def replace_regex(text, loader_context):
+    for pattern, repl in loader_context.get('replace_regex', {}).items():
+        text = re.sub(pattern, repl, text)
+
+    return text
+
+
 def build_tree(text, loader_context):
     base_url = loader_context.get('base_url', None)
     tree = lxml.html.fragment_fromstring(text, create_parent='div',
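
A quick sketch of what the new ``replace_regex`` input processor does; Scrapy's ``MapCompose`` passes the loader context in automatically, but the function can also be called directly. The pattern below is invented for illustration:

```python
from feeds.loaders import replace_regex

# Hypothetical context; spiders supply 'replace_regex' via the item loader.
context = {'replace_regex': {r'\s+': ' '}}
print(replace_regex('hello   world', context))  # -> 'hello world'
```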
@@ -85,6 +93,10 @@ def cleanup_html(tree, loader_context):
         # in the feed.
         elem.attrib.pop('class', None)
         elem.attrib.pop('id', None)
+        # Delete data- attributes that have no general meaning.
+        for attrib in list(elem.attrib.keys()):
+            if attrib.startswith('data-'):
+                elem.attrib.pop(attrib)

     return [tree]
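The added loop drops every ``data-*`` attribute from an element. A standalone sketch of the same idiom, assuming nothing beyond lxml:

```python
import lxml.html

elem = lxml.html.fromstring('<p data-track="1" class="x">Hi</p>')
# list() copies the keys so attrib can be mutated while iterating.
for attrib in list(elem.attrib.keys()):
    if attrib.startswith('data-'):
        elem.attrib.pop(attrib)
print(lxml.html.tostring(elem))  # b'<p class="x">Hi</p>'
```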
@@ -167,9 +179,10 @@ class FeedEntryItemLoader(BaseItemLoader):
     content_text_in = MapCompose(skip_false, str.strip, remove_tags)
     content_text_out = Join('\n')

-    content_html_in = MapCompose(skip_false, build_tree, convert_footnotes,
-                                 cleanup_html, skip_empty_tree,
-                                 make_links_absolute, serialize_tree)
+    content_html_in = MapCompose(skip_false, replace_regex, build_tree,
+                                 convert_footnotes, cleanup_html,
+                                 skip_empty_tree, make_links_absolute,
+                                 serialize_tree)
     content_html_out = Join()

     category_out = Identity()
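``replace_regex`` is placed first in the chain so the regular expressions run on the raw HTML string before ``build_tree`` parses it into a tree. ``MapCompose`` applies its functions left to right to each value, as this small sketch shows:

```python
from scrapy.loader.processors import MapCompose

proc = MapCompose(str.strip, str.upper)
print(proc(['  a  ', '  b  ']))  # -> ['A', 'B']
```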
61 changes: 61 additions & 0 deletions feeds/spiders/addendum_org.py
@@ -0,0 +1,61 @@
+import scrapy
+
+from feeds.loaders import FeedEntryItemLoader
+from feeds.spiders import FeedsSpider
+
+
+class AddendumOrgSpider(FeedsSpider):
+    name = 'addendum.org'
+    allowed_domains = [name]
+    start_urls = ['https://www.addendum.org/projekte-ubersicht/']
+
+    _title = 'Addendum'
+    _subtitle = 'das, was fehlt'
+    _link = 'https://www.{}'.format(name)
+    _icon = ('https://www.{}/resources/dist/favicons/'
+             'android-chrome-192x192.png').format(name)
+    _timezone = 'Europe/Vienna'
+
+    def parse(self, response):
+        url = response.css('section::attr(data-url-project)').extract_first()
+        yield scrapy.Request(url, self.parse_item, meta={'dont_cache': True})
+
+    def parse_item(self, response):
+        # First URL is the overview page.
+        for url in (
+                response.css('.projectNav__meta a::attr(href)').extract()[1:]):
+            yield scrapy.Request(url, self._parse_article)
+
+    def _parse_article(self, response):
+        remove_elems = [
+            '.projectNav', 'h1', '.socialMedia__headline', '.whyRead',
+            '.overlayCTA', '.authors', '.socialMedia', '.sidebar',
+            '.sectionBackground--colorTheme1', '.heroStage__copyright',
+            '.heroStage__downLink', 'script', 'iframe', '.image__zoom ',
+            '.image__copyrightWrapper'
+        ]
+        change_tags = {
+            'div.heroStage__introText': 'strong',
+            'figcaption': 'i',
+            'figure': 'div'
+        }
+        replace_regex = {
+            r'<span data-src="([^"]+)"></span>.*?' +
+            r'<span data-src="([^"]+)" data-min-width="1000">':
+            r'<a href="\2"><img src="\1"></a>',
+            r'<div style=".*?"><video.*?></video>.*?</div></div>':
+            '<em>Das eingebettete Video ist nur im Artikel verfügbar.</em>',
+        }
+        il = FeedEntryItemLoader(response=response,
+                                 timezone=self._timezone,
+                                 base_url='https://www.{}'.format(self.name),
+                                 remove_elems=remove_elems,
+                                 change_tags=change_tags,
+                                 replace_regex=replace_regex)
+        il.add_value('link', response.url)
+        il.add_value('author_name', 'Addendum')
+        il.add_css('title', 'meta[property="og:title"]::attr(content)')
+        il.add_css('updated',
+                   'meta[property="og:updated_time"]::attr(content)')
+        il.add_css('content_html', '.content')
+        yield il.load_item()
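To see what the first ``replace_regex`` rule achieves, here it is applied to invented markup mimicking Addendum's lazy-loaded responsive images (the file names are made up). The low-resolution placeholder becomes an ``<img>`` linked to the high-resolution variant; the stray ``</span>`` left behind is tolerated when ``build_tree`` later parses the fragment:

```python
import re

html = ('<span data-src="small.jpg"></span>'
        '<span data-src="medium.jpg" data-min-width="600"></span>'
        '<span data-src="large.jpg" data-min-width="1000"></span>')
pattern = (r'<span data-src="([^"]+)"></span>.*?'
           r'<span data-src="([^"]+)" data-min-width="1000">')
print(re.sub(pattern, r'<a href="\2"><img src="\1"></a>', html))
# -> <a href="large.jpg"><img src="small.jpg"></a></span>
```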
