Skip to content

Commit

Permalink
Merge pull request #205 from Lukas0907/next
Browse files Browse the repository at this point in the history
Fix flatten tree, add spider for Wiener Zeitung
  • Loading branch information
Lukas0907 committed Jul 23, 2019
2 parents 754a095 + b287b21 commit 8cdcbc1
Show file tree
Hide file tree
Showing 4 changed files with 147 additions and 0 deletions.
31 changes: 31 additions & 0 deletions docs/spiders/wienerzeitung.at.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
.. _spider_wienerzeitung.at:

wienerzeitung.at
----------------
Newest articles from `Wiener Zeitung`_.

Configuration
~~~~~~~~~~~~~
Add ``wienerzeitung.at`` to the list of spiders:

.. code-block:: ini

   # List of spiders to run by default, one per line.
   spiders =
     wienerzeitung.at

wienerzeitung.at supports different ressorts (sections of the site) via the
``ressorts`` parameter (one per line).

Example configuration:

.. code-block:: ini

   [wienerzeitung.at]
   ressorts =
     nachrichten/politik/wien
     nachrichten/politik
     nachrichten/wirtschaft
     meinung

.. _`Wiener Zeitung`: https://www.wienerzeitung.at
7 changes: 7 additions & 0 deletions feeds.cfg.dist
Original file line number Diff line number Diff line change
Expand Up @@ -168,3 +168,10 @@ useragent = feeds (+https://github.com/pyfeeds/pyfeeds)
#[spotify.com]
#shows =
# 6u7pI0o0CUBQq0T1fwPgbj

#[wienerzeitung.at]
#ressorts =
# nachrichten/politik/wien
# nachrichten/politik
# nachrichten/wirtschaft
# meinung
1 change: 1 addition & 0 deletions feeds/loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,7 @@ def flatten_tree(tree):
return None
elif (
only_child is not None
and (tree.text is None or tree.text.strip() == "")
and only_child.tag == tree.tag
and tree.getparent() is not None
):
Expand Down
108 changes: 108 additions & 0 deletions feeds/spiders/wienerzeitung_at.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
from urllib.parse import urlparse

import scrapy

from feeds.loaders import FeedEntryItemLoader
from feeds.spiders import FeedsSpider
from feeds.utils import generate_feed_header


class WienerZeitungAtSpider(FeedsSpider):
    """Spider that builds one feed per configured ressort of wienerzeitung.at.

    The ressort overview pages are fetched first; every article linked from
    them is then requested and converted into a feed entry whose ``path`` is
    the ressort it was discovered in.
    """

    name = "wienerzeitung.at"

    # Maps a ressort path (e.g. "nachrichten/politik") to the link text found
    # in article breadcrumbs; used to build the feed titles.
    _titles = {}
    # Ressort paths to scrape; filled from the settings in start_requests().
    _ressorts = set()

    def start_requests(self):
        """Yield one request per ressort listed in the spider settings.

        The ressorts are read from ``FEEDS_SPIDER_WIENERZEITUNG_AT_RESSORTS``
        (whitespace separated). If the setting is missing or empty an error
        is logged and no request is produced.
        """
        ressorts = self.settings.get("FEEDS_SPIDER_WIENERZEITUNG_AT_RESSORTS")
        if not ressorts:
            self.logger.error("No ressorts given!")
            return

        self._ressorts = set(ressorts.split())
        for ressort in self._ressorts:
            yield scrapy.Request(
                "https://www.{}/{}".format(self.name, ressort),
                # The ressort travels in the request meta so that articles
                # can later be assigned to the feed they belong to.
                meta={"dont_cache": True, "ressort": ressort},
            )

    def feed_headers(self):
        """Yield a feed header for every configured ressort.

        The feed title uses the breadcrumb text collected while parsing
        articles and falls back to the raw ressort path if none was seen.
        """
        for ressort in self._ressorts:
            yield generate_feed_header(
                title="Wiener Zeitung › {}".format(self._titles.get(ressort, ressort)),
                link="https://www.{}".format(self.name),
                icon="https://www.{}/_em_daten/wzo/favicon.ico".format(self.name),
                logo="https://www.{}/_em_daten/wzo/_layout/logo_rss.png".format(
                    self.name
                ),
                path=ressort,
            )

    def parse(self, response):
        """Request every article linked from a ressort overview page.

        Each article URL gets ``?em_no_split=1`` appended (presumably to
        receive the article unsplit on a single page -- TODO confirm).
        """
        links = response.css(
            ".topnews-headline::attr(href), .card-title::attr(href)"
        ).extract()
        for link in links:
            yield scrapy.Request(
                # urljoin() leaves absolute links untouched and resolves
                # relative ones, which scrapy.Request would otherwise reject.
                response.urljoin(link) + "?em_no_split=1",
                self._parse_article,
                meta={"ressort": response.meta["ressort"]},
            )

    def _parse_article(self, response):
        """Turn an article page into a feed entry item.

        As a side effect the breadcrumb titles found on the page are merged
        into ``self._titles`` so that feed_headers() can use them.
        """

        def _fix_img_src(elem):
            """Point ``src`` at the lazy-loading source, preferring retina."""
            if "data-src-retina" in elem.attrib:
                elem.attrib["src"] = elem.attrib["data-src-retina"]
            elif "data-src" in elem.attrib:
                elem.attrib["src"] = elem.attrib["data-src"]
            return elem

        def _parse_breadcrumbs(breadcrumbs):
            """Return a ``{ressort path: link text}`` dict for the breadcrumbs."""
            # The extracted list is expected to alternate href, text for each
            # anchor; the first pair ("Startseite") is skipped.
            links = breadcrumbs.css("a::text, a::attr('href')").extract()
            # Strip the surrounding "/" of the URL path. strip() instead of a
            # fixed [1:-1] slice so that a path without a trailing slash does
            # not lose its last character.
            return {
                urlparse(href).path.strip("/"): text
                for href, text in zip(links[2::2], links[3::2])
            }

        breadcrumbs = _parse_breadcrumbs(response.css(".breadcrumb a"))
        # Rebind instead of mutating in place: the class-level dict stays
        # untouched and the merged dict lives on the instance.
        self._titles = {**self._titles, **breadcrumbs}

        remove_elems = [
            "noscript",
            "h1",
            ".article-meta",
            ".article-header > span.tag",
            ".figure-copyright",
            ".new-pictures",
            ".author-item",
            ".related-articles",
            'div[data-type="advert"]',
            ".hidden",
            ".article-keywords",
            ".caption-socials",
            ".caption-text > small.d-block",
        ]
        change_tags = {
            ".article-subtitle": "strong",
            "aside": "blockquote",
            "span[style='font-weight: bold;']": "strong",
            "span[style='font-style: italic;']": "em",
            ".container-inner": "blockquote",
        }
        replace_elems = {"img": _fix_img_src}
        il = FeedEntryItemLoader(
            response=response,
            base_url="https://www.{}".format(self.name),
            remove_elems=remove_elems,
            change_tags=change_tags,
            replace_elems=replace_elems,
            timezone="Europe/Vienna",
        )
        il.add_value("link", response.url)
        il.add_css("title", 'meta[property="og:title"]::attr(content)')
        il.add_css("author_name", ".author-headline ::text")
        il.add_value("path", response.meta["ressort"])
        # Categories come from both the breadcrumb titles and the keywords.
        il.add_value("category", breadcrumbs.values())
        il.add_css("category", ".article-keywords li ::text")
        # Both the "updated" and the "published" timestamps are offered to
        # the loader for the "updated" field.
        il.add_css("updated", ".article-updated time::attr('datetime')")
        il.add_css("updated", ".article-published time::attr('datetime')")
        il.add_css("content_html", "article")
        return il.load_item()

0 comments on commit 8cdcbc1

Please sign in to comment.