Skip to content

Commit

Permalink
Merge pull request #139 from Lukas0907/derstandard
Browse files Browse the repository at this point in the history
derStandard.at: Add spider.
  • Loading branch information
Lukas0907 committed Jul 29, 2018
2 parents a9277cb + f0b4c92 commit ff1d2d3
Show file tree
Hide file tree
Showing 3 changed files with 154 additions and 0 deletions.
32 changes: 32 additions & 0 deletions docs/spiders/derstandard.at.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
.. _spider_derstandard.at:

derstandard.at
--------------
Newest articles from derStandard.at_.

Configuration
~~~~~~~~~~~~~
Add ``derstandard.at`` to the list of spiders:

.. code-block:: ini

   # List of spiders to run by default, one per line.
   spiders =
     derstandard.at

derstandard.at supports different ressorts via the ``ressorts`` parameter
(one per line). If no ressort is given, ``seite1`` is used.

Example configuration:

.. code-block:: ini

   [derstandard.at]
   ressorts =
     47
     4748
     etat
     immobilien

.. _derStandard.at: https://derstandard.at
7 changes: 7 additions & 0 deletions feeds.cfg.dist
Original file line number Diff line number Diff line change
Expand Up @@ -109,3 +109,10 @@ useragent = feeds (+https://github.com/nblock/feeds)
# vorarlberg
# tirol
# religion

#[derstandard.at]
#ressorts =
# 47
# 4748
# etat
# immobilien
115 changes: 115 additions & 0 deletions feeds/spiders/derstandard_at.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
import scrapy

from feeds.loaders import FeedEntryItemLoader
from feeds.spiders import FeedsXMLFeedSpider


class DerStandardAtSpider(FeedsXMLFeedSpider):
    """Spider for the newest articles from derStandard.at.

    One feed is produced per configured "ressort" (news section). A ressort
    may be given either as a numeric ID (``47``) or as a name (``etat``);
    the two forms map to different RSS query parameters.
    """

    name = "derstandard.at"
    allowed_domains = [name]

    _title = "derStandard.at"
    _subtitle = "Nachrichten in Echtzeit"
    _link = "https://{}".format(name)
    _icon = "https://at.staticfiles.at/sites/mainweb/img/icons/dst/dst-16.ico"
    _logo = "https://at.staticfiles.at/sites/mainweb/img/icons/dst/dst-228.png"
    # Feed title per ressort, filled lazily in parse_node() from the RSS
    # <channel><title> of each ressort's feed.
    _titles = {}
    # Some ressorts have articles that are regularly updated, e.g. cartoons,
    # so their article pages are fetched with caching disabled.
    _ressorts_uncached = ["47"]
    # Upper bound on article pages fetched per ressort and run.
    _max_articles = 10
    # Count of articles accepted so far, keyed by ressort.
    _ressorts_num_articles = {}

    def start_requests(self):
        """Yield one RSS feed request per configured ressort.

        Reads the ``ressorts`` option (whitespace-separated) from the spider
        settings; falls back to the general front-page ressort ``seite1``
        when none is configured.
        """
        self._ressorts = self.settings.get("FEEDS_SPIDER_DERSTANDARD_AT_RESSORTS")
        if self._ressorts:
            self._ressorts = self._ressorts.split()
        else:
            self.logger.info("No ressorts given, falling back to general ressort!")
            self._ressorts = ["seite1"]

        for ressort in self._ressorts:
            # Numeric values are ressort IDs, names are ressort slugs; the
            # site expects them under different query parameters.
            if str.isnumeric(ressort):
                param = "ressortid={}".format(ressort)
            else:
                param = "ressort={}".format(ressort)
            yield scrapy.Request(
                "https://{}/?page=rss&{}".format(self.name, param),
                # The RSS overview itself must always be fetched fresh; the
                # ressort is threaded through meta to the item callbacks.
                meta={"dont_cache": True, "ressort": ressort},
            )

    def feed_headers(self):
        """Yield one feed header per ressort.

        Relies on parse_node() having stored each ressort's channel title in
        ``self._titles`` first (presumably guaranteed by the base class
        calling this after parsing — TODO confirm against FeedsXMLFeedSpider).
        """
        for ressort in self._ressorts:
            yield self.generate_feed_header(title=self._titles[ressort], path=ressort)

    def parse_node(self, response, node):
        """Handle a single RSS <item>: filter it and request the article page.

        Called by the base XML feed spider once per item node.
        """
        # Remember the feed title for this ressort the first time we see it.
        if response.meta["ressort"] not in self._titles:
            self._titles[response.meta["ressort"]] = node.xpath(
                "//channel/title/text()"
            ).extract_first()

        url = node.xpath("link/text()").extract_first()
        # Live tickers are continuously rewritten; skip them entirely.
        if url.startswith("https://{}/jetzt/livebericht".format(self.name)):
            return

        # Enforce the per-ressort article cap.
        num_articles = self._ressorts_num_articles.get(response.meta["ressort"], 0)
        if num_articles >= self._max_articles:
            return
        self._ressorts_num_articles[response.meta["ressort"]] = num_articles + 1

        updated = node.xpath("pubDate/text()").extract_first()
        # Articles in "uncached" ressorts change after publication, so bypass
        # the HTTP cache for them.
        dont_cache = response.meta["ressort"] in self._ressorts_uncached
        yield scrapy.Request(
            url,
            self._parse_article,
            meta={
                "updated": updated,
                "ressort": response.meta["ressort"],
                "dont_cache": dont_cache,
            },
            # Pretend the GDPR consent banner was already accepted so the
            # article content is served.
            cookies={"DSGVO_ZUSAGE_V1": "true"},
        )

    def _parse_article(self, response):
        """Extract a feed entry from an article page.

        User-blog entries load their body via a separate JSON/HTML endpoint,
        so those are completed asynchronously in _parse_blog_article().
        """
        # Page clutter stripped from the extracted content.
        remove_elems = [
            ".credits",
            ".owner-info",
            ".image-zoom",
            ".continue",
            ".sequence-number",
        ]
        # Media lists are plain <div>s in the feed output.
        change_tags = {"#media-list li": "div", "#media-list": "div"}
        # Use the high-resolution zoom image as the inline image source.
        replace_regex = {r'<img[^>]+data-zoom-src="([^"]+)"': r'<img src="\1"'}
        # Embedded forum postings cannot be reproduced in the feed; leave a
        # German note pointing readers to the article instead.
        replace_elems = {
            ".embedded-posting": "<p><em>Hinweis: Das eingebettete Posting ist nur "
            + "im Artikel verfügbar.</em></p>"
        }
        il = FeedEntryItemLoader(
            response=response,
            base_url="https://{}".format(self.name),
            remove_elems=remove_elems,
            change_tags=change_tags,
            replace_regex=replace_regex,
            replace_elems=replace_elems,
        )
        il.add_value("link", response.url)
        il.add_css("title", 'meta[property="og:title"]::attr(content)')
        il.add_css("author_name", "span.author::text")
        il.add_value("path", response.meta["ressort"])
        il.add_value("updated", response.meta["updated"])
        # User-blog articles expose their entry ID; the body has to be
        # fetched from the blogging delivery endpoint.
        blog_id = response.css("#userblogentry::attr(data-objectid)").extract_first()
        if blog_id:
            url = (
                "https://{}/userprofil/bloggingdelivery/blogeintrag?godotid={}"
            ).format(self.name, blog_id)
            # Pass the partially filled loader along to be completed.
            yield scrapy.Request(url, self._parse_blog_article, meta={"il": il})
        else:
            # Regular articles: collect content from the known containers
            # (aside, copytext variants, image slides).
            il.add_css("content_html", "#content-aside")
            il.add_css("content_html", "#objectContent > .copytext")
            il.add_css("content_html", "#content-main > .copytext")
            il.add_css("content_html", ".slide")
            yield il.load_item()

    def _parse_blog_article(self, response):
        """Finish a user-blog entry with the body fetched from the delivery endpoint."""
        il = response.meta["il"]
        # The endpoint returns the blog body as an HTML fragment.
        il.add_value("content_html", response.text)
        yield il.load_item()

0 comments on commit ff1d2d3

Please sign in to comment.