Merge pull request #189 from Lukas0907/next
Extraction fixes; derStandard.at improvements
Lukas0907 committed Dec 13, 2018
2 parents d3b7f6c + 325a857 commit ec2f493
Showing 6 changed files with 110 additions and 32 deletions.
6 changes: 6 additions & 0 deletions docs/spiders/derstandard.at.rst
@@ -17,6 +17,9 @@ Add ``derstandard.at`` to the list of spiders:
derstandard.at supports different ressorts via the ``ressorts`` parameter
(one per line). If no ressort is given, ``seite1`` is used.

The spider also supports user postings via the ``users`` parameter
(one per line).

Example configuration:

.. code-block:: ini
@@ -27,6 +30,9 @@ Example configuration:
4748
etat
immobilien
users =
4894
571924
.. _derStandard.at: https://derstandard.at
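
A minimal sketch (not part of this commit) of how a multi-line ``users`` value like the one above reaches the spider: the change to derstandard_at.py further down splits the raw setting on whitespace, so every line of the ini value becomes one user id.

# Hypothetical raw value as read from the config; the ids match the example above.
raw_users = "\n    4894\n    571924"
users = {user_id: None for user_id in raw_users.split()}
print(users)  # {'4894': None, '571924': None}
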
2 changes: 2 additions & 0 deletions feeds.cfg.dist
@@ -121,6 +121,8 @@ useragent = feeds (+https://github.com/nblock/feeds)
# 4748
# etat
# immobilien
#users =
# 571924

#[arstechnica.com]
#channels =
2 changes: 1 addition & 1 deletion feeds/spiders/addendum_org.py
@@ -102,7 +102,7 @@ def _inline_picture(elem):
(
video
for video in api_response["sources"]
if "src" in video and video["container"] == "MP4"
if "src" in video and video.get("container") == "MP4"
),
key=lambda v: v["size"],
)[-1]["src"]
56 changes: 55 additions & 1 deletion feeds/spiders/derstandard_at.py
@@ -1,5 +1,5 @@
import html
from datetime import timedelta
from datetime import datetime, timedelta

import scrapy

@@ -36,6 +36,25 @@ def start_requests(self):
meta={"dont_cache": True, "ressort": ressort},
)

self._users = self.settings.get("FEEDS_SPIDER_DERSTANDARD_AT_USERS")
if self._users:
self._users = {user_id: None for user_id in self._users.split()}
for user_id in self._users.keys():
for page in range(3):
yield scrapy.Request(
(
"https://{}/userprofil/postings/{}?"
+ "pageNumber={}&sortMode=1"
).format(self.name, user_id, page),
self._parse_user_profile,
meta={
# Older pages should be cached longer.
"cache_expires": timedelta(hours=page),
"path": "userprofil/postings/{}".format(user_id),
"user_id": user_id,
},
)

def feed_headers(self):
for ressort in self._ressorts:
yield generate_feed_header(
@@ -48,6 +67,17 @@ def feed_headers(self):
path=ressort,
)

for user_id, name in self._users.items():
yield generate_feed_header(
title="derStandard.at › Postings von {}".format(name),
subtitle="Nachrichten in Echtzeit",
link="https://{}/userprofil/postings/{}".format(self.name, user_id),
icon="https://at.staticfiles.at/sites/mainweb/img/icons/dst/dst-16.ico",
logo="https://at.staticfiles.at/sites/mainweb/img/icons/dst/"
"dst-228.png",
path="userprofil/postings/{}".format(user_id),
)

def parse_node(self, response, node):
if response.meta["ressort"] not in self._titles:
self._titles[response.meta["ressort"]] = node.xpath(
@@ -161,3 +191,27 @@ def _parse_blog_article(self, response):
il = response.meta["il"]
il.add_value("content_html", response.text)
return il.load_item()

def _parse_user_profile(self, response):
self._users[response.meta["user_id"]] = (
response.css("#up_user h2::text").extract_first().strip()
)
for posting in response.css(".posting"):
il = FeedEntryItemLoader(
selector=posting,
base_url="https://{}".format(self.name),
change_tags={"span": "p"},
)
il.add_css("title", ".text strong::text")
il.add_css("link", '.text a::attr("href")')
il.add_value(
"updated",
datetime.utcfromtimestamp(
int(posting.css('.date::attr("data-timestamp")').extract_first())
/ 1000
),
)
il.add_css("content_html", ".text span")
il.add_css("content_html", ".article h4")
il.add_value("path", response.meta["path"])
yield il.load_item()
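
The posting date in ``_parse_user_profile`` comes from a ``data-timestamp`` attribute that holds milliseconds since the epoch, hence the division by 1000 before ``datetime.utcfromtimestamp``. A minimal sketch with a hypothetical attribute value:

from datetime import datetime

# Hypothetical value of .date::attr("data-timestamp"), in milliseconds.
raw = "1544712300000"
updated = datetime.utcfromtimestamp(int(raw) / 1000)
print(updated)  # 2018-12-13 14:45:00
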
55 changes: 31 additions & 24 deletions feeds/spiders/nachrichten_at.py
@@ -1,6 +1,8 @@
from collections import OrderedDict
from datetime import timedelta

import scrapy
from inline_requests import inline_requests

from feeds.loaders import FeedEntryItemLoader
from feeds.spiders import FeedsXMLFeedSpider
@@ -18,25 +20,19 @@ def start_requests(self):
self.logger.info("No ressorts given, falling back to general ressort!")
self._ressorts = ["nachrichten"]

username = self.settings.get("FEEDS_SPIDER_NACHRICHTEN_AT_USERNAME")
password = self.settings.get("FEEDS_SPIDER_NACHRICHTEN_AT_PASSWORD")
if username and password:
yield scrapy.FormRequest(
"https://www.{}/login/".format(self.name),
formdata=OrderedDict(
[
("user[control][login]", "true"),
("permanent", "checked"),
("username", username),
("password", password),
]
),
callback=self._after_login,
)
self._username = self.settings.get("FEEDS_SPIDER_NACHRICHTEN_AT_USERNAME")
self._password = self.settings.get("FEEDS_SPIDER_NACHRICHTEN_AT_PASSWORD")
if self._username and self._password:
yield from self._login(None)
else:
self.logger.info("Login failed: No username or password given")
# We can still try to scrape the free articles.
yield from self._after_login()
self.logger.info("Login failed: No username or password given")

for ressort in self._ressorts:
yield scrapy.Request(
"https://www.{}/storage/rss/rss/{}.xml".format(self.name, ressort),
meta={"ressort": ressort, "dont_cache": True},
)

def feed_headers(self):
for ressort in self._ressorts:
@@ -51,17 +47,28 @@ def feed_headers(self):
"touchicon_180x180.png".format(self.name),
)

def _after_login(self, response=None):
@inline_requests
def _login(self, response):
response = yield scrapy.Request(
"https://www.{}/login/".format(self.name),
meta={"cache_expires": timedelta(days=14)},
)
response = yield scrapy.FormRequest(
"https://www.{}/login/".format(self.name),
formdata=OrderedDict(
[
("user[control][login]", "true"),
("permanent", "checked"),
("username", self._username),
("password", self._password),
]
),
meta={"cache_expires": timedelta(days=14)},
)
if response and response.css(".notloggedin"):
# We tried to login but we failed.
self.logger.error("Login failed: Username or password wrong")

for ressort in self._ressorts:
yield scrapy.Request(
"https://www.{}/storage/rss/rss/{}.xml".format(self.name, ressort),
meta={"ressort": ressort, "dont_cache": True},
)

def parse_node(self, response, node):
url = node.xpath("link/text()").extract_first()
return scrapy.Request(
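
The login flow above is rewritten around scrapy-inline-requests: decorating a callback with ``@inline_requests`` lets it ``yield`` requests and receive the responses in place, so fetching the login page and submitting the ``FormRequest`` read as straight-line code instead of chained callbacks. A minimal sketch of the pattern (spider name, URL and form fields are illustrative, not the site's real ones):

import scrapy
from inline_requests import inline_requests

class LoginExampleSpider(scrapy.Spider):
    name = "login_example"
    start_urls = ["https://example.com/login/"]

    @inline_requests
    def parse(self, response):
        # Yielding a request suspends the callback; it resumes with the response.
        response = yield scrapy.FormRequest.from_response(
            response, formdata={"username": "user", "password": "pass"}
        )
        if response.css(".notloggedin"):
            self.logger.error("Login failed")
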
21 changes: 15 additions & 6 deletions feeds/spiders/tvthek_orf_at.py
@@ -77,12 +77,21 @@ def _parse_episode(self, response):
# We use the first embedded segment instead.
# This is also how mediathekviewweb.de works.
item["sources"] = item["_embedded"]["segments"][0]["sources"]
video = next(
s
for s in item["sources"]["progressive_download"]
if s["quality_key"] == "Q8C"
)
il.add_value("enclosure", {"iri": video["src"], "type": "video/mp4"})
try:
video = next(
s
for s in item["sources"]["progressive_download"]
if s["quality_key"] == "Q8C"
)
il.add_value("enclosure", {"iri": video["src"], "type": "video/mp4"})
except StopIteration:
self.logger.error("Could not extract video for '{}'!".format(item["title"]))
subtitle = item["_embedded"].get("subtitle")
if subtitle:
subtitle = subtitle["_embedded"]["srt_file"]["public_urls"]["reference"]
il.add_value("enclosure", {"iri": subtitle["url"], "type": "text/plain"})
else:
self.logger.debug("No subtitle file found for '{}'".format(item["url"]))
il.add_value(
"category",
self._categories_from_oewa_base_path(
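
``next()`` on a generator raises ``StopIteration`` when no element matches, which previously aborted the whole episode; the change above catches the exception and logs an error instead. A minimal sketch with hypothetical data:

# Hypothetical sources list; no Q8C entry present.
sources = [{"quality_key": "Q4A", "src": "low.mp4"}]
try:
    video = next(s for s in sources if s["quality_key"] == "Q8C")
except StopIteration:
    video = None  # log and carry on instead of dropping the item
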
