Merge pull request #182 from Lukas0907/next
Next
Lukas0907 committed Oct 17, 2018
2 parents 403a116 + fbb4019 commit 7c966e8
Showing 8 changed files with 73 additions and 38 deletions.
7 changes: 6 additions & 1 deletion docs/spiders/orf.at.rst
@@ -15,7 +15,10 @@ Add ``orf.at`` to the list of spiders:
orf.at
orf.at supports different channels via the ``channels`` parameter (one per
line). If no channel is given, ``news`` is used.
line). If no channel is given, ``news`` is used. It is also possible to give
a list of authors for which feeds will then be generated. Note that the
ressort in which the author writes still has to be included in the ``ressorts``
parameter.

.. code-block:: ini
@@ -38,3 +41,5 @@ line). If no channel is given, ``news`` is used.
tirol
vorarlberg
wien
authors =
Erich Moechel
2 changes: 2 additions & 0 deletions feeds.cfg.dist
@@ -112,6 +112,8 @@ useragent = feeds (+https://github.com/nblock/feeds)
# vorarlberg
# tirol
# religion
#authors =
# Erich Moechel

#[derstandard.at]
#ressorts =
20 changes: 14 additions & 6 deletions feeds/exporters.py
@@ -8,6 +8,8 @@

from feeds.items import FeedEntryItem, FeedItem

logger = logging.getLogger(__name__)


class AtomExporter(BaseItemExporter):
class AtomFeed(object):
@@ -19,6 +21,7 @@ def __init__(self, exporter, link_self=None):
self._xml = etree.Element(
"feed", nsmap={None: "http://www.w3.org/2005/Atom"}
)
self._ids = set()

def add_item(self, item):
if isinstance(item, FeedItem):
@@ -27,10 +30,16 @@ def add_item(self, item):
for child in self._convert_feed_item(item):
self._xml.insert(0, child)
elif isinstance(item, FeedEntryItem):
entry = etree.Element("entry")
for child in self._convert_feed_item(item):
entry.append(child)
self._feed_items.append(entry)
if item["id"] not in self._ids:
self._ids.add(item["id"])
entry = etree.Element("entry")
for child in self._convert_feed_item(item):
entry.append(child)
self._feed_items.append(entry)
else:
logger.debug(
"Feed entry with id '{}' already in feed.".format(item["id"])
)

def insert_updated(self):
child = etree.Element("updated")
@@ -165,14 +174,13 @@ def __init__(self, output_path, output_url, name, **kwargs):
self._name = name
self._feeds = {}
self._pretty_print = kwargs.pop("pretty_print", True)
self._logger = logging.getLogger(__name__)

def finish_exporting(self):
for path, feed in self._feeds.items():
path = os.path.join(self._output_path, path)
os.makedirs(os.path.dirname(path), exist_ok=True)
if len(feed) == 0:
self._logger.warning("Feed {} contains no items!".format(path))
logger.warning("Feed '{}' contains no items!".format(path))
try:
os.remove(path)
except OSError:
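
The add_item change above keeps a per-feed set of entry ids so that the same entry is serialized into an Atom feed only once. A standalone sketch of that dedup pattern (function name and sample data are illustrative, not from the commit):

import logging

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)


def unique_entries(items):
    """Yield each feed entry only once, keyed by its 'id' field."""
    seen_ids = set()
    for item in items:
        if item["id"] in seen_ids:
            logger.debug("Feed entry with id '{}' already in feed.".format(item["id"]))
            continue
        seen_ids.add(item["id"])
        yield item


entries = [{"id": "urn:1", "title": "First"}, {"id": "urn:1", "title": "Duplicate"}]
print(list(unique_entries(entries)))  # only the first entry with id 'urn:1' survives
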
10 changes: 9 additions & 1 deletion feeds/loaders.py
@@ -248,6 +248,14 @@ def skip_empty_tree(tree):
return None


def skip_none(value):
"""Skip values that are None immediately."""
if value is not None:
return value

return None


def skip_false(value):
"""
Skip values that evaluate to False.
@@ -323,7 +331,7 @@ class BaseItemLoader(ItemLoader):
# Defaults
# Unescape twice to get rid of &&xxx; encoding errors.
default_input_processor = MapCompose(
str.strip, skip_false, html.unescape, html.unescape
skip_none, str.strip, skip_false, html.unescape, html.unescape
)
default_output_processor = TakeFirst()

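
The new skip_none processor sits first in the default_input_processor chain because MapCompose drops a value as soon as a processor returns None, so a None value never reaches str.strip (which would raise a TypeError). A small sketch of that behavior, assuming Scrapy's MapCompose (the sample values are made up):

from scrapy.loader.processors import MapCompose


def skip_none(value):
    """Returning None from a MapCompose processor drops the value."""
    if value is not None:
        return value
    return None


with_skip = MapCompose(skip_none, str.strip)
print(with_skip(["  text  ", None]))  # ['text'] -- the None value is silently dropped

# MapCompose(str.strip) alone would raise a TypeError for the same input,
# because str.strip would be called with None.
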
3 changes: 3 additions & 0 deletions feeds/spiders/biblioweb_at.py
@@ -6,6 +6,7 @@

class BibliowebAtSpider(FeedsSpider):
name = "biblioweb.at"
custom_settings = {"DUPEFILTER_CLASS": "scrapy.dupefilters.RFPDupeFilter"}

_days = 60

@@ -44,6 +45,8 @@ def parse(self, response):

def parse_overview_page(self, response):
# Find other pages
# Note that the dupefilter has to be enabled, otherwise already
# parsed pages will be parsed again.
for href in response.xpath(
'//div[@id="p_main"][1]/div/a/div[@id!="p_aktuell"]/../@href'
):
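
The DUPEFILTER_CLASS entry in custom_settings overrides the project-wide value for this spider only and switches it to Scrapy's standard RFPDupeFilter; as the new comment in parse_overview_page notes, already parsed overview pages would otherwise be parsed again. A minimal sketch of the per-spider override pattern (spider name and URL are placeholders):

import scrapy


class ExampleSpider(scrapy.Spider):
    # Hypothetical spider; it only demonstrates overriding a project-wide
    # setting for a single spider via custom_settings.
    name = "example"
    start_urls = ["https://example.com/"]
    custom_settings = {"DUPEFILTER_CLASS": "scrapy.dupefilters.RFPDupeFilter"}

    def parse(self, response):
        # With the dupefilter enabled, requests to already-seen URLs are
        # silently discarded, so each linked page is parsed at most once.
        for href in response.css("a::attr(href)").extract():
            yield response.follow(href, callback=self.parse)
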
21 changes: 17 additions & 4 deletions feeds/spiders/orf_at.py
@@ -65,6 +65,14 @@ def start_requests(self):

self._channels = channels

self._authors = [
author
for author in (
self.settings.get("FEEDS_SPIDER_ORF_AT_AUTHORS", "").split("\n")
)
if author
]

def feed_headers(self):
for channel in self._channels:
channel_url = "{}.ORF.at".format(channel)
@@ -75,6 +83,9 @@ def feed_headers(self):
logo=self._get_logo(channel),
)

for author in self._authors:
yield generate_feed_header(title="ORF.at: {}".format(author), path=author)

def parse_node(self, response, node):
categories = [
node.xpath("orfon:storyType/@rdf:resource").re_first("urn:orfon:type:(.*)"),
@@ -197,13 +208,15 @@ def _parse_article(self, response):
# other
updated = response.meta["updated"]
il.add_value("updated", updated)
il.add_css("title", "title::text", re="(.*) - .*")
il.add_css("title", "title::text", re=re.compile(r"(.*) - .*", flags=re.S))
il.add_value("link", response.url)
il.add_css("content_html", ".opener img") # FM4, news
il.add_css("content_html", ".story-lead-text") # news
il.add_css("content_html", "#ss-storyText")
il.add_css("content_html", "#ss-storyContent") # news
il.add_value("author_name", author)
if author in self._authors:
il.add_value("path", author)
il.add_value("path", response.meta["path"])
il.add_value("category", response.meta["categories"])
yield il.load_item()
@@ -242,11 +255,11 @@ def _extract_author(response):
)
author_selector = "#ss-storyText > .socialButtons + p"
if author:
return (author, author_selector)
return (author.strip(), author_selector)
elif domain == "orf.at":
author = response.css(".byline ::text").extract_first()
if author:
return (re.split(r"[/,]", author)[0], ".byline")
return (re.split(r"[/,]", author)[0].strip(), ".byline")
elif domain in ["science.orf.at", "help.orf.at", "religion.orf.at"]:
try:
author = (
@@ -259,7 +272,7 @@
# Only take the author name before ",".
author = re.split(r"[/,]", author)[0]
return (
author,
author.strip(),
(
"#ss-storyText > p:not(.date):not(.toplink):"
+ "contains('{}')"
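
Two details in the change above are easy to miss: the title regex is now compiled with re.S so that a title spanning multiple lines is captured as a whole, and the per-author feeds are driven by a newline-separated setting with empty entries filtered out. A short sketch of both behaviors (sample strings are made up):

import re

# Without re.S, '.' does not match newlines, so only the part of the title
# after the line break is captured; with re.S the full title is kept.
title = "Some headline\nthat wraps - news.ORF.at"
print(re.search(r"(.*) - .*", title).group(1))              # 'that wraps'
print(re.search(r"(.*) - .*", title, flags=re.S).group(1))  # 'Some headline\nthat wraps'

# Parsing a newline-separated setting into a list, skipping blank lines.
raw_setting = "Erich Moechel\n\n"
authors = [author for author in raw_setting.split("\n") if author]
print(authors)  # ['Erich Moechel']
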
11 changes: 5 additions & 6 deletions feeds/spiders/tvthek_orf_at.py
@@ -42,6 +42,11 @@ def parse(self, response):
)

for item in json_response["_embedded"]["items"]:
# Skip incomplete items or items with active youth protection.
# We want to have working download links in the feed item.
if not item["segments_complete"] or item["has_active_youth_protection"]:
continue

# We scrape the episode itself so we can get the segments which are not
# embedded in the schedule response.
# Furthermore since this request will be cached, the download URL will also
@@ -78,12 +83,6 @@ def _parse_episode(self, response):
if s["quality_key"] == "Q8C"
)
il.add_value("enclosure", {"iri": video["src"], "type": "video/mp4"})
subtitle = item["_embedded"].get("subtitle")
if subtitle:
subtitle = subtitle["_embedded"]["srt_file"]["public_urls"]["reference"]
il.add_value("enclosure", {"iri": subtitle["url"], "type": "text/plain"})
else:
self.logger.debug("No subtitle file found for '{}'".format(item["url"]))
il.add_value(
"category",
self._categories_from_oewa_base_path(
37 changes: 17 additions & 20 deletions feeds/spiders/uebermedien_de.py
@@ -5,6 +5,7 @@

import scrapy
from scrapy.http import FormRequest
from inline_requests import inline_requests

from feeds.loaders import FeedEntryItemLoader
from feeds.spiders import FeedsXMLFeedSpider
@@ -23,31 +24,32 @@ def start_requests(self):
self._username = self.settings.get("FEEDS_SPIDER_UEBERMEDIEN_DE_USERNAME")
self._password = self.settings.get("FEEDS_SPIDER_UEBERMEDIEN_DE_PASSWORD")
if self._username and self._password:
yield scrapy.Request(
"https://steadyhq.com/en/oauth/authorize?"
+ "client_id=0c29f006-1a98-48f1-8a63-2c0652c59f28&"
+ "redirect_uri=https://uebermedien.de&scope=read&"
+ "response_type=code&refresh_only=false",
callback=self._steady_login,
meta={"cache_expires": timedelta(days=1)},
)
yield from self._steady_login(None)
else:
self.logger.info("Login failed: No username or password given")
# We can still try to scrape the free articles.
yield from super().start_requests()
self.logger.info("Login failed: No username or password given")

yield from super().start_requests()

@inline_requests
def _steady_login(self, response):
return FormRequest.from_response(
response = yield scrapy.Request(
"https://steadyhq.com/oauth/authorize?"
+ "client_id=0c29f006-1a98-48f1-8a63-2c0652c59f28&"
+ "redirect_uri=https://uebermedien.de&scope=read&"
+ "response_type=code&refresh_only=false",
meta={"cache_expires": timedelta(days=1)},
)

response = yield FormRequest.from_response(
response,
formdata=OrderedDict(
[("user[email]", self._username), ("user[password]", self._password)]
),
callback=self._request_steady_token,
dont_filter=True,
meta={"handle_httpstatus_list": [301], "cache_expires": timedelta(days=1)},
)

def _request_steady_token(self, response):
try:
code = parse_qs(urlparse(response.url).query)["code"][0]
except KeyError:
@@ -62,18 +64,14 @@ def _request_steady_token(self, response):
("redirect_uri", "https://uebermedien.de"),
]
)
return scrapy.Request(
response = yield scrapy.Request(
"https://steadyhq.com/api/v1/oauth/token",
method="POST",
body=json.dumps(body),
headers={"Accept": "application/json", "Content-Type": "application/json"},
callback=self._set_steady_token,
meta={"cache_expires": timedelta(days=1)},
)

def _set_steady_token(self, response):
self._steady_token = json.loads(response.text)["access_token"]
return super().start_requests()

def parse_node(self, response, node):
il = FeedEntryItemLoader(
Expand All @@ -84,8 +82,7 @@ def parse_node(self, response, node):
il.add_value("category", node.xpath("//category/text()").extract())
title = node.xpath("(//title)[2]/text()").extract()
if not title:
# Fallback to the first category if no title is provided
# (e.g. comic).
# Fallback to the first category if no title is provided (e.g. comic).
title = node.xpath("//category/text()").extract_first()
il.add_value("title", title)
link = node.xpath("(//link)[2]/text()").extract_first()
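
The refactoring above collapses the callback chain (_steady_login, _request_steady_token, _set_steady_token) into a single generator decorated with @inline_requests, where yielding a request hands its response straight back to the method. A minimal sketch of that pattern (hypothetical spider, URLs, and form fields; not part of this commit):

import scrapy
from inline_requests import inline_requests


class ExampleSpider(scrapy.Spider):
    # Hypothetical spider; it only illustrates the inline-requests style used
    # in the login refactoring above.
    name = "example"
    start_urls = ["https://example.com/login"]

    @inline_requests
    def parse(self, response):
        # Yielding a Request without a callback suspends this generator until
        # the response arrives, so the whole flow reads top to bottom instead
        # of being split across separate callback methods.
        login_page = yield scrapy.Request("https://example.com/oauth/authorize")
        token_response = yield scrapy.FormRequest.from_response(
            login_page, formdata={"user": "name", "password": "secret"}
        )
        # Plain items (or requests with an explicit callback) pass through to
        # Scrapy unchanged.
        yield {"token_url": token_response.url}
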
