Skip to content

Commit

Permalink
Merge pull request #162 from Lukas0907/fixes
Browse files Browse the repository at this point in the history
Fixes
  • Loading branch information
Lukas0907 committed Sep 4, 2018
2 parents eb2edcb + 5d2e5e3 commit 891bee3
Show file tree
Hide file tree
Showing 6 changed files with 41 additions and 7 deletions.
8 changes: 8 additions & 0 deletions feeds/loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,14 @@ def cleanup_html(tree, loader_context):
for elem in tree.xpath(elem_sel):
elem.drop_tree()

# Change attrib names.
for elem_sel, attribs in loader_context.get("change_attribs", {}).items():
selector = CSSSelector(elem_sel)
for elem in selector(tree):
for attrib in elem.attrib.keys():
if attrib in attribs:
elem.attrib[attribs[attrib]] = elem.attrib.pop(attrib)

# Change tag names.
for elem_sel, elem_tag in loader_context.get("change_tags", {}).items():
selector = CSSSelector(elem_sel)
Expand Down
21 changes: 18 additions & 3 deletions feeds/spiders/derstandard_at.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,11 @@
class DerStandardAtSpider(FeedsXMLFeedSpider):
name = "derstandard.at"
allowed_domains = [name]
custom_settings = {"COOKIES_ENABLED": False}
custom_settings = {
"COOKIES_ENABLED": False,
        # Don't filter duplicates. Filtering would introduce a race condition.
"DUPEFILTER_CLASS": "scrapy.dupefilters.BaseDupeFilter",
}

_title = "derStandard.at"
_subtitle = "Nachrichten in Echtzeit"
Expand Down Expand Up @@ -81,7 +85,13 @@ def _parse_article(self, response):
".continue",
".sequence-number",
]
change_tags = {"#media-list li": "div", "#media-list": "div"}
change_tags = {
"#media-list li .description": "figcaption",
"#media-list li": "figure",
"#media-list": "div",
".photo": "figure",
".caption": "figcaption",
}
replace_regex = {
# data-zoom-src is only valid if it starts with //images.derstandard.at.
r'<img[^>]+data-zoom-src="(//images.derstandard.at/[^"]+)"': (
Expand All @@ -104,7 +114,12 @@ def _parse_article(self, response):
)
il.add_value("link", response.url)
il.add_css("title", 'meta[property="og:title"]::attr(content)')
il.add_css("author_name", "span.author::text")
for author in response.css("span.author::text").extract():
# Sometimes the author name is messed up and written in upper case.
# This happens usually for articles written by Günter Traxler.
if author.upper() == author:
author = author.title()
il.add_value("author_name", author)
il.add_value("path", response.meta["ressort"])
il.add_value("updated", response.meta["updated"])
il.add_css("category", "#breadcrumb .item a::text")
Expand Down
2 changes: 1 addition & 1 deletion feeds/spiders/falter_at.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def start_requests(self):
},
meta={
"dont_redirect": True,
"cache_expires": timedelta(days=1),
"cache_expires": timedelta(hours=3),
"handle_httpstatus_list": [302],
},
callback=self.request_archive,
Expand Down
1 change: 1 addition & 0 deletions feeds/spiders/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

# Readability's output is not interesting enough to justify log level "INFO".
import readability.readability

readability.readability.log.info = readability.readability.log.debug


Expand Down
14 changes: 12 additions & 2 deletions feeds/spiders/orf_at.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,8 @@ def _parse_article(self, response):
try:
# Heuristic for news.ORF.at to detect teaser articles.
more = response.css(
".shortnews p > strong:contains('Mehr') + a::attr(href)"
".story-story p > strong:contains('Mehr') + a::attr(href), "
+ ".story-story p > a:contains('Lesen Sie mehr')::attr(href)"
).extract_first()
if more:
self.logger.debug(
Expand All @@ -132,6 +133,11 @@ def _parse_article(self, response):
".slideshow",
"script",
".oon-youtube-logo",
# redesign
"#more-to-read-anchor",
".social-buttons",
".story-horizontal-ad",
".linkcard",
]
pullup_elems = {
".remote .instagram": 1,
Expand All @@ -144,6 +150,7 @@ def _parse_article(self, response):
".remote": "<p><em>Hinweis: Der eingebettete Inhalt ist nur im Artikel "
+ "verfügbar.</em></p>"
}
change_attribs = {"img": {"data-src": "src"}}
author, author_selector = self._extract_author(response)
if author:
self.logger.debug("Extracted possible author '{}'".format(author))
Expand All @@ -157,6 +164,7 @@ def _parse_article(self, response):
remove_elems=remove_elems,
pullup_elems=pullup_elems,
replace_elems=replace_elems,
change_attribs=change_attribs,
)
# news.ORF.at
data = response.css('script[type="application/ld+json"]::text').extract_first()
Expand All @@ -169,8 +177,10 @@ def _parse_article(self, response):
il.add_value("updated", updated)
il.add_css("title", "title::text", re="(.*) - .*")
il.add_value("link", response.url)
il.add_css("content_html", ".opener img") # fm4.ORF.at
il.add_css("content_html", ".opener img") # FM4, news
il.add_css("content_html", ".story-lead-text") # news
il.add_css("content_html", "#ss-storyText")
il.add_css("content_html", "#ss-storyContent") # news
il.add_value("author_name", author)
il.add_value("path", response.meta["path"])
il.add_value("category", response.meta["categories"])
Expand Down
2 changes: 1 addition & 1 deletion feeds/spiders/wienerlinien_at.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def parse(self, response):
response=response,
timezone=self._timezone,
ignoretz=True,
base_url="https://{}".format(self.name),
base_url="https://www.{}".format(self.name),
)
link = response.urljoin(item.css("a::attr(href)").extract_first())
il.add_value("link", link)
Expand Down

0 comments on commit 891bee3

Please sign in to comment.