Commit
Merge pull request #223 from Lukas0907/next
Kurier/Profil fixes
Lukas0907 committed Jul 25, 2020
2 parents 7abfb6c + e4b31e4 commit 0d1cec6
Showing 4 changed files with 97 additions and 139 deletions.
feeds/loaders.py (2 changes: 1 addition & 1 deletion)
@@ -11,10 +11,10 @@
 import lxml
 from dateutil.parser import parse as dateutil_parse
 from dateutil.tz import gettz
+from itemloaders.processors import Compose, Identity, Join, MapCompose, TakeFirst
 from lxml.cssselect import CSSSelector
 from lxml.html.clean import Cleaner
 from scrapy.loader import ItemLoader
-from scrapy.loader.processors import Compose, Identity, Join, MapCompose, TakeFirst
 from w3lib.html import remove_tags
 
 from feeds.items import FeedEntryItem, FeedItem
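
Note: Compose, Identity, Join, MapCompose and TakeFirst now live in the standalone itemloaders package (split out of Scrapy); the new import is a drop-in replacement. A minimal sketch of the unchanged behaviour, not part of this commit:

    from itemloaders.processors import MapCompose, TakeFirst

    strip = MapCompose(str.strip)  # applies str.strip to every extracted value
    first = TakeFirst()            # returns the first non-empty value
    assert first(strip(["  Kurier  ", "Profil"])) == "Kurier"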
feeds/spiders/kurier_at.py (168 changes: 87 additions & 81 deletions)
@@ -8,6 +8,82 @@
 from feeds.utils import generate_feed_header
 
 
+def parse_article(response):
+    article = json.loads(response.text)["layout"]["center"][0]
+    il = FeedEntryItemLoader()
+    il.add_value(
+        "link", urljoin("https://{}".format(article["portal"]), article["url"])
+    )
+    il.add_value("title", article["title"])
+    if "teaser_img" in article:
+        il.add_value(
+            "content_html",
+            _create_figure(
+                article["portal"],
+                article["teaser_img"]["url"],
+                article["teaser_img"].get("description"),
+            ),
+        )
+    il.add_value(
+        "content_html", "<p><strong>{}</strong></p>".format(article["teaser_text"])
+    )
+    for paragraph in article["paragraphs"]:
+        if paragraph["type"] == "text":
+            il.add_value("content_html", paragraph["data"]["html"])
+        elif paragraph["type"] == "youtube":
+            url = "https://www.youtube.com/watch?v={}".format(
+                paragraph["data"]["videoid"]
+            )
+            il.add_value(
+                "content_html", '<div><a href="{url}">{url}</a></div>'.format(url=url),
+            )
+        elif paragraph["type"] == "image":
+            il.add_value(
+                "content_html",
+                _create_figure(
+                    article["portal"],
+                    paragraph["data"]["url"].replace("large", "original"),
+                    paragraph["data"].get("description"),
+                ),
+            )
+        elif paragraph["type"] == "gallery":
+            # Only include 1 image (the latest) if the feed type is article.
+            # This is a special case for comic articles where a new image is
+            # added to the article once a day and it doesn't make sense to always
+            # include all the old ones in the feed.
+            max_images = 1 if response.meta["feed_type"] == "article" else None
+            for image in paragraph["data"]["images"][:max_images]:
+                il.add_value(
+                    "content_html",
+                    _create_figure(
+                        article["portal"],
+                        image["url"].replace("large", "original"),
+                        image.get("description"),
+                    ),
+                )
+    il.add_value("updated", article["updated_date"])
+    for author in article["authors"]:
+        il.add_value("author_name", "{firstname} {lastname}".format(**author))
+    if not article["authors"]:
+        il.add_value("author_name", article["agency"])
+    il.add_value("category", article["channel"]["name"])
+    il.add_value("category", article["portal"])
+    if "path" in response.meta:
+        il.add_value("path", response.meta["path"])
+    if article["sponsored"]:
+        il.add_value("category", "sponsored")
+    il.add_value("category", article.get("pretitle"))
+    return il.load_item()
+
+
+def _create_figure(name, src, caption=None):
+    src = urljoin("https://image.{}".format(name), src)
+    return (
+        '<figure><div><img src="{src}"></div>'
+        + "<figcaption>{caption}</figcaption></figure>"
+    ).format(src=src, caption=caption or "")
+
+
 class KurierAtSpider(FeedsSpider):
     name = "kurier.at"
 
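Note: parse_article and _create_figure are now module-level functions rather than spider methods, which lets other spiders reuse them (profil.at does, further down). A rough illustration of the markup _create_figure produces, using a made-up image path:

    _create_figure("kurier.at", "/images/comic.jpg", "A caption")
    # '<figure><div><img src="https://image.kurier.at/images/comic.jpg"></div>
    #  <figcaption>A caption</figcaption></figure>'
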
@@ -52,7 +128,7 @@ def start_requests(self):
"https://efs.kurier.at/api/v1/cfs/route?uri=/kurierat{}".format(
article
),
self._parse_article,
parse_article,
meta={"path": article, "dont_cache": True, "feed_type": "article"},
)

@@ -85,87 +161,17 @@ def _parse_channel(self, response):
     def _parse_collection(self, response):
         articles = json.loads(response.text)["items"]
         for article in articles:
-            yield scrapy.Request(
-                "https://efs.kurier.at/api/v1/cfs/route?uri=/{}{}".format(
-                    article["portal"].replace(".", ""), article["url"]
-                ),
-                self._parse_article,
-                meta={
-                    "path": response.meta["path"],
-                    "feed_type": response.meta["feed_type"],
-                },
-            )
-
-    def _create_figure(self, src, caption=None):
-        src = urljoin("https://image.{}".format(self.name), src)
-        return (
-            '<figure><div><img src="{src}"></div>'
-            + "<figcaption>{caption}</figcaption></figure>"
-        ).format(src=src, caption=caption or "")
-
-    def _parse_article(self, response):
-        article = json.loads(response.text)["layout"]["center"][0]
-        il = FeedEntryItemLoader()
-        il.add_value(
-            "link", urljoin("https://{}".format(article["portal"]), article["url"])
-        )
-        il.add_value("title", article["title"])
-        if "teaser_img" in article:
-            il.add_value(
-                "content_html",
-                self._create_figure(
-                    article["teaser_img"]["url"],
-                    article["teaser_img"].get("description"),
-                ),
-            )
-        il.add_value(
-            "content_html", "<p><strong>{}</strong></p>".format(article["teaser_text"])
-        )
-        for paragraph in article["paragraphs"]:
-            if paragraph["type"] == "text":
-                il.add_value("content_html", paragraph["data"]["html"])
-            elif paragraph["type"] == "youtube":
-                url = "https://www.youtube.com/watch?v={}".format(
-                    paragraph["data"]["videoid"]
-                )
-                il.add_value(
-                    "content_html",
-                    '<div><a href="{url}">{url}</a></div>'.format(url=url),
-                )
-            elif paragraph["type"] == "image":
-                il.add_value(
-                    "content_html",
-                    self._create_figure(
-                        paragraph["data"]["url"].replace("large", "original"),
-                        paragraph["data"].get("description"),
-                    ),
-                )
-            elif paragraph["type"] == "gallery":
-                # Only include 1 image (the latest) if the feed type is article.
-                # This is a special case for comic articles where a new image is
-                # added to the article once a day and it doesn't make sense to always
-                # include all the old ones in the feed.
-                max_images = 1 if response.meta["feed_type"] == "article" else None
-                for image in paragraph["data"]["images"][:max_images]:
-                    il.add_value(
-                        "content_html",
-                        self._create_figure(
-                            image["url"].replace("large", "original"),
-                            image.get("description"),
-                        ),
-                    )
-        il.add_value("updated", article["updated_date"])
-        for author in article["authors"]:
-            il.add_value("author_name", "{firstname} {lastname}".format(**author))
-        if not article["authors"]:
-            il.add_value("author_name", article["agency"])
-        il.add_value("category", article["channel"]["name"])
-        il.add_value("category", article["portal"])
-        il.add_value("path", response.meta["path"])
-        if article["sponsored"]:
-            il.add_value("category", "sponsored")
-        il.add_value("category", article.get("pretitle"))
-        return il.load_item()
+            if article["type"] != "empty":
+                yield scrapy.Request(
+                    "https://efs.kurier.at/api/v1/cfs/route?uri=/{}{}".format(
+                        article["portal"].replace(".", ""), article["url"]
+                    ),
+                    parse_article,
+                    meta={
+                        "path": response.meta["path"],
+                        "feed_type": response.meta["feed_type"],
+                    },
+                )
 
     def _parse_author(self, response):
         query = json.loads(response.text)["layout"]["center"][0]["query"]
@@ -188,7 +194,7 @@ def _parse_search(self, response):
"https://efs.kurier.at/api/v1/cfs/route?uri=/{}{}".format(
article["portal"].replace(".", ""), article["url"]
),
self._parse_article,
parse_article,
meta={
"path": response.meta["path"],
"feed_type": response.meta["feed_type"],
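For reference, parse_article consumes the JSON returned by the efs.kurier.at CFS route endpoint. A sketch of the expected shape, inferred from the parser above (field names taken from the code, values illustrative):

    response_json = {
        "layout": {
            "center": [{
                "portal": "kurier.at",
                "url": "/chronik/ein-artikel/400000000",  # hypothetical path
                "title": "...",
                "teaser_text": "...",
                "paragraphs": [
                    {"type": "text", "data": {"html": "<p>...</p>"}},
                    {"type": "image", "data": {"url": "/images/large/x.jpg"}},
                ],
                "updated_date": "2020-07-25T12:00:00",
                "authors": [],
                "agency": "APA",
                "channel": {"name": "Chronik"},
                "sponsored": False,
            }]
        }
    }
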
feeds/spiders/profil_at.py (63 changes: 7 additions & 56 deletions)
@@ -1,69 +1,20 @@
-from datetime import datetime, timedelta
-
 import scrapy
-from dateutil.tz import gettz
 
-from feeds.loaders import FeedEntryItemLoader
-from feeds.spiders import FeedsXMLFeedSpider
+from feeds.spiders import FeedsXMLFeedSpider, kurier_at
 
 
 class ProfilAtSpider(FeedsXMLFeedSpider):
     name = "profil.at"
-    namespaces = [
-        ("i", "http://www.google.com/schemas/sitemap-image/1.1"),
-        ("rss", "http://www.sitemaps.org/schemas/sitemap/0.9"),
-    ]
-    itertag = "rss:url"
+    itertag = "item/link/text()"
     iterator = "xml"
+    start_urls = ["https://www.profil.at/xml/rss"]
 
     feed_title = "PROFIL"
     feed_subtitle = "Österreichs unabhängiges Nachrichtenmagazin"
 
-    def start_requests(self):
-        # Scrape this and last month so that the feed is not empty on the first day of a
-        # new month.
-        this_month = datetime.now(gettz("Europe/Vienna")).date().replace(day=1)
-        last_month = (this_month - timedelta(days=1)).replace(day=1)
-        for month in [this_month, last_month]:
-            yield scrapy.Request(
-                "https://www.{}/sitemap-articles-{}.xml".format(
-                    self.name, month.strftime("%Y-%m")
-                ),
-                meta={"dont_cache": True, "handle_httpstatus_list": [404]},
-            )
-
     def parse_node(self, response, node):
-        url = node.xpath("rss:loc/text()").extract_first()
-        updated = node.xpath("rss:lastmod/text()").extract_first()
-        return scrapy.Request(url, self.parse_item, meta={"updated": updated})
-
-    def parse_item(self, response):
-        remove_elems = [
-            "aside",
-            "script",
-            "h1",
-            "source",
-            ".breadcrumbs",
-            ".author-date",
-            ".artikel-social-kommentar",
-            ".bild-copyright",
-            ".ressortTitleMobile",
-            ".article-number",
-            ".artikel-kommentarlink",
-            ".umfrage-wrapper",
-            ".articleIssueInfo",
-        ]
-        il = FeedEntryItemLoader(
-            response=response,
-            base_url="https://{}".format(self.name),
-            remove_elems=remove_elems,
-        )
-        il.add_value("link", response.url)
-        author_name = (
-            response.css(".author-date ::text").re(r"(?:Von)?\s*(\w+ \w+)") or "Red."
+        path = node.extract().replace("https://profil.at/", "")
+        url = "https://efs.profil.at/api/v1/cfs/route?uri=/profilat/" + path
+        return scrapy.Request(
+            url, kurier_at.parse_article, meta={"feed_type": "article"}
         )
-        il.add_value("author_name", author_name)
-        il.add_css("title", 'h1[itemprop="headline"]::text')
-        il.add_value("updated", response.meta["updated"])
-        il.add_css("content_html", "article")
-        return il.load_item()
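
Note: profil.at articles are now fetched through the same efs backend API as kurier.at, and parsing is delegated to the shared kurier_at.parse_article instead of scraping the HTML pages. The URL mapping in parse_node, traced with a hypothetical feed link:

    link = "https://profil.at/oesterreich/ein-artikel/400000001"
    path = link.replace("https://profil.at/", "")
    url = "https://efs.profil.at/api/v1/cfs/route?uri=/profilat/" + path
    # -> "https://efs.profil.at/api/v1/cfs/route?uri=/profilat/oesterreich/ein-artikel/400000001"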
setup.py (3 changes: 2 additions & 1 deletion)
@@ -16,7 +16,7 @@
     include_package_data=True,
     install_requires=[
         "Click>=6.6",
-        "Scrapy>=1.6",
+        "Scrapy>=2.2",
         "bleach>=1.4.3",
         "dateparser>=0.5.1",
         "feedparser",
@@ -25,6 +25,7 @@
"pyxdg>=0.26",
"readability-lxml>=0.7",
"scrapy-inline-requests",
"itemloaders", # explicit dependency of Scrapy > 2.2.1
],
extras_require={
"docs": ["sphinx", "sphinx_rtd_theme"],