Merge pull request #185 from Lukas0907/next
Update TU Wien, diepresse, derstandard, add delinski
Lukas0907 committed Oct 21, 2018
2 parents 7c966e8 + b28a127 commit 037ce53
Showing 7 changed files with 125 additions and 34 deletions.
1 change: 1 addition & 0 deletions docs/development.rst
@@ -132,6 +132,7 @@ scraping from there.
 * :ref:`spider_atv.at`
 * :ref:`spider_biblioweb.at`
 * :ref:`spider_cbird.at`
+* :ref:`spider_delinski.at`
 * :ref:`spider_help.gv.at`
 * :ref:`spider_indiehackers.com`
 * :ref:`spider_openwrt.org`
15 changes: 15 additions & 0 deletions docs/spiders/delinski.at.rst
@@ -0,0 +1,15 @@
+.. _spider_delinski.at:
+
+delinski.at
+-----------
+Newest restaurants in Wien bookable at `Delinski <https://delinski.at>`_.
+
+Configuration
+~~~~~~~~~~~~~
+Add ``delinski.at`` to the list of spiders:
+
+.. code-block:: ini
+
+   # List of spiders to run by default, one per line.
+   spiders =
+     delinski.at
60 changes: 60 additions & 0 deletions feeds/spiders/delinski_at.py
@@ -0,0 +1,60 @@
+import json
+import re
+from datetime import datetime, timedelta
+
+import scrapy
+
+from feeds.loaders import FeedEntryItemLoader
+from feeds.spiders import FeedsSpider
+
+
+class DelinskiAtSpider(FeedsSpider):
+    name = "delinski.at"
+
+    feed_title = "Delinski"
+    feed_link = "https://{}".format(name)
+    feed_logo = "https://{}/favicon.ico".format(name)
+
+    def start_requests(self):
+        yield scrapy.Request(
+            "https://www.delinski.at/wien/restaurants",
+            # The restaurants page is not cached and takes a few seconds to load.
+            # Don't query more than once a day.
+            meta={"cache_expires": timedelta(days=1)},
+        )
+
+    def parse(self, response):
+        m = re.search("window.DELINSKI, {listViewEntities: (.*)}", response.text)
+        restaurants = sorted(
+            json.loads(m.group(1))["restaurants"]["entities"].values(),
+            key=lambda r: int(r["created"]),
+            reverse=True,
+        )
+        for restaurant in restaurants[:20]:
+            il = FeedEntryItemLoader(timezone="Europe/Vienna", base_url=response.url)
+            url = response.urljoin(restaurant["url"])
+            il.add_value("link", url)
+            il.add_value("title", restaurant["name"])
+            content = """
+            <img src="{image}">
+            <ul>
+                <li>{address}</li>
+                <li>{price_range_human}</li>
+                <li>{cuisine_text}</li>
+            </ul>
+            """
+            il.add_value("content_html", content.format(**restaurant))
+            il.add_value("updated", datetime.fromtimestamp(int(restaurant["created"])))
+            yield scrapy.Request(url, self._parse_restaurant, meta={"il": il})
+
+    def _parse_restaurant(self, response):
+        il = FeedEntryItemLoader(
+            response=response,
+            base_url=response.url,
+            parent=response.meta["il"],
+            remove_elems=[".external"],
+        )
+        il.add_css("content_html", ".content .right p")
+        il.add_css("content_html", ".restaurant-link")
+        il.add_css("category", ".tags a ::text")
+        yield il.load_item()
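
The listing page embeds its data as a JavaScript object literal instead of serving JSON separately, which is why parse() pulls it out with a regular expression. A minimal, self-contained sketch of that extraction against a made-up single-line page excerpt (the surrounding Object.assign wrapper is an assumption; the real window.DELINSKI payload carries far more fields per restaurant):

import json
import re

# Hypothetical, single-line excerpt of the listing page. The spider's
# regex only works because the whole object sits on one line ("." does
# not match newlines without re.DOTALL).
html = (
    '<script>Object.assign(window.DELINSKI, {listViewEntities: '
    '{"restaurants": {"entities": '
    '{"1": {"name": "Alt Wien", "created": "1539500000", "url": "/wien/a"}, '
    '"2": {"name": "Neu Wien", "created": "1540000000", "url": "/wien/n"}'
    '}}}})</script>'
)

m = re.search("window.DELINSKI, {listViewEntities: (.*)}", html)
restaurants = sorted(
    json.loads(m.group(1))["restaurants"]["entities"].values(),
    key=lambda r: int(r["created"]),
    reverse=True,
)
print([r["name"] for r in restaurants])  # ['Neu Wien', 'Alt Wien'] -- newest first
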
5 changes: 3 additions & 2 deletions feeds/spiders/derstandard_at.py
@@ -96,8 +96,9 @@ def _fix_img_src(elem):
             ".sequence-number",
             ".js-embed-output",
             "#mycountrytalks-embed",
-            # Remove self-promotion for ressorts (links starting with "/r").
-            '.js-embed-output-feeds a[href^="/r"]',
+            # Remove self-promotion for (other) ressorts.
+            '.js-embed-output-feeds > a[href^="/r"]',
+            '.js-embed-output-feeds > a[href^="https://derstandard.at/"]',
         ]
         change_tags = {
             "#media-list li .description": "figcaption",
17 changes: 10 additions & 7 deletions feeds/spiders/diepresse_com.py
@@ -75,7 +75,12 @@ def _clean_caption(elem):
         il = FeedEntryItemLoader(
             response=response,
             parent=response.meta["il"],
-            remove_elems=[".ad", ".article-paid"],
+            remove_elems=[
+                ".ad",
+                ".article-paid",
+                ".js-overlay-close",
+                ".swiper-lazy-preloader",
+            ],
             change_tags={".article__lead": "strong"},
             pullup_elems={".zoomable__image--zoomed": 2},
             change_attribs={".zoomable__image--zoomed": {"data-src": "src"}},
@@ -84,14 +89,12 @@ def _clean_caption(elem):
         )
         il.add_css(
             "author_name",
-            ".article__main .article__author ::text",
+            "article .article__author ::text",
             re=re.compile(r"\s*(?:[Vv]on\s*)?(.+)", flags=re.DOTALL),
         )
-        il.add_css("content_html", ".article__main .article__media")
-        il.add_css(
-            "content_html", ".article__main .article__lead"
-        )  # change tags to strong
-        il.add_css("content_html", ".article__main .article__body")
+        il.add_css("content_html", "article .article__media .zoomable__inner")
+        il.add_css("content_html", "article .article__lead")  # change tags to strong
+        il.add_css("content_html", "article .article__body")
         if response.css(".article-paid"):
             il.add_value("category", "paywalled")
         il.add_value("category", section.split("/"))
59 changes: 35 additions & 24 deletions feeds/spiders/tuwien_ac_at.py
@@ -1,8 +1,8 @@
+import json
 import re

 import scrapy
-from scrapy.loader.processors import TakeFirst
-from scrapy.selector import Selector
+from inline_requests import inline_requests

 from feeds.loaders import FeedEntryItemLoader
 from feeds.spiders import FeedsSpider
@@ -21,31 +21,42 @@ def start_requests(self):
             meta={"dont_cache": True},
         )

+    @inline_requests
     def parse(self, response):
         mitteilungsblaetter = response.css(".mitteilungsblaetter")
-        updated = mitteilungsblaetter.css("::text").re_first("(\d{2}\.\d{2}\.\d{4})")
+        updated = mitteilungsblaetter.css("::text").re_first(r"(\d{2}\.\d{2}\.\d{4})")
         link = response.urljoin(
             mitteilungsblaetter.css('a::attr("href")').extract_first()
         )
-        return scrapy.Request(
-            response.urljoin(link),
-            self._parse_mitteilungsblatt,
-            meta={"updated": updated},
-        )
-
-    def _parse_mitteilungsblatt(self, response):
-        content = "".join(response.css("#contentInner > div").extract())
-        for entry in re.split('<a name="n\d*">', content)[1:]:
-            entry = Selector(text=entry)
-            il = FeedEntryItemLoader(
-                selector=entry,
-                base_url="https://tiss.{}".format(self.name),
-                timezone="Europe/Vienna",
-                dayfirst=True,
-            )
-            il.add_value("updated", response.meta["updated"])
-            anchor_name = entry.css('::attr("name")').extract_first()
-            il.add_value("link", response.url + "#{}".format(anchor_name))
-            il.add_css("title", "strong > u ::text", TakeFirst())
-            il.add_css("content_html", "p")
-            yield il.load_item()
+
+        response = yield scrapy.Request(link, method="HEAD")
+        mb_url = response.url
+        mb_id = re.search(
+            r"https://tiss.tuwien.ac.at/mbl/blatt_struktur/anzeigen/(\d+)", mb_url
+        ).group(1)
+
+        url = "https://tiss.{}/api/mbl/v22/id/{}".format(self.name, mb_id)
+        response = yield scrapy.Request(url)
+
+        last_entry = None
+        for entry in reversed(json.loads(response.text)["knoten"]):
+            (entry["main"], entry["sub"]) = re.match(
+                r"(\d+)\.?(\d*)", entry["counter"]
+            ).groups()
+            if last_entry is not None and last_entry["main"] == entry["main"]:
+                entry["inhalt"] += "<h2>{}</h2>".format(last_entry["titel"])
+                entry["inhalt"] += last_entry["inhalt"]
+            if entry["sub"] == "":
+                il = FeedEntryItemLoader(
+                    base_url="https://tiss.{}".format(self.name),
+                    timezone="Europe/Vienna",
+                    dayfirst=True,
+                )
+                il.add_value("updated", updated)
+                il.add_value("link", mb_url + "#{}".format(entry["counter"]))
+                il.add_value("title", entry["titel"])
+                il.add_value("content_html", entry["inhalt"])
+                yield il.load_item()
+                last_entry = None
+            else:
+                last_entry = entry
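
The rewritten parse() resolves the Mitteilungsblatt redirect with a HEAD request (made synchronous-looking by @inline_requests), fetches the issue from the TISS JSON API, and walks the entries in reverse so that sub-entries (counter "12.1") are folded into their main entry (counter "12") before that entry is emitted. A condensed sketch of just the grouping step, with made-up entries:

import re

# Made-up API entries in document order: a main entry and one sub-entry.
knoten = [
    {"counter": "12", "titel": "Verlautbarung", "inhalt": "<p>Haupttext</p>"},
    {"counter": "12.1", "titel": "Anhang", "inhalt": "<p>Details</p>"},
]

last_entry = None
for entry in reversed(knoten):
    entry["main"], entry["sub"] = re.match(
        r"(\d+)\.?(\d*)", entry["counter"]
    ).groups()
    if last_entry is not None and last_entry["main"] == entry["main"]:
        # Fold the pending sub-entry into its main entry.
        entry["inhalt"] += "<h2>{}</h2>".format(last_entry["titel"])
        entry["inhalt"] += last_entry["inhalt"]
    if entry["sub"] == "":  # a main entry: emit it, merged content and all
        print(entry["counter"], entry["inhalt"])
        last_entry = None
    else:
        last_entry = entry
# Prints: 12 <p>Haupttext</p><h2>Anhang</h2><p>Details</p>
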
2 changes: 1 addition & 1 deletion feeds/spiders/uebermedien_de.py
@@ -4,8 +4,8 @@
 from urllib.parse import parse_qs, urlparse

 import scrapy
-from scrapy.http import FormRequest
 from inline_requests import inline_requests
+from scrapy.http import FormRequest

 from feeds.loaders import FeedEntryItemLoader
 from feeds.spiders import FeedsXMLFeedSpider
