Merge pull request #115 from misspenalty/zeitdiebin
Add spider for zeit.diebin.at
Lukas0907 committed Mar 18, 2018
2 parents 03ddc2d + 3199c6f commit a5ccb4b
Showing 3 changed files with 69 additions and 0 deletions.
1 change: 1 addition & 0 deletions README.rst
@@ -49,6 +49,7 @@ Feeds is currently able to create Atom feeds for the following sites:
* `Verbraucherrecht <https://verbraucherrecht.at>`_: Newest articles
* `VICE <https://www.vice.com>`_: Newest articles
* `Wiener Linien <http://www.wienerlinien.at>`_: Newest articles
* `zeitdiebin <https://zeit.diebin.at/>`_: Newest events

Some sites (Falter, Konsument, LWN, Oberösterreichische Nachrichten,
Übermedien) offer articles only behind a paywall. If you have a paid
15 changes: 15 additions & 0 deletions docs/spiders/zeit.diebin.at.rst
@@ -0,0 +1,15 @@
.. _zeit.diebin.at:

zeit.diebin.at
--------------
Newest events from `zeitdiebin <https://zeit.diebin.at/>`_.

Configuration
~~~~~~~~~~~~~
Add ``zeit.diebin.at`` to the list of spiders:

.. code-block:: ini

   # List of spiders to run by default, one per line.
   spiders =
       zeit.diebin.at
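As a quick sanity check, a multi-line ``spiders`` value like the one above parses cleanly with Python's ``configparser``. A minimal sketch (the ``[feeds]`` section name is an assumption about the config layout, not taken from this diff):

```python
import configparser

# Hypothetical config fragment mirroring the snippet from the docs.
cfg_text = """
[feeds]
spiders =
    zeit.diebin.at
"""

parser = configparser.ConfigParser()
parser.read_string(cfg_text)

# Multi-line values keep their newlines; split() yields one spider per line.
spiders = parser.get("feeds", "spiders").split()
```

Each additional spider would simply be another indented line under ``spiders =``.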
53 changes: 53 additions & 0 deletions feeds/spiders/zeitdiebin_at.py
@@ -0,0 +1,53 @@
import datetime

import scrapy

from feeds.loaders import FeedEntryItemLoader
from feeds.spiders import FeedsSpider


class ZeitdiebinAtSpider(FeedsSpider):
name = 'zeit.diebin.at'
allowed_domains = ['zeit.diebin.at']
start_urls = ['https://zeit.diebin.at/upcoming']

_title = 'zeitdiebin'
_subtitle = 'irgendwas ist immer ...'
_link = 'https://{}'.format(name)
_logo = 'https://{}/favicon.ico'.format(name)
_timezone = 'Europe/Vienna'

def parse(self, response):
for link in response.css(
'a[href*=events]::attr(href)').re(r'events/\d+'):
yield scrapy.Request(response.urljoin(link), self.parse_item)

def parse_item(self, response):
il = FeedEntryItemLoader(
response=response,
base_url='{}/'.format(self._link),
timezone=self._timezone,
dayfirst=True,
remove_elems=['.ruler', 'h1']
)

il.add_css('title', 'h1.event-title::text')
il.add_value('link', response.url)

date = response.css('title').re_first(r'(\d{2}\.\d{2}\.\d{4})')
time = response.css('title').re_first(r'(\d{2}:\d{2})') or ''
if date:
il.add_value('updated', '{} {}'.format(date, time))
else:
day_month = response.css('title').re_first(r'\d{2}\.\d{2}')
if day_month:
il.add_value('updated', '{}.{} {}'.format(
day_month, datetime.datetime.now().year, time))
            # If no date is found at all, the item is yielded without an
            # "updated" value.

il.add_css('content_html', 'div#content.container')

yield il.load_item()

# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 smartindent autoindent
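The date handling in ``parse_item`` follows a simple fallback chain: a full ``dd.mm.yyyy`` date in the page title wins, a bare ``dd.mm`` is completed with the current year, and otherwise no date is set. A standalone sketch of that logic (the sample titles below are invented, not taken from the site):

```python
import datetime
import re


def extract_updated(title, now=None):
    """Mirror the spider's fallback chain for dates found in a page title."""
    now = now or datetime.datetime.now()
    time_match = re.search(r"(\d{2}:\d{2})", title)
    time = time_match.group(1) if time_match else ""

    # Preferred: a full dd.mm.yyyy date.
    date = re.search(r"(\d{2}\.\d{2}\.\d{4})", title)
    if date:
        return "{} {}".format(date.group(1), time).strip()

    # Fallback: dd.mm only, completed with the current year.
    day_month = re.search(r"(\d{2}\.\d{2})", title)
    if day_month:
        return "{}.{} {}".format(day_month.group(1), now.year, time).strip()

    # No date at all.
    return None
```

Note that the spider itself leaves a trailing space when no time is present; the ``strip()`` here is a small cosmetic deviation from the committed code.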
