Merge pull request #110 from Lukas0907/dev/next
Add spider for ;login: magazine, updates for Ö1 and Addendum
nblock committed Jan 4, 2018
2 parents 87abc04 + e9c45c3 commit 98aeffb
Showing 8 changed files with 105 additions and 18 deletions.
2 changes: 2 additions & 0 deletions README.rst
@@ -23,6 +23,7 @@ Supported Websites

Feeds is currently able to create Atom feeds for the following sites:

* `addendum <https://www.addendum.org>`_: Newest articles
* `ATV.at <http://www.atv.at>`_: Newest episodes of TV shows
* `Bibliothek der Arbeiterkammer <http://ak.ciando.com>`_: Most recently added
books to the e-library
@@ -42,6 +43,7 @@ Feeds is currently able to create Atom feeds for the following sites:
* `profil <http://www.profil.at>`_: Newest articles
* `puls4.com <http://www.puls4.com>`_: Newest episodes of TV shows
* `Übermedien <http://www.uebermedien.de>`_: Newest articles
* `USENIX <https://www.usenix.org>`_: Newest ;login: issues
* `Verbraucherrecht <https://verbraucherrecht.at>`_: Newest articles
* `VICE <https://www.vice.com>`_: Newest articles
* `Wiener Linien <http://www.wienerlinien.at>`_: Newest articles
16 changes: 16 additions & 0 deletions docs/spiders/usenix.org.rst
@@ -0,0 +1,16 @@
.. _spider_usenix.org:

usenix.org
----------
Newest issues of the Usenix Magazine ;login:.

Configuration
~~~~~~~~~~~~~
Add ``usenix.org`` to the list of spiders:

.. code-block:: ini

   # List of spiders to run by default, one per line.
   spiders =
     usenix.org
3 changes: 1 addition & 2 deletions feeds/pipelines.py
@@ -2,7 +2,6 @@

from scrapy import signals
from scrapy.exceptions import DropItem
from scrapy.utils.spider import iterate_spider_output

from feeds.exporters import AtomExporter
from feeds.items import FeedEntryItem
@@ -79,7 +78,7 @@ def spider_opened(self, spider):

    def spider_closed(self, spider):
        # Add feed header(s) at the end so they can be dynamic.
        for feed_header in iterate_spider_output(spider.feed_headers()):
        for feed_header in spider.feed_headers():
            self._exporters[spider].export_item(feed_header)
        self._exporters[spider].finish_exporting()
        self._exporters.pop(spider)
2 changes: 1 addition & 1 deletion feeds/spiders/__init__.py
@@ -28,7 +28,7 @@ def generate_feed_header(self, title=None, subtitle=None, link=None,
        return il.load_item()

    def feed_headers(self):
        return self.generate_feed_header()
        yield self.generate_feed_header()

    def start_requests(self):
        for url in self.start_urls:
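The two hunks above change the `feed_headers()` contract: the pipeline now iterates the return value directly instead of passing it through `iterate_spider_output()`, so every spider must hand back an iterable, either by yielding header items or by returning an empty list when it emits no header (as `facebook.com` and `vice.com` do below). A minimal sketch of that contract, using hypothetical spider names:

```python
from feeds.spiders import FeedsSpider


class ExampleComSpider(FeedsSpider):
    # Hypothetical spider, only to illustrate the generator-based contract.
    name = 'example.com'

    def feed_headers(self):
        # Yielding makes feed_headers() a generator, which the for-loop in
        # the pipeline's spider_closed() can consume directly.
        yield self.generate_feed_header(title='Example',
                                        link='https://example.com')


class HeaderlessComSpider(FeedsSpider):
    # Hypothetical spider that builds its headers elsewhere (cf. the
    # facebook_com.py and vice_com.py changes in this commit): it has to
    # return an empty iterable rather than None.
    name = 'headerless.com'

    def feed_headers(self):
        return []
```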
30 changes: 17 additions & 13 deletions feeds/spiders/addendum_org.py
@@ -1,38 +1,42 @@
import scrapy

from feeds.loaders import FeedEntryItemLoader
from feeds.spiders import FeedsSpider
from feeds.spiders import FeedsXMLFeedSpider


class AddendumOrgSpider(FeedsSpider):
class AddendumOrgSpider(FeedsXMLFeedSpider):
    name = 'addendum.org'
    allowed_domains = [name]
    start_urls = ['https://www.addendum.org/projekte-ubersicht/']
    start_urls = ['https://www.addendum.org/feed/rss2-addendum']

    _title = 'Addendum'
    _subtitle = 'das, was fehlt'
    _link = 'https://www.{}'.format(name)
    _icon = ('https://www.{}/resources/dist/favicons/'
             'android-chrome-192x192.png').format(name)
    _timezone = 'Europe/Vienna'
    _max_articles = 10
    _num_articles = 0

    def parse(self, response):
        url = response.css('section::attr(data-url-project)').extract_first()
        yield scrapy.Request(url, self.parse_item, meta={'dont_cache': True})

    def parse_item(self, response):
        # First URL is the overview page.
        for url in (
                response.css('.projectNav__meta a::attr(href)').extract()[1:]):
            yield scrapy.Request(url, self._parse_article)
    def parse_node(self, response, node):
        url = node.xpath('link/text()').extract_first()
        if not node.xpath('category'):
            # Overview pages don't have a category.
            return
        if self._num_articles >= self._max_articles:
            # Maximum number of articles reached.
            return
        self._num_articles += 1
        yield scrapy.Request(url, self._parse_article)

    def _parse_article(self, response):
        remove_elems = [
            '.projectNav', 'h1', '.socialMedia__headline', '.whyRead',
            '.overlayCTA', '.authors', '.socialMedia', '.sidebar',
            '.sectionBackground--colorTheme1', '.heroStage__copyright',
            '.heroStage__downLink', 'script', 'iframe', '.image__zoom ',
            '.image__copyrightWrapper'
            '.image__copyrightWrapper', '.callToAction', '.print-action',
            '.internalLink span',
        ]
        change_tags = {
            'div.heroStage__introText': 'strong',
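The rewritten Addendum spider now consumes the site's RSS feed and relies on two checks in `parse_node()`: feed items without a `<category>` element are overview pages and are skipped, and at most `_max_articles` articles are requested per run. A small sketch of the category check on plain scrapy selectors (the item XML below is made up for illustration; the real spider uses relative XPaths on the feed nodes):

```python
from scrapy.selector import Selector

# Two made-up RSS items: an article (with a category) and an overview page
# (without one), mirroring the check in AddendumOrgSpider.parse_node().
article = Selector(type='xml', text=(
    '<item>'
    '<link>https://www.addendum.org/some-article/</link>'
    '<category>Some project</category>'
    '</item>'))
overview = Selector(type='xml', text=(
    '<item>'
    '<link>https://www.addendum.org/projekte-ubersicht/</link>'
    '</item>'))

for node in (article, overview):
    url = node.xpath('//link/text()').extract_first()
    if not node.xpath('//category'):
        print('skipped overview page:', url)
    else:
        print('would request article:', url)
```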
2 changes: 1 addition & 1 deletion feeds/spiders/facebook_com.py
@@ -13,7 +13,7 @@ class FacebookComSpider(FeedsSpider):
    allowed_domains = ['facebook.com']

    def feed_headers(self):
        return
        return []

    def start_requests(self):
        app_id = self.spider_settings.get('app_id')
66 changes: 66 additions & 0 deletions feeds/spiders/usenix_org.py
@@ -0,0 +1,66 @@
import re

import scrapy

from feeds.loaders import FeedEntryItemLoader
from feeds.spiders import FeedsSpider


class UsenixOrgSpider(FeedsSpider):
    name = 'usenix.org'
    allowed_domains = ['usenix.org']

    def feed_headers(self):
        return []

    def start_requests(self):
        yield scrapy.Request('https://www.usenix.org/publications/login',
                             self.parse_login_issues,
                             meta={'dont_cache': True})

    def parse_login_issues(self, response):
        # Only scrape the last 8 issues.
        issues = response.css('.issues .month a::attr(href)').extract()[:8]
        yield self.generate_feed_header(
            title=';login:', subtitle='The Usenix Magazine', link=response.url,
            path='login')
        for issue in issues:
            yield scrapy.Request(response.urljoin(issue),
                                 self.parse_login_issue)

    def parse_login_issue(self, response):
        remove_elems = [
            '.field-name-field-file-access',
            '.field-name-field-login-issue-file',
            '.field-name-field-product',
            '.field-commerce-price',
            '.views-field-field-file-access',
            '.view-header',
        ]
        il = FeedEntryItemLoader(response=response,
                                 base_url='https://www.{}'.format(self.name),
                                 remove_elems=remove_elems,
                                 dayfirst=True)
        il.add_value('link', response.url)
        title = response.css('h1::text').extract_first().strip()
        il.add_value('title', title)
        il.add_value('updated', self._date_from_title(title))
        il.add_css('content_html', '.content-wrapper')
        il.add_value('path', 'login')
        if response.css('.usenix-files-protected'):
            il.add_value('category', 'paywalled')
        yield il.load_item()

    def _date_from_title(self, issue):
        """Try to guess the publication date of an issue from the title."""
        match = re.search(
            r'(?P<season>Spring|Summer|Fall|Winter) (?P<year>\d{4})', issue)
        if match:
            seasons = {'Spring': '03', 'Summer': '06', 'Fall': '09',
                       'Winter': '12'}
            month = seasons[match.group('season')]
            return '01-{month}-{year}'.format(
                month=month, year=match.group('year'))
        else:
            self.logger.warning(
                'Could not extract date from title "{}"!'.format(issue))
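`_date_from_title()` only guesses a publication date from the season named in the issue title; the day-first string it builds matches the loader's `dayfirst=True` setting, and titles without a recognizable season fall through to the warning and return `None`. A standalone sketch of the mapping (the sample titles are made up):

```python
import re

# Season heuristic from UsenixOrgSpider._date_from_title(), reproduced as a
# plain function so it can be tried out on its own.
SEASONS = {'Spring': '03', 'Summer': '06', 'Fall': '09', 'Winter': '12'}


def date_from_title(issue):
    match = re.search(
        r'(?P<season>Spring|Summer|Fall|Winter) (?P<year>\d{4})', issue)
    if match:
        return '01-{month}-{year}'.format(
            month=SEASONS[match.group('season')], year=match.group('year'))
    return None  # the spider logs a warning in this case


assert date_from_title('Winter 2017, Vol. 42, No. 4') == '01-12-2017'
assert date_from_title('Anniversary Issue') is None
```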
2 changes: 1 addition & 1 deletion feeds/spiders/vice_com.py
@@ -18,7 +18,7 @@ class ViceComSpider(FeedsSpider):

    def feed_headers(self):
        if not self._locales:
            return
            return []

        for locale in self._locales:
            yield self.generate_feed_header(
