Merge pull request #110 from Lukas0907/dev/next
Add spider for ;login: magazine, updates for Ö1 and Addendum
nblock committed Jan 4, 2018
2 parents 87abc04 + e9c45c3 commit 98aeffb
Showing 8 changed files with 105 additions and 18 deletions.
2 changes: 2 additions & 0 deletions README.rst
@@ -23,6 +23,7 @@ Supported Websites

Feeds is currently able to create Atom feeds for the following sites:

* `addendum <https://www.addendum.org>`_: Newest articles
* `ATV.at <http://www.atv.at>`_: Newest episodes of TV shows
* `Bibliothek der Arbeiterkammer <http://ak.ciando.com>`_: Most recently added
books to the e-library
@@ -42,6 +43,7 @@ Feeds is currently able to create Atom feeds for the following sites:
* `profil <http://www.profil.at>`_: Newest articles
* `puls4.com <http://www.puls4.com>`_: Newest episodes of TV shows
* `Übermedien <http://www.uebermedien.de>`_: Newest articles
* `USENIX <https://www.usenix.org>`_: Newest ;login: issues
* `Verbraucherrecht <https://verbraucherrecht.at>`_: Newest articles
* `VICE <https://www.vice.com>`_: Newest articles
* `Wiener Linien <http://www.wienerlinien.at>`_: Newest articles
16 changes: 16 additions & 0 deletions docs/spiders/usenix.org.rst
@@ -0,0 +1,16 @@
.. _spider_usenix.org:

usenix.org
----------
Newest issues of the Usenix Magazine ;login:.

Configuration
~~~~~~~~~~~~~
Add ``usenix.org`` to the list of spiders:

.. code-block:: ini

   # List of spiders to run by default, one per line.
   spiders =
     usenix.org
3 changes: 1 addition & 2 deletions feeds/pipelines.py
@@ -2,7 +2,6 @@

from scrapy import signals
from scrapy.exceptions import DropItem
from scrapy.utils.spider import iterate_spider_output

from feeds.exporters import AtomExporter
from feeds.items import FeedEntryItem
@@ -79,7 +78,7 @@ def spider_opened(self, spider):

    def spider_closed(self, spider):
        # Add feed header(s) at the end so they can be dynamic.
        for feed_header in iterate_spider_output(spider.feed_headers()):
        for feed_header in spider.feed_headers():
            self._exporters[spider].export_item(feed_header)
        self._exporters[spider].finish_exporting()
        self._exporters.pop(spider)
2 changes: 1 addition & 1 deletion feeds/spiders/__init__.py
@@ -28,7 +28,7 @@ def generate_feed_header(self, title=None, subtitle=None, link=None,
        return il.load_item()

    def feed_headers(self):
        return self.generate_feed_header()
        yield self.generate_feed_header()

    def start_requests(self):
        for url in self.start_urls:
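The two hunks above change the `feed_headers()` contract: the pipeline now iterates the return value directly instead of passing it through `iterate_spider_output()`, so every spider must hand back an iterable, either by yielding header items or by returning an empty list when it emits no header (as `facebook.com` and `vice.com` do below). A minimal sketch of that contract, using hypothetical spider names:

```python
from feeds.spiders import FeedsSpider


class ExampleComSpider(FeedsSpider):
    # Hypothetical spider, only to illustrate the generator-based contract.
    name = 'example.com'

    def feed_headers(self):
        # Yielding makes feed_headers() a generator, which the for-loop in
        # the pipeline's spider_closed() can consume directly.
        yield self.generate_feed_header(title='Example',
                                        link='https://example.com')


class HeaderlessComSpider(FeedsSpider):
    # Hypothetical spider that builds its headers elsewhere (cf. the
    # facebook_com.py and vice_com.py changes in this commit): it has to
    # return an empty iterable rather than None.
    name = 'headerless.com'

    def feed_headers(self):
        return []
```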
30 changes: 17 additions & 13 deletions feeds/spiders/addendum_org.py
@@ -1,38 +1,42 @@
import scrapy

from feeds.loaders import FeedEntryItemLoader
from feeds.spiders import FeedsSpider
from feeds.spiders import FeedsXMLFeedSpider


class AddendumOrgSpider(FeedsSpider):
class AddendumOrgSpider(FeedsXMLFeedSpider):
    name = 'addendum.org'
    allowed_domains = [name]
    start_urls = ['https://www.addendum.org/projekte-ubersicht/']
    start_urls = ['https://www.addendum.org/feed/rss2-addendum']

    _title = 'Addendum'
    _subtitle = 'das, was fehlt'
    _link = 'https://www.{}'.format(name)
    _icon = ('https://www.{}/resources/dist/favicons/'
             'android-chrome-192x192.png').format(name)
    _timezone = 'Europe/Vienna'
    _max_articles = 10
    _num_articles = 0

    def parse(self, response):
        url = response.css('section::attr(data-url-project)').extract_first()
        yield scrapy.Request(url, self.parse_item, meta={'dont_cache': True})

    def parse_item(self, response):
        # First URL is the overview page.
        for url in (
                response.css('.projectNav__meta a::attr(href)').extract()[1:]):
            yield scrapy.Request(url, self._parse_article)
    def parse_node(self, response, node):
        url = node.xpath('link/text()').extract_first()
        if not node.xpath('category'):
            # Overview pages don't have a category.
            return
        if self._num_articles >= self._max_articles:
            # Maximum number of articles reached.
            return
        self._num_articles += 1
        yield scrapy.Request(url, self._parse_article)

    def _parse_article(self, response):
        remove_elems = [
            '.projectNav', 'h1', '.socialMedia__headline', '.whyRead',
            '.overlayCTA', '.authors', '.socialMedia', '.sidebar',
            '.sectionBackground--colorTheme1', '.heroStage__copyright',
            '.heroStage__downLink', 'script', 'iframe', '.image__zoom ',
            '.image__copyrightWrapper'
            '.image__copyrightWrapper', '.callToAction', '.print-action',
            '.internalLink span',
        ]
        change_tags = {
            'div.heroStage__introText': 'strong',
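The rewritten Addendum spider now consumes the site's RSS feed and relies on two checks in `parse_node()`: feed items without a `<category>` element are overview pages and are skipped, and at most `_max_articles` articles are requested per run. A small sketch of the category check on plain scrapy selectors (the item XML below is made up for illustration; the real spider uses relative XPaths on the feed nodes):

```python
from scrapy.selector import Selector

# Two made-up RSS items: an article (with a category) and an overview page
# (without one), mirroring the check in AddendumOrgSpider.parse_node().
article = Selector(type='xml', text=(
    '<item>'
    '<link>https://www.addendum.org/some-article/</link>'
    '<category>Some project</category>'
    '</item>'))
overview = Selector(type='xml', text=(
    '<item>'
    '<link>https://www.addendum.org/projekte-ubersicht/</link>'
    '</item>'))

for node in (article, overview):
    url = node.xpath('//link/text()').extract_first()
    if not node.xpath('//category'):
        print('skipped overview page:', url)
    else:
        print('would request article:', url)
```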
2 changes: 1 addition & 1 deletion feeds/spiders/facebook_com.py
@@ -13,7 +13,7 @@ class FacebookComSpider(FeedsSpider):
    allowed_domains = ['facebook.com']

    def feed_headers(self):
        return
        return []

    def start_requests(self):
        app_id = self.spider_settings.get('app_id')
66 changes: 66 additions & 0 deletions feeds/spiders/usenix_org.py
@@ -0,0 +1,66 @@
import re

import scrapy

from feeds.loaders import FeedEntryItemLoader
from feeds.spiders import FeedsSpider


class UsenixOrgSpider(FeedsSpider):
    name = 'usenix.org'
    allowed_domains = ['usenix.org']

    def feed_headers(self):
        return []

    def start_requests(self):
        yield scrapy.Request('https://www.usenix.org/publications/login',
                             self.parse_login_issues,
                             meta={'dont_cache': True})

    def parse_login_issues(self, response):
        # Only scrape the last 8 issues.
        issues = response.css('.issues .month a::attr(href)').extract()[:8]
        yield self.generate_feed_header(
            title=';login:', subtitle='The Usenix Magazine', link=response.url,
            path='login')
        for issue in issues:
            yield scrapy.Request(response.urljoin(issue),
                                 self.parse_login_issue)

    def parse_login_issue(self, response):
        remove_elems = [
            '.field-name-field-file-access',
            '.field-name-field-login-issue-file',
            '.field-name-field-product',
            '.field-commerce-price',
            '.views-field-field-file-access',
            '.view-header',
        ]
        il = FeedEntryItemLoader(response=response,
                                 base_url='https://www.{}'.format(self.name),
                                 remove_elems=remove_elems,
                                 dayfirst=True)
        il.add_value('link', response.url)
        title = response.css('h1::text').extract_first().strip()
        il.add_value('title', title)
        il.add_value('updated', self._date_from_title(title))
        il.add_css('content_html', '.content-wrapper')
        il.add_value('path', 'login')
        if response.css('.usenix-files-protected'):
            il.add_value('category', 'paywalled')
        yield il.load_item()

    def _date_from_title(self, issue):
        """Try to guess the publication date of an issue from the title."""
        match = re.search(
            r'(?P<season>Spring|Summer|Fall|Winter) (?P<year>\d{4})', issue)
        if match:
            seasons = {'Spring': '03', 'Summer': '06', 'Fall': '09',
                       'Winter': '12'}
            month = seasons[match.group('season')]
            return '01-{month}-{year}'.format(
                month=month, year=match.group('year'))
        else:
            self.logger.warning(
                'Could not extract date from title "{}"!'.format(issue))
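`_date_from_title()` only guesses a publication date from the season named in the issue title; the day-first string it builds matches the loader's `dayfirst=True` setting, and titles without a recognizable season fall through to the warning and return `None`. A standalone sketch of the mapping (the sample titles are made up):

```python
import re

# Season heuristic from UsenixOrgSpider._date_from_title(), reproduced as a
# plain function so it can be tried out on its own.
SEASONS = {'Spring': '03', 'Summer': '06', 'Fall': '09', 'Winter': '12'}


def date_from_title(issue):
    match = re.search(
        r'(?P<season>Spring|Summer|Fall|Winter) (?P<year>\d{4})', issue)
    if match:
        return '01-{month}-{year}'.format(
            month=SEASONS[match.group('season')], year=match.group('year'))
    return None  # the spider logs a warning in this case


assert date_from_title('Winter 2017, Vol. 42, No. 4') == '01-12-2017'
assert date_from_title('Anniversary Issue') is None
```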
2 changes: 1 addition & 1 deletion feeds/spiders/vice_com.py
@@ -18,7 +18,7 @@ class ViceComSpider(FeedsSpider):

    def feed_headers(self):
        if not self._locales:
            return
            return []

        for locale in self._locales:
            yield self.generate_feed_header(
