Skip to content

Commit

Permalink
Merge pull request #152 from Lukas0907/ars
Browse files Browse the repository at this point in the history
Add spider for arstechnica.com.
  • Loading branch information
Lukas0907 committed Aug 19, 2018
2 parents dfee6af + ab0f98c commit 19b1f78
Show file tree
Hide file tree
Showing 3 changed files with 140 additions and 0 deletions.
43 changes: 43 additions & 0 deletions docs/spiders/arstechnica.com.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
.. _spider_arstechnica.com:

arstechnica.com
---------------
Full text feeds for `Ars Technica <https://arstechnica.com>`_.

Configuration
~~~~~~~~~~~~~
Add ``arstechnica.com`` to the list of spiders:

.. code-block:: ini

   # List of spiders to run by default, one per line.
   spiders =
     arstechnica.com

arstechnica.com supports different channels via the ``channels`` parameter
(one per line). If no channel is given, ``features`` is used. Go to
`RSS feeds <https://arstechnica.com/rss-feeds/>`_ for a list of all feeds.

.. code-block:: ini

   [arstechnica.com]
   channels =
     index
     features
     technology-lab
     gadgets
     business
     security
     tech-policy
     apple
     gaming
     science
     multiverse
     cars
     staff-blogs
     cardboard
     open-source
     microsoft
     software
     telecom
     web
22 changes: 22 additions & 0 deletions feeds.cfg.dist
Original file line number Diff line number Diff line change
Expand Up @@ -116,3 +116,25 @@ useragent = feeds (+https://github.com/nblock/feeds)
# 4748
# etat
# immobilien

#[arstechnica.com]
#channels =
# index
# features
# technology-lab
# gadgets
# business
# security
# tech-policy
# apple
# gaming
# science
# multiverse
# cars
# staff-blogs
# cardboard
# open-source
# microsoft
# software
# telecom
# web
75 changes: 75 additions & 0 deletions feeds/spiders/arstechnica_com.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import scrapy

from feeds.loaders import FeedEntryItemLoader
from feeds.spiders import FeedsXMLFeedSpider


class ArsTechnicaComSpider(FeedsXMLFeedSpider):
    """Spider that builds full-text feeds for Ars Technica channels.

    One RSS feed is requested per configured channel. Every ``item`` node of
    a feed is followed to its article page, and further pages of the same
    article are followed for as long as a ``.next`` element is present.
    """

    name = "arstechnica.com"
    allowed_domains = [name]
    itertag = "item"

    _icon = (
        "https://cdn.arstechnica.net/wp-content/uploads/2016/10/"
        + "cropped-ars-logo-512_480-32x32.png"
    )
    _logo = (
        "https://cdn.arstechnica.net/wp-content/themes/ars-mobile/assets/images/"
        + "material-ars.png"
    )

    def start_requests(self):
        """Yield one feed request per configured channel.

        Channels are read from the ``FEEDS_SPIDER_ARSTECHNICA_COM_CHANNELS``
        setting as a whitespace-separated list. When the setting is missing or
        empty, the single channel ``features`` is used.
        """
        channels = self.settings.get("FEEDS_SPIDER_ARSTECHNICA_COM_CHANNELS")
        if channels:
            channels = set(channels.split())
        else:
            channels = {"features"}

        # Must be assigned before the first ``yield``: this method is a
        # generator, so code placed after the loop only runs once every
        # request has been consumed. ``feed_headers`` reads this attribute and
        # would raise AttributeError if it ran before that point.
        self._channels = channels

        for channel in channels:
            yield scrapy.Request(
                "http://feeds.{}/arstechnica/{}".format(self.name, channel),
                # "path" is carried through to the item so each entry stays
                # associated with the channel it was found in.
                meta={"path": channel, "dont_cache": True},
            )

    def feed_headers(self):
        """Yield one feed header per channel recorded by ``start_requests``."""
        for channel in self._channels:
            yield self.generate_feed_header(
                title="Ars Technica: {}".format(channel.title()),
                link="https://{}".format(self.name),
                path=channel,
            )

    def parse_node(self, response, node):
        """Turn one feed ``item`` node into a request for its article page.

        Title, publication date and categories are taken from the feed node
        and stored in an item loader that travels with the request in
        ``meta["il"]``.
        """
        link = node.xpath("link/text()").extract_first()
        il = FeedEntryItemLoader()
        il.add_value("title", node.xpath("title/text()").extract_first())
        il.add_value("updated", node.xpath("pubDate/text()").extract_first())
        il.add_value("category", node.xpath("category/text()").extract())
        yield scrapy.Request(
            link,
            self._parse_article,
            # NOTE(review): presumably selects the mobile layout that the CSS
            # selectors in _parse_article are written for -- confirm.
            cookies={"view": "mobile"},
            meta={"il": il, "path": response.meta["path"], "first_page": True},
        )

    def _parse_article(self, response):
        """Add one article page to the item and follow pagination.

        On the first page (``meta["first_page"]``) the link, author, the
        ``header h2`` element and the feed path are recorded. On every page
        the ``.article-content`` element is appended to ``content_html``. If
        the page has a ``.next`` element and at least one ``.numbers a`` link,
        the last such link is requested with the loader passed along;
        otherwise the finished item is yielded.
        """
        remove_elems = [".caption-credit", ".gallery-image-credit"]
        # The loader for this page is nested under the one received in meta.
        il = FeedEntryItemLoader(
            response=response, parent=response.meta["il"], remove_elems=remove_elems
        )
        if response.meta.get("first_page", False):
            il.add_value("link", response.url)
            il.add_css("author_name", ".byline a span ::text")
            il.add_css("content_html", "header h2")
            il.add_value("path", response.meta["path"])
        il.add_css("content_html", ".article-content")

        page_links = response.css(".numbers a::attr(href)").extract()
        if response.css(".next") and page_links:
            yield scrapy.Request(
                # urljoin leaves absolute URLs unchanged and resolves relative
                # hrefs, which scrapy.Request would otherwise reject.
                response.urljoin(page_links[-1]),
                self._parse_article,
                meta={"il": il, "path": response.meta["path"]},
            )
        else:
            # Either the last page, or a ".next" marker without any usable
            # link: emit what has been collected instead of failing.
            yield il.load_item()

0 comments on commit 19b1f78

Please sign in to comment.