Merge pull request #224 from Lukas0907/next
Add spider for npr.org; fix economist.com
Lukas0907 committed Dec 9, 2020
2 parents b643153 + f08116a commit 4d457af
Showing 10 changed files with 154 additions and 36 deletions.
38 changes: 38 additions & 0 deletions .github/workflows/tox.yml
@@ -0,0 +1,38 @@
---

name: PyFeeds CI
on: [push]

# Run tests against each supported version while docs and style are only
# considered on the latest supported version.
jobs:

  build:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python:
          - version: 3.6
            toxenv: py36
          - version: 3.7
            toxenv: py37
          - version: 3.8
            toxenv: py38
          - version: 3.9
            toxenv: py39,docs,style

    steps:
      - uses: actions/checkout@v2

      - name: Set up Python ${{ matrix.python.version }} and tox ${{ matrix.python.toxenv }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python.version }}

      - name: Install Tox
        run: |
          pip install tox
      - name: Run Tox
        run: |
          tox -e ${{ matrix.python.toxenv }}
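
The matrix above runs one job per supported Python version, with the docs build and style checks attached only to the newest one. A minimal Python sketch of how the matrix expands into jobs, purely for illustration (the list and loop below are not part of the workflow):

.. code-block:: python

   # Illustrative only: mirrors the matrix defined in tox.yml above.
   matrix = [
       {"version": "3.6", "toxenv": "py36"},
       {"version": "3.7", "toxenv": "py37"},
       {"version": "3.8", "toxenv": "py38"},
       {"version": "3.9", "toxenv": "py39,docs,style"},
   ]

   for python in matrix:
       # Each job sets up the given interpreter, installs tox and runs the
       # listed environments, so docs and style run once, on the latest version.
       print("Python {version}: tox -e {toxenv}".format(**python))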
21 changes: 0 additions & 21 deletions .travis.yml

This file was deleted.

5 changes: 3 additions & 2 deletions docs/development.rst
@@ -104,10 +104,10 @@ cleaning up the content (removing share buttons, etc.):
* :ref:`spider_arstechnica.com`
* :ref:`spider_derstandard.at`
* :ref:`spider_dietiwag.org`
* :ref:`spider_economist.com`
* :ref:`spider_ft.com`
* :ref:`spider_lwn.net`
* :ref:`spider_orf.at`
* :ref:`spider_profil.at`

Paywalled content
~~~~~~~~~~~~~~~~~
@@ -139,7 +139,6 @@ Utilizing the sitemap
~~~~~~~~~~~~~~~~~~~~~
Others provide a sitemap_ which we can parse:

* :ref:`spider_profil.at`
* :ref:`spider_trend.at`

Custom extraction
@@ -152,8 +151,10 @@ scraping from there.
* :ref:`spider_biblioweb.at`
* :ref:`spider_cbird.at`
* :ref:`spider_delinski.at`
* :ref:`spider_economist.com`
* :ref:`spider_flimmit.com`
* :ref:`spider_lbg.at`
* :ref:`spider_npr.org`
* :ref:`spider_openwrt.org`
* :ref:`spider_puls4.com`
* :ref:`spider_python-patterns.guide`
1 change: 1 addition & 0 deletions docs/spiders.rst
@@ -28,6 +28,7 @@ Most popular sites
spiders/ft.com
spiders/indiehackers.com
spiders/lwn.net
spiders/npr.org
spiders/spotify.com
spiders/vice.com

16 changes: 16 additions & 0 deletions docs/spiders/npr.org.rst
@@ -0,0 +1,16 @@
.. _spider_npr.org:

npr.org
-------
Archive of the `Planet Money Newsletter
<https://www.npr.org/sections/money/newsletter>`_.

Configuration
~~~~~~~~~~~~~
Add ``npr.org`` to the list of spiders:

.. code-block:: ini

   # List of spiders to run by default, one per line.
   spiders =
     npr.org
1 change: 0 additions & 1 deletion feeds/spiders/derstandard_at.py
@@ -115,7 +115,6 @@ def _parse_breadcrumbs(breadcrumbs):
        change_tags = {
            ".article-subtitle": "strong",
            "aside": "blockquote",
            "p strong:only-child": "h3",
        }
        replace_elems = {"img": _fix_img_src}
        il = FeedEntryItemLoader(
21 changes: 11 additions & 10 deletions feeds/spiders/economist_com.py
@@ -2,11 +2,11 @@

from feeds.exceptions import DropResponse
from feeds.loaders import FeedEntryItemLoader
from feeds.spiders import FeedsXMLFeedSpider
from feeds.spiders import FeedsSpider
from feeds.utils import generate_feed_header


class EconomistComSpider(FeedsXMLFeedSpider):
class EconomistComSpider(FeedsSpider):
name = "economist.com"
# Don't send too many requests to not trigger the bot detection.
custom_settings = {"DOWNLOAD_DELAY": 5.0}
@@ -24,20 +24,21 @@ def start_requests(self):

        for ressort in self._ressorts:
            yield scrapy.Request(
                "https://www.{}/{}/rss.xml".format(self.name, ressort),
                "https://www.{}/{}/".format(self.name, ressort),
                meta={"dont_cache": True, "ressort": ressort},
            )

    def parse_node(self, response, node):
    def parse(self, response):
        if not self._titles.get(response.meta["ressort"]):
            self._titles[response.meta["ressort"]] = response.xpath(
                "//channel/title/text()"
            self._titles[response.meta["ressort"]] = response.css(
                "h1.section-collection-headline ::text"
            ).extract_first()

        url = node.xpath("link/text()").extract_first()
        return scrapy.Request(
            url, self._parse_article, meta={"ressort": response.meta["ressort"]}
        )
        for path in response.css(".headline-link::attr('href')").extract():
            url = response.urljoin(path)
            yield scrapy.Request(
                url, self._parse_article, meta={"ressort": response.meta["ressort"]}
            )

    def feed_headers(self):
        for ressort in self._ressorts:
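
The economist.com fix replaces the discontinued per-section RSS feeds with direct scraping of the section overview pages. A minimal sketch of the new link extraction, assuming an already fetched section page and using parsel (the selector library behind Scrapy); the sample markup and variable names are invented for illustration:

.. code-block:: python

   from urllib.parse import urljoin

   from parsel import Selector

   # Invented markup in the style of an economist.com section page.
   html = """
   <h1 class="section-collection-headline">Finance &amp; economics</h1>
   <a class="headline-link" href="/finance-and-economics/2020/12/05/sample">Sample article</a>
   """

   sel = Selector(text=html)

   # The section title now comes from the page header instead of the RSS <channel><title>.
   title = sel.css("h1.section-collection-headline ::text").get()

   # Headline links are relative paths, so they are joined against the site root.
   links = [
       urljoin("https://www.economist.com", path)
       for path in sel.css(".headline-link::attr('href')").getall()
   ]

   print(title, links)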
82 changes: 82 additions & 0 deletions feeds/spiders/npr_org.py
@@ -0,0 +1,82 @@
from urllib.parse import urljoin

import scrapy

from feeds.loaders import FeedEntryItemLoader
from feeds.spiders import FeedsSpider
from feeds.utils import generate_feed_header


class NprOrgSpider(FeedsSpider):
name = "npr.org"

_base_url = "https://www.{}".format(name)

def start_requests(self):
# Only Planet Money seems to have a public archive.
newsletters = ["money"]

for newsletter in newsletters:
yield scrapy.Request(
urljoin(self._base_url, "sections/{}/newsletter".format(newsletter)),
headers={"Cookie": "trackingChoice=true; choiceVersion=1"},
meta={"dont_cache": True, "path": newsletter},
)

def feed_headers(self):
return []

def parse(self, response):
for url in response.css('.item .title a::attr("href")').extract():
yield scrapy.Request(
url,
self._parse_article,
headers={"Cookie": "trackingChoice=true; choiceVersion=1"},
meta={"path": response.meta["path"]},
)

yield generate_feed_header(
title="{} Newsletter".format(
response.css(".branding__image-icon::attr('alt')").extract_first()
),
subtitle=response.css(".branding__mini-teaser ::text").extract_first(),
link=response.url,
logo=response.css(".branding__image-icon::attr('src')").extract_first(),
path=response.meta["path"],
)

def _parse_article(self, response):
def _fix_img_src(elem):
if "data-original" in elem.attrib:
elem.attrib["src"] = elem.attrib["data-original"]
return elem

remove_elems = [
".credit",
".hide-caption",
".toggle-caption",
".enlarge-options",
".enlarge_measure",
".enlarge_html",
".ad-backstage",
'p:first-of-type:contains("Editor\'s Note: This is an excerpt of")',
'p:contains("Did you enjoy this newsletter segment?")',
]
replace_elems = {"img": _fix_img_src}
change_tags = {".image": "figure", ".credit-caption": "figcaption"}

il = FeedEntryItemLoader(
response=response,
base_url=self._base_url,
remove_elems=remove_elems,
replace_elems=replace_elems,
change_tags=change_tags,
)
il.add_css("title", "h1 ::text")
il.add_value("link", response.url)
il.add_css("content_html", "#storytext")
il.add_value("path", response.meta["path"])
il.add_css("updated", '.dateblock time::attr("datetime")')
il.add_css("author_name", ".byline__name a::text")

yield il.load_item()
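
NPR story pages lazy-load their images, which the nested ``_fix_img_src`` helper above works around by promoting ``data-original`` to ``src`` before the content is cleaned. A standalone sketch of that substitution using lxml; only the attribute names are taken from the spider, the sample markup is invented:

.. code-block:: python

   import lxml.html

   # Invented sample in the style of an NPR story body.
   html = (
       '<div id="storytext">'
       '<img data-original="https://media.npr.org/assets/full.jpg" src="placeholder.gif">'
       "</div>"
   )

   doc = lxml.html.fromstring(html)

   for img in doc.iter("img"):
       # Same idea as _fix_img_src: prefer the lazy-loading attribute when present.
       if "data-original" in img.attrib:
           img.set("src", img.attrib["data-original"])

   print(lxml.html.tostring(doc).decode())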
1 change: 1 addition & 0 deletions feeds/spiders/wienerzeitung_at.py
@@ -82,6 +82,7 @@ def _parse_breadcrumbs(breadcrumbs):
            ".caption-text > small.d-block",
            "h2 > br",
            "h3 > br",
            ".article-socials",
        ]
        change_tags = {
            ".article-subtitle": "strong",
4 changes: 2 additions & 2 deletions tox.ini
@@ -1,5 +1,5 @@
[tox]
envlist = py36,py37,py38
envlist = py36,py37,py38,py39

[testenv]
description = Run tests
@@ -9,7 +9,7 @@ commands =

[testenv:style]
description = Run various style checks
basepython = python3.8
basepython = python3.9
extras = style
commands =
    flake8 feeds
