-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #224 from Lukas0907/next
Add spider for npr.org; fix economist.com
- Loading branch information
Showing
10 changed files
with
154 additions
and
36 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
--- | ||
|
||
name: PyFeeds CI
on: [push]

# Run tests against each supported version while docs and style are only
# considered on the latest supported version.
jobs:

  build:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python:
          # Versions are quoted so YAML keeps them as strings; an unquoted
          # value such as 3.10 would be parsed as the float 3.1.
          - version: "3.6"
            toxenv: py36
          - version: "3.7"
            toxenv: py37
          - version: "3.8"
            toxenv: py38
          - version: "3.9"
            toxenv: py39,docs,style

    steps:
      - uses: actions/checkout@v2

      - name: Set up Python ${{ matrix.python.version }} and tox ${{ matrix.python.toxenv }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python.version }}

      - name: Install Tox
        run: |
          pip install tox
      - name: Run Tox
        run: |
          tox -e ${{ matrix.python.toxenv }}
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
.. _spider_npr.org: | ||
|
||
npr.org | ||
------- | ||
Archive of the `Planet Money Newsletter | ||
<https://www.npr.org/sections/money/newsletter>`_. | ||
|
||
Configuration | ||
~~~~~~~~~~~~~ | ||
Add ``npr.org`` to the list of spiders: | ||
|
||
.. code-block:: ini

   # List of spiders to run by default, one per line.
   spiders =
     npr.org
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
from urllib.parse import urljoin | ||
|
||
import scrapy | ||
|
||
from feeds.loaders import FeedEntryItemLoader | ||
from feeds.spiders import FeedsSpider | ||
from feeds.utils import generate_feed_header | ||
|
||
|
||
class NprOrgSpider(FeedsSpider):
    """Spider that builds a feed from NPR newsletter archive pages.

    For every newsletter slug listed in ``start_requests`` the archive page
    ``sections/<slug>/newsletter`` is requested. ``parse`` follows each
    article linked from that page and emits the feed header;
    ``_parse_article`` turns a single article page into a feed entry.
    """

    name = "npr.org"

    _base_url = "https://www.{}".format(name)
    # Cookie header value sent with every request issued by this spider.
    # NOTE(review): presumably pre-answers NPR's tracking-consent prompt so the
    # actual page is served instead of a consent page -- confirm.
    _consent_cookie = "trackingChoice=true; choiceVersion=1"

    def start_requests(self):
        """Yield one request per newsletter archive page.

        Each request is marked ``dont_cache`` and carries the newsletter slug
        as ``meta["path"]`` so later callbacks can attach it to their items.
        """
        # Only Planet Money seems to have a public archive.
        newsletters = ["money"]

        for newsletter in newsletters:
            yield scrapy.Request(
                urljoin(self._base_url, "sections/{}/newsletter".format(newsletter)),
                headers={"Cookie": self._consent_cookie},
                meta={"dont_cache": True, "path": newsletter},
            )

    def feed_headers(self):
        """Return no static headers; ``parse`` generates the feed header."""
        return []

    def parse(self, response):
        """Handle an archive page: follow its articles, then emit the header.

        Yields one request per article link (handled by ``_parse_article``)
        followed by the feed header built from the page's branding elements.
        """
        for url in response.css('.item .title a::attr("href")').extract():
            yield scrapy.Request(
                # Resolve against the page URL: scrapy.Request rejects
                # relative URLs, while absolute ones pass through unchanged.
                response.urljoin(url),
                self._parse_article,
                headers={"Cookie": self._consent_cookie},
                meta={"path": response.meta["path"]},
            )

        yield generate_feed_header(
            title="{} Newsletter".format(
                response.css(".branding__image-icon::attr('alt')").extract_first()
            ),
            subtitle=response.css(".branding__mini-teaser ::text").extract_first(),
            link=response.url,
            logo=response.css(".branding__image-icon::attr('src')").extract_first(),
            path=response.meta["path"],
        )

    def _parse_article(self, response):
        """Yield the feed entry item loaded from a single article page."""

        def _fix_img_src(elem):
            # Prefer the URL stored in "data-original" over the current "src".
            # NOTE(review): looks like a lazy-loading placeholder workaround
            # -- confirm.
            if "data-original" in elem.attrib:
                elem.attrib["src"] = elem.attrib["data-original"]
            return elem

        # Selectors handed to FeedEntryItemLoader; going by the keyword names
        # they are removed from / rewritten in the extracted content.
        remove_elems = [
            ".credit",
            ".hide-caption",
            ".toggle-caption",
            ".enlarge-options",
            ".enlarge_measure",
            ".enlarge_html",
            ".ad-backstage",
            'p:first-of-type:contains("Editor\'s Note: This is an excerpt of")',
            'p:contains("Did you enjoy this newsletter segment?")',
        ]
        replace_elems = {"img": _fix_img_src}
        change_tags = {".image": "figure", ".credit-caption": "figcaption"}

        il = FeedEntryItemLoader(
            response=response,
            base_url=self._base_url,
            remove_elems=remove_elems,
            replace_elems=replace_elems,
            change_tags=change_tags,
        )
        il.add_css("title", "h1 ::text")
        il.add_value("link", response.url)
        il.add_css("content_html", "#storytext")
        il.add_value("path", response.meta["path"])
        il.add_css("updated", '.dateblock time::attr("datetime")')
        il.add_css("author_name", ".byline__name a::text")

        yield il.load_item()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters