Skip to content

Commit

Permalink
Merge pull request #169 from Lukas0907/newspiders
Browse files Browse the repository at this point in the history
Add spider for python-patterns.guide and dietiwag.org
  • Loading branch information
Lukas0907 committed Sep 14, 2018
2 parents 2e8a16c + 9a794d4 commit ea4d396
Show file tree
Hide file tree
Showing 7 changed files with 150 additions and 14 deletions.
14 changes: 5 additions & 9 deletions docs/development.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,9 @@ Writing a spider is easy! Consider the slightly simplified spider for
_title = "Indie Hackers"
def parse(self, response):
interviews = response.css(
".interview__link::attr(href), .interview__date::text"
).extract()
for link, date in zip(interviews[::2], interviews[1::2]):
interview_links = response.css(".interview__link::attr(href)").extract()
interview_dates = response.css(".interview__date::text").extract()
for link, date in zip(interview_links, interview_dates):
yield scrapy.Request(
response.urljoin(link),
self._parse_interview,
Expand All @@ -39,11 +38,6 @@ Writing a spider is easy! Consider the slightly simplified spider for
remove_elems = [
".shareable-quote",
".share-bar",
# Remove the last two h2s and all paragraphs below.
".interview-body > h2:last-of-type ~ p",
".interview-body > h2:last-of-type",
".interview-body > h2:last-of-type ~ p",
".interview-body > h2:last-of-type",
]
il = FeedEntryItemLoader(
response=response,
Expand Down Expand Up @@ -93,6 +87,7 @@ cleaning up the content (removing share buttons, etc.):
* :ref:`spider_addendum.org`
* :ref:`spider_arstechnica.com`
* :ref:`spider_derstandard.at`
* :ref:`spider_dietiwag.org`
* :ref:`spider_gnucash.org`
* :ref:`spider_lwn.net`
* :ref:`spider_orf.at`
Expand Down Expand Up @@ -140,6 +135,7 @@ scraping from there.
* :ref:`spider_indiehackers.com`
* :ref:`spider_openwrt.org`
* :ref:`spider_puls4.com`
* :ref:`spider_python-patterns.guide`
* :ref:`spider_usenix.org`
* :ref:`spider_verbraucherrecht.at`
* :ref:`spider_wienerlinien.at`
Expand Down
16 changes: 16 additions & 0 deletions docs/spiders/dietiwag.org.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
.. _spider_dietiwag.org:

dietiwag.org
------------
Latest articles of `dietiwag.org <http://www.dietiwag.org>`_.

Configuration
~~~~~~~~~~~~~
Add ``dietiwag.org`` to the list of spiders:

.. code-block:: ini

   # List of spiders to run by default, one per line.
   spiders =
     dietiwag.org
21 changes: 21 additions & 0 deletions docs/spiders/python-patterns.guide.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
.. _spider_python-patterns.guide:

python-patterns.guide
---------------------
The latest articles from python-patterns.guide_. Since articles on
python-patterns.guide_ do not have a publication date, the ``Last-Modified``
header is used for the updated field which might not be accurate or stable.
That is, old articles might show a newer value in the updated field even if
they were not actually updated.

Configuration
~~~~~~~~~~~~~
Add ``python-patterns.guide`` to the list of spiders:

.. code-block:: ini

   # List of spiders to run by default, one per line.
   spiders =
     python-patterns.guide
.. _python-patterns.guide: http://python-patterns.guide
1 change: 1 addition & 0 deletions feeds/spiders/addendum_org.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ def _parse_article(self, response):
"style",
".icon-date",
".callToAction__button",
'a[href^="http://partners.webmasterplan.com/click.asp"]',
]
change_tags = {
"div.heroStage__introText": "strong",
Expand Down
64 changes: 64 additions & 0 deletions feeds/spiders/dietiwag_org.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import re

import scrapy

from feeds.loaders import FeedEntryItemLoader
from feeds.spiders import FeedsXMLFeedSpider


class DieTiwagOrgSpider(FeedsXMLFeedSpider):
    """Scrape the latest articles of dietiwag.org via its RSS feed.

    Follows each feed item to the full article page and extracts the
    cleaned-up article body.
    """

    name = "dietiwag.org"
    allowed_domains = [name]
    start_urls = ["http://www.dietiwag.org/rss.xml.php"]

    _title = "dietiwag.org"
    _subtitle = "die andere seite der tiroler wasser kraft"
    _link = "http://www.{}".format(name)
    _icon = "http://www.{}/favicon.ico".format(name)

    def parse_node(self, response, node):
        """Parse one RSS <item> node and request the full article page.

        Yields a scrapy.Request for the article; the partially filled
        item loader travels along in the request meta.
        """
        il = FeedEntryItemLoader(selector=node)
        url = node.xpath("link/text()").extract_first()
        if not url:
            # Malformed feed item without a <link>: skip it instead of
            # crashing on scrapy.Request(None).
            return
        il.add_value("link", url)
        il.add_xpath("updated", "pubDate/text()")
        il.add_xpath(
            "title",
            "title/text()",
            # Use re.DOTALL since some titles have newlines in them.
            re=re.compile("(?:Artikel|Tagebuch): (.*)", re.DOTALL),
        )
        yield scrapy.Request(url, self._parse_article, meta={"il": il})

    def _parse_article(self, response):
        """Extract the cleaned article content from an article page."""
        remove_elems = [
            ".noprint",
            "form",
            ".lineall > font[size='2'] > b:first-child",
            # NOTE(review): the repeated :first-child/:last-child selectors
            # are presumably intentional — after each removal the next <br>
            # becomes first/last child, so duplicates strip several <br>s.
            # Confirm remove_elems is applied in order, once per entry.
            "font[size='2'] > br:first-child",
            "font[size='2'] > br:first-child",
            "font[size='2'] > br:last-child",
            "font[size='2'] > br:last-child",
            "font[size='2'] > br:last-child",
        ]
        replace_regex = {
            # Strip inline dates like "[14.09.2018]".
            r"\[\d{2}\.\d{2}\.\d{4}\]": "",
            # A0 is a non-breaking space in latin1.
            "\xA0": "",
            r"<br>\s*<br>\s*\d{1,2}\.\d{1,2}\.\d{4}\s*<br>": "",
        }
        il = FeedEntryItemLoader(
            response=response,
            base_url=response.url,
            remove_elems=remove_elems,
            replace_regex=replace_regex,
            parent=response.meta["il"],
        )
        il.add_css("author_name", ".sidebar .authors__name::text")
        if response.css(".printwidth2"):
            il.add_css("content_html", ".printwidth2 > font[size='2']")
            il.add_css("content_html", ".printwidth2 > font[size='3'] > font[size='2']")
        else:
            # Tagebuch (diary) pages use a different markup.
            il.add_css("content_html", ".lineall")
            il.add_value("category", "Tagebuch")
        yield il.load_item()
11 changes: 6 additions & 5 deletions feeds/spiders/falter_at.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,7 @@ def parse_lokalfuehrer(self, response):
dayfirst=False,
)
il.add_value(
"path",
"lokalfuehrer_{}".format(response.meta["lokalfuehrer"])
"path", "lokalfuehrer_{}".format(response.meta["lokalfuehrer"])
)
il.add_value(
"link", "https://www.{}/lokal/{}".format(self.name, entry["id"])
Expand Down Expand Up @@ -129,12 +128,14 @@ def parse_lokalfuehrer(self, response):
il.add_value("content_html", entry["category_text"])
il.add_value(
"content_html",
"<p>{} {}, {}</p>".format(entry["zip"], entry["city"], entry["street"])
"<p>{} {}, {}</p>".format(entry["zip"], entry["city"], entry["street"]),
)
il.add_value(
"content_html",
('<p><a href="https://www.google.com/maps?q={lat},{lon}">'
+ 'Google Maps</a></p>').format(**entry["location"])
(
'<p><a href="https://www.google.com/maps?q={lat},{lon}">'
+ "Google Maps</a></p>"
).format(**entry["location"]),
)
yield il.load_item()

Expand Down
37 changes: 37 additions & 0 deletions feeds/spiders/python_patterns_guide.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import scrapy

from feeds.loaders import FeedEntryItemLoader
from feeds.spiders import FeedsSpider


class PythonPatternsGuide(FeedsSpider):
    """Scrape articles from python-patterns.guide.

    Articles carry no publication date, so the HTTP ``Last-Modified``
    (or ``Date``) response header is used for the "updated" field, which
    may not be accurate or stable for old articles.
    """

    name = "python-patterns.guide"
    allowed_domains = [name]
    start_urls = ["http://{}".format(name)]

    _title = "Python Patterns"
    _link = "http://{}".format(name)

    def parse(self, response):
        """Follow every article linked from the start page's TOC."""
        for path in response.css(".toctree-l1 > a::attr(href)").extract():
            yield scrapy.Request(response.urljoin(path), self._parse_article)

    def _parse_article(self, response):
        """Build a feed entry from one article page."""
        remove_elems = ["h1", "#contents", ".headerlink"]
        change_tags = {".admonition-title": "h2"}
        il = FeedEntryItemLoader(
            response=response,
            base_url=response.url,
            remove_elems=remove_elems,
            change_tags=change_tags,
        )
        il.add_value("link", response.url)
        il.add_value("author_name", "Brandon Rhodes")
        # Use "Last-Modified" field or fall back to "Date". Guard against
        # both headers being absent: calling .decode() on None would raise
        # AttributeError and drop the whole entry.
        updated = response.headers.get(
            "Last-Modified", response.headers.get("Date")
        )
        if updated is not None:
            il.add_value("updated", updated.decode("ascii"))
        il.add_css("title", "title::text")
        il.add_css("content_html", "body > .section")
        yield il.load_item()

0 comments on commit ea4d396

Please sign in to comment.