Merge pull request #177 from Lukas0907/next
Add spider for kurier.at, tuwien.ac.at, various fixes
Lukas0907 committed Oct 4, 2018
2 parents 824baca + 6487213 commit 017d14f
Showing 22 changed files with 397 additions and 64 deletions.
2 changes: 2 additions & 0 deletions docs/development.rst
@@ -111,6 +111,7 @@ Some use a REST API which we can use to fetch the content.

* :ref:`spider_facebook.com`
* :ref:`spider_falter.at`
* :ref:`spider_kurier.at`
* :ref:`spider_oe1.orf.at`
* :ref:`spider_tvthek.orf.at`
* :ref:`spider_vice.com`
@@ -136,6 +137,7 @@ scraping from there.
* :ref:`spider_openwrt.org`
* :ref:`spider_puls4.com`
* :ref:`spider_python-patterns.guide`
* :ref:`spider_tuwien.ac.at`
* :ref:`spider_ubup.com`
* :ref:`spider_usenix.org`
* :ref:`spider_verbraucherrecht.at`
36 changes: 36 additions & 0 deletions docs/spiders/kurier.at.rst
@@ -0,0 +1,36 @@
.. _spider_kurier.at:

kurier.at
---------
Newest articles from Kurier.at_.

Configuration
~~~~~~~~~~~~~
Add ``kurier.at`` to the list of spiders:

.. code-block:: ini

   # List of spiders to run by default, one per line.
   spiders =
     kurier.at

kurier.at supports different channels via the ``channels`` parameter, articles
via the ``articles`` parameter and authors via the ``authors`` parameter (one
per line).

Example configuration:

.. code-block:: ini

   [kurier.at]
   channels =
     /chronik/wien
   articles =
     /meinung/pammesberger-2018-die-karikatur-zum-tag/309.629.015/slideshow
   authors =
     niki.glattauer
     guido.tartarotti
     florian.holzer
     barbara.kaufmann

.. _Kurier.at: https://kurier.at
15 changes: 15 additions & 0 deletions docs/spiders/tuwien.ac.at.rst
@@ -0,0 +1,15 @@
.. _spider_tuwien.ac.at:

tuwien.ac.at
------------
Newest Mitteilungsblätter issued by `TU Wien <https://tuwien.ac.at>`_.

Configuration
~~~~~~~~~~~~~
Add ``tuwien.ac.at`` to the list of spiders:

.. code-block:: ini

   # List of spiders to run by default, one per line.
   spiders =
     tuwien.ac.at
11 changes: 11 additions & 0 deletions feeds.cfg.dist
@@ -149,3 +149,14 @@ useragent = feeds (+https://github.com/nblock/feeds)
#[diepresse.com]
#sections =
# Meinung/Pizzicato

#[kurier.at]
#channels =
# /chronik/wien
#articles =
# /meinung/pammesberger-2018-die-karikatur-zum-tag/309.629.015/slideshow
#authors =
# niki.glattauer
# guido.tartarotti
# florian.holzer
# barbara.kaufmann
4 changes: 2 additions & 2 deletions feeds/cache.py
@@ -7,10 +7,10 @@
from datetime import datetime, timezone
from time import time

from scrapy.extensions.httpcache import FilesystemCacheStorage, DummyPolicy
import scrapy
from scrapy.extensions.httpcache import DummyPolicy, FilesystemCacheStorage
from scrapy.utils.python import to_bytes
from scrapy.utils.request import request_fingerprint
import scrapy

logger = logging.getLogger(__name__)

2 changes: 1 addition & 1 deletion feeds/default_settings.py
@@ -44,7 +44,7 @@
HTTPCACHE_POLICY = "feeds.cache.FeedsCachePolicy"
HTTPCACHE_DIR = save_cache_path("feeds")
HTTPCACHE_EXPIRATION_SECS = FEEDS_CONFIG_CACHE_EXPIRES * 24 * 60 * 60
HTTPCACHE_IGNORE_HTTP_CODES = [403, 404] + list(range(500, 600))
HTTPCACHE_IGNORE_HTTP_CODES = list(range(400, 600))

RETRY_ENABLED = True
# equals 5 requests in total
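The widened ``HTTPCACHE_IGNORE_HTTP_CODES`` range means that every 4xx and 5xx response is now excluded from the HTTP cache, not just 403/404 and the 5xx range, so error pages are re-fetched on the next run. A minimal sketch of the check this setting drives (a hypothetical helper for illustration, not the project's actual cache-policy code):

    # Responses with client or server errors are never stored in the cache
    # (other policy checks aside); anything else may be cached and reused.
    IGNORED_HTTP_CODES = set(range(400, 600))

    def may_cache_response(status: int) -> bool:
        return status not in IGNORED_HTTP_CODES

    assert may_cache_response(200)       # articles are cached
    assert not may_cache_response(404)   # missing pages are retried next run
    assert not may_cache_response(503)   # transient server errors are retried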
9 changes: 6 additions & 3 deletions feeds/loaders.py
@@ -4,6 +4,7 @@
from copy import deepcopy
from datetime import datetime
from textwrap import TextWrapper
from urllib.parse import quote_plus as urlquote_plus
from urllib.parse import urljoin

import dateparser
@@ -13,7 +14,7 @@
from lxml.cssselect import CSSSelector
from lxml.html.clean import Cleaner
from scrapy.loader import ItemLoader
from scrapy.loader.processors import Compose, Join, MapCompose, TakeFirst, Identity
from scrapy.loader.processors import Compose, Identity, Join, MapCompose, TakeFirst
from w3lib.html import remove_tags

from feeds.items import FeedEntryItem, FeedItem
@@ -207,8 +208,9 @@ def convert_footnotes(tree, loader_context):
for elem_sel in loader_context.get("convert_footnotes", []):
selector = CSSSelector(elem_sel)
for elem in selector(tree):
elem.tag = "small"
elem.text = " ({})".format(elem.text.strip())
if elem.text is not None:
elem.tag = "small"
elem.text = " ({})".format(elem.text.strip())

return [tree]

@@ -333,6 +335,7 @@ class BaseItemLoader(ItemLoader):
author_name_out = Join(", ")

# Optional
path_in = MapCompose(urlquote_plus)
path_out = Identity()


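With ``path_in = MapCompose(urlquote_plus)`` the item loader now percent-encodes every value added to the ``path`` field, which is why the spiders changed further down (diepresse.com and the generic spider) can pass raw section names or URLs and drop their own ``urlquote_plus`` calls. A small sketch of the effect, assuming Scrapy's ``MapCompose`` input processor:

    from urllib.parse import quote_plus as urlquote_plus

    from scrapy.loader.processors import MapCompose

    # Input processor for the "path" field: each value is percent-encoded
    # before it is stored on the item, so it is safe as a path component.
    path_in = MapCompose(urlquote_plus)

    assert path_in("/chronik/wien") == ["%2Fchronik%2Fwien"]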
2 changes: 1 addition & 1 deletion feeds/pipelines.py
@@ -1,5 +1,5 @@
from datetime import datetime, timezone
import uuid
from datetime import datetime, timezone

from scrapy import signals
from scrapy.exceptions import DropItem
10 changes: 2 additions & 8 deletions feeds/spidermiddlewares.py
@@ -3,8 +3,8 @@

from scrapy import Request, signals
from scrapy.exceptions import NotConfigured
from scrapy.utils.misc import load_object
from scrapy.spidermiddlewares.httperror import HttpError
from scrapy.utils.misc import load_object
from scrapy.utils.request import request_fingerprint

from feeds.exceptions import DropResponse
@@ -19,13 +19,7 @@ def from_crawler(cls, crawler):

def process_spider_exception(self, response, exception, spider):
if isinstance(exception, HttpError):
if response.status >= 500:
# Transient errors usually caused by overloaded sites, updates, short
# downtimes, etc.
lgr = logger.info
else:
lgr = logger.warning
lgr(
logger.info(
"Ignoring response %(response)r: HTTP status code is not "
"handled or not allowed",
{"response": response},
43 changes: 32 additions & 11 deletions feeds/spiders/addendum_org.py
@@ -1,4 +1,5 @@
import json
from copy import deepcopy
from functools import partial

import lxml
@@ -7,21 +8,29 @@

from feeds.loaders import FeedEntryItemLoader
from feeds.spiders import FeedsXMLFeedSpider
from feeds.utils import generate_feed_header


class AddendumOrgSpider(FeedsXMLFeedSpider):
name = "addendum.org"
start_urls = ["https://www.addendum.org/feed/rss2-addendum"]

feed_title = "Addendum"
feed_subtitle = "das, was fehlt"
feed_link = "https://www.{}".format(name)
feed_icon = (
"https://www.{}/resources/dist/favicons/android-chrome-192x192.png"
).format(name)
_max_articles = 10
_num_articles = 0

def feed_headers(self):
feeds = {"": "Addendum", "podcast": "Addendum Podcast"}
for path, title in feeds.items():
yield generate_feed_header(
title=title,
path=path,
subtitle="das, was fehlt",
link="https://www.{}".format(self.name),
icon=(
"https://www.{}/resources/dist/favicons/android-chrome-192x192.png"
).format(self.name),
)

def parse_node(self, response, node):
url = node.xpath("link/text()").extract_first()
if not node.xpath("category"):
@@ -145,8 +154,20 @@ def _inline_picture(elem):
il.add_css("updated", 'meta[property="article:published_time"]::attr(content)')
il.add_css("content_html", ".content")
for medium_id, medium_url in media.items():
il.add_value("enclosure_iri", medium_url)
il.add_value(
"enclosure_type", "audio/mp4" if medium_id in audio_ids else "video/mp4"
)
yield il.load_item()
if medium_id not in audio_ids:
il.add_value("enclosure_iri", medium_url)
il.add_value("enclosure_type", "video/mp4")
item = il.load_item()
# Save a copy before yielding it.
item_podcast = deepcopy(item)
yield item

if audio_ids:
# Export to podcast feed.
il = FeedEntryItemLoader(item=item_podcast)
il.add_value("path", "podcast")
for medium_id, medium_url in media.items():
if medium_id in audio_ids:
il.add_value("enclosure_iri", medium_url)
il.add_value("enclosure_type", "audio/mp4")
yield il.load_item()
3 changes: 2 additions & 1 deletion feeds/spiders/ak_ciando_com.py
@@ -21,7 +21,8 @@ def parse(self, response):

def parse_item(self, response):
il = FeedEntryItemLoader(
selector=response.xpath('//div[@id="maincontentbook"]'), base_url=self._link
selector=response.xpath('//div[@id="maincontentbook"]'),
base_url=self.feed_link,
)
il.add_xpath("title", '//h1[@class="p_book_title"]/text()')
il.add_xpath("title", '//h3[@class="p_book_title_ebook"]/text()')
8 changes: 4 additions & 4 deletions feeds/spiders/biblioweb_at.py
@@ -14,10 +14,10 @@ def start_requests(self):
"FEEDS_SPIDER_BIBLIOWEB_AT_LOCATION", ""
).lower()
if self._library:
self._path = self._library
self._title = "Bibliothek {}".format(self._library.title())
self._subtitle = "Neue Titel in der {}".format(self._title)
self._link = "https://www.biblioweb.at/{}/".format(self._library)
self.feed_path = self._library
self.feed_title = "Bibliothek {}".format(self._library.title())
self.feed_subtitle = "Neue Titel in der {}".format(self.feed_title)
self.feed_link = "https://www.biblioweb.at/{}/".format(self._library)
yield scrapy.Request(
"https://www.biblioweb.at/{}/start.asp".format(self._library),
callback=self.parse,
15 changes: 7 additions & 8 deletions feeds/spiders/diepresse_com.py
@@ -1,5 +1,4 @@
import re
from urllib.parse import quote_plus as urlquote_plus

import scrapy

@@ -24,15 +23,16 @@ def start_requests(self):
else:
self._sections = ["all"]
yield scrapy.Request(
"https://diepresse.com/files/sitemaps/news/news-sitemap.xml"
"https://diepresse.com/files/sitemaps/news/news-sitemap.xml",
meta={"dont_cache": True},
)

def feed_headers(self):
for section in self._sections:
yield generate_feed_header(
title="DiePresse.com/{}".format(section),
link="https://{}".format(self.name),
path=urlquote_plus(section),
path=section,
logo="http://diepresse.com/img/diepresse_250x40.png",
)

@@ -41,10 +41,9 @@ def parse_node(self, response, node):
il = FeedEntryItemLoader(selector=node)
il.add_value("link", url)
il.add_xpath("title", "news:news/news:title/text()")
il.add_value(
"category",
node.xpath("news:news/news:keywords/text()").extract_first().split(", "),
)
keywords = node.xpath("news:news/news:keywords/text()").extract_first()
if keywords:
il.add_value("category", keywords.split(", "))
il.add_xpath("updated", "news:news/news:publication_date/text()")
return scrapy.Request(url, self.parse_item, meta={"il": il})

@@ -93,5 +92,5 @@ def _clean_caption(elem):
if "all" in self._sections:
il.add_value("path", "all")
if section in self._sections:
il.add_value("path", urlquote_plus(section))
il.add_value("path", section)
return il.load_item()
12 changes: 3 additions & 9 deletions feeds/spiders/generic.py
@@ -1,8 +1,9 @@
import io
import itertools
from urllib.parse import quote_plus as urlquote_plus, urlparse, urljoin
from urllib.parse import urljoin, urlparse

import feedparser
import readability.readability
import scrapy
from readability.readability import Document, Unparseable

@@ -11,8 +12,6 @@
from feeds.utils import generate_feed_header

# Readability's output is not that interesting to justify log level "INFO".
import readability.readability

readability.readability.log.info = readability.readability.log.debug


@@ -31,12 +30,7 @@ def start_requests(self):
zip(fulltext_urls.split(), itertools.repeat(True)),
):
yield scrapy.Request(
url,
meta={
"dont_cache": True,
"fulltext": fulltext,
"path": urlquote_plus(url),
},
url, meta={"dont_cache": True, "fulltext": fulltext, "path": url}
)

def feed_headers(self):
2 changes: 1 addition & 1 deletion feeds/spiders/gnucash_org.py
@@ -22,7 +22,7 @@ class GnucashOrgSpider(FeedsXMLFeedSpider):

def parse_node(self, response, node):
# Reuse most of the existing fields
il = FeedEntryItemLoader(selector=node, base_url=self._link)
il = FeedEntryItemLoader(selector=node, base_url=self.feed_link)
il.add_xpath("title", "atom:title/text()")
il.add_xpath("link", "atom:link/@href")
il.add_xpath("author_name", "atom:author/atom:name/text()")
Expand Down
