Commit

Merge pull request #175 from Lukas0907/next
Disable dupe filter, remove manually generated updated fields
Lukas0907 committed Sep 30, 2018
2 parents 7fba74d + ff08d98 commit 824baca
Showing 45 changed files with 604 additions and 375 deletions.
4 changes: 2 additions & 2 deletions docs/development.rst
@@ -20,9 +20,8 @@ Writing a spider is easy! Consider the slightly simplified spider for
class IndieHackersComSpider(FeedsSpider):
name = "indiehackers.com"
allowed_domains = [name]
start_urls = ["https://www.indiehackers.com/interviews/page/1"]
_title = "Indie Hackers"
feed_title = "Indie Hackers"
def parse(self, response):
interview_links = response.css(".interview__link::attr(href)").extract()
@@ -120,6 +119,7 @@ Utilizing the sitemap
~~~~~~~~~~~~~~~~~~~~~
Others provide a sitemap_ which we can parse:

* :ref:`spider_diepresse.com`
* :ref:`spider_profil.at`

Custom extraction
30 changes: 30 additions & 0 deletions docs/spiders/diepresse.com.rst
@@ -0,0 +1,30 @@
.. _spider_diepresse.com:

diepresse.com
-------------
Newest articles from DiePresse.com_.

Configuration
~~~~~~~~~~~~~
Add ``diepresse.com`` to the list of spiders:

.. code-block:: ini
# List of spiders to run by default, one per line.
spiders =
diepresse.com
diepresse.com supports different sections (ressorts) via the ``sections``
parameter (one per line). If no section is given, ``all`` is used which is a
catch-all section that includes all articles. Sections must match exactly, not
only partially.

Example configuration:

.. code-block:: ini
[diepresse.com]
sections =
Meinung/Pizzicato
.. _DiePresse.com: https://www.diepresse.com
4 changes: 4 additions & 0 deletions feeds.cfg.dist
@@ -145,3 +145,7 @@ useragent = feeds (+https://github.com/nblock/feeds)
#[ubup.com]
#links =
# /katalog?sortiertnach=neueste

#[diepresse.com]
#sections =
# Meinung/Pizzicato
2 changes: 1 addition & 1 deletion feeds/cache.py
@@ -116,7 +116,7 @@ def _get_key_path(self, spider, key):
key = hashlib.sha1(to_bytes(key)).hexdigest()
return os.path.join(self.cachedir, spider.name, key[0:2], key)

def item_dropped(self, item, response, exception, spider):
def remove_response(self, response, spider):
self.remove_cache_entry(
self._get_request_path(spider, response.request), remove_parents=True
)
8 changes: 8 additions & 0 deletions feeds/default_settings.py
@@ -46,6 +46,14 @@
HTTPCACHE_EXPIRATION_SECS = FEEDS_CONFIG_CACHE_EXPIRES * 24 * 60 * 60
HTTPCACHE_IGNORE_HTTP_CODES = [403, 404] + list(range(500, 600))

RETRY_ENABLED = True
# equals 5 requests in total
RETRY_TIMES = 4

# Don't filter duplicates.
# Spiders sometimes produce feeds with potentially overlapping items.
DUPEFILTER_CLASS = "scrapy.dupefilters.BaseDupeFilter"

# Default user agent. Can be overriden in feeds.cfg.
USER_AGENT = "feeds (+https://github.com/nblock/feeds)"

2 changes: 1 addition & 1 deletion feeds/downloadermiddlewares.py
@@ -10,4 +10,4 @@ def from_crawler(cls, crawler):
return o

def item_dropped(self, item, response, exception, spider):
self.storage.item_dropped(item, response, exception, spider)
self.storage.remove_response(response, spider)
6 changes: 6 additions & 0 deletions feeds/exceptions.py
@@ -2,3 +2,9 @@ class FeedsException(Exception):
"""
Base exception class for all exceptions thrown by Feeds.
"""


class DropResponse(FeedsException):
def __init__(self, message, transient=False):
self.transient = transient
super().__init__(message)
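A minimal sketch of how a spider callback might raise the new DropResponse exception when a fetched page should not become a feed entry. The spider, its selectors, and the item fields are illustrative, and the import path of the base spider is assumed from the project layout; the new process_spider_exception() hook in FeedsHttpCacheMiddleware further below catches the exception, logs it, and removes the cached response so the page can be retried later.

    from feeds.exceptions import DropResponse
    from feeds.loaders import FeedEntryItemLoader
    from feeds.spiders import FeedsSpider  # assumed location of the base spider

    class ExampleComSpider(FeedsSpider):
        name = "example.com"  # hypothetical spider

        def parse_item(self, response):
            # Hypothetical check: the article body is missing, so this response
            # should not produce an entry.
            if not response.css(".article__body"):
                raise DropResponse(
                    "Skipping {} because the article body is missing".format(
                        response.url
                    ),
                    # transient=True is logged at INFO instead of WARNING; either
                    # way the cached response is removed by the middleware.
                    transient=True,
                )
            il = FeedEntryItemLoader(response=response)
            il.add_value("link", response.url)
            il.add_css("title", "h1::text")
            return il.load_item()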
24 changes: 16 additions & 8 deletions feeds/exporters.py
@@ -1,5 +1,6 @@
import logging
import os
from copy import deepcopy
from urllib.parse import urljoin

from lxml import etree
@@ -190,11 +191,18 @@ def finish_exporting(self):
)

def export_item(self, item):
path = os.path.join(self._name, item.pop("path", ""), "feed.atom")
if path not in self._feeds:
if self._output_url:
link_self = urljoin(self._output_url, path)
else:
link_self = None
self._feeds[path] = self.AtomFeed(exporter=self, link_self=link_self)
self._feeds[path].add_item(item)
for path in item.pop("path", [""]):
path = os.path.join(self._name, path, "feed.atom")
if path not in self._feeds:
if self._output_url:
link_self = urljoin(self._output_url, path)
else:
link_self = None
self._feeds[path] = self.AtomFeed(exporter=self, link_self=link_self)
# add_item() is destructive, so add a copy.
self._feeds[path].add_item(deepcopy(item))

# Pop content fields since we don't want to have them in scrapy's debug
# output.
item.pop("content_html", None)
item.pop("content_text", None)
77 changes: 58 additions & 19 deletions feeds/loaders.py
@@ -1,6 +1,5 @@
import html
import logging
import os
import re
from copy import deepcopy
from datetime import datetime
@@ -14,14 +13,18 @@
from lxml.cssselect import CSSSelector
from lxml.html.clean import Cleaner
from scrapy.loader import ItemLoader
from scrapy.loader.processors import Compose, Identity, Join, MapCompose, TakeFirst
from scrapy.loader.processors import Compose, Join, MapCompose, TakeFirst, Identity
from w3lib.html import remove_tags

from feeds.items import FeedEntryItem, FeedItem
from feeds.settings import get_feeds_settings

logger = logging.getLogger(__name__)

_lxml_cleaner = Cleaner(
scripts=True, javascript=True, comments=True, style=True, inline_style=True
)


def parse_datetime(date_time, loader_context):
if isinstance(date_time, datetime):
@@ -84,7 +87,7 @@ def make_links_absolute(tree):
return [tree]


def cleanup_html(tree, loader_context):
def pullup_elems(tree, loader_context):
for elem_child, parent_dist in loader_context.get("pullup_elems", {}).items():
selector = CSSSelector(elem_child)
for elem in selector(tree):
@@ -101,16 +104,39 @@ def cleanup_html(tree, loader_context):
)
)

for elem_sel, elem_new in loader_context.get("replace_elems", {}).items():
elem_new = lxml.html.fragment_fromstring(elem_new)
return [tree]


def replace_elems(tree, loader_context):
for elem_sel, elem_repl in loader_context.get("replace_elems", {}).items():
selector = CSSSelector(elem_sel)
for elem in selector(tree):
# New element could be replaced more than once but every node must be a
# different element.
elem_new_copy = deepcopy(elem_new)
elem_new_copy.tail = elem.tail
elem.getparent().replace(elem, elem_new_copy)
# If elem_repl is callable, call it to create a new element (or just modify
# the old one).
if callable(elem_repl):
elem_new = elem_repl(elem)
else:
elem_new = elem_repl

# The new element is None, just remove the old one.
if elem_new is None:
elem.drop_tree()
else:
if isinstance(elem_new, str):
# The new element is a string, create a proper element out of it.
elem_new = lxml.html.fragment_fromstring(elem_new)
else:
# Create a copy of elem_new in case the element should be used as a
# replacement more than once.
elem_new = deepcopy(elem_new)
# Take care to preserve the tail of the old element.
elem_new.tail = elem.tail
elem.getparent().replace(elem, elem_new)

return [tree]


def remove_elems(tree, loader_context):
remove_elems = []

settings = get_feeds_settings()
@@ -128,6 +154,10 @@ def cleanup_html(tree, loader_context):
for elem in tree.xpath(elem_sel):
elem.drop_tree()

return [tree]


def change_attribs(tree, loader_context):
# Change attrib names.
for elem_sel, attribs in loader_context.get("change_attribs", {}).items():
selector = CSSSelector(elem_sel)
@@ -139,13 +169,20 @@ def cleanup_html(tree, loader_context):
# If attribs[attrib] is None, attrib is removed instead of
# renamed.
elem.attrib[attribs[attrib]] = old_attrib_value
return [tree]


def change_tags(tree, loader_context):
# Change tag names.
for elem_sel, elem_tag in loader_context.get("change_tags", {}).items():
selector = CSSSelector(elem_sel)
for elem in selector(tree):
elem.tag = elem_tag

return [tree]


def cleanup_html(tree, loader_context):
# tree.iter() iterates over the tree including the root node.
for elem in tree.iter():
# Remove class and id attribute from all elements which are not needed
@@ -161,10 +198,7 @@ def cleanup_html(tree, loader_context):


def lxml_cleaner(tree):
cleaner = Cleaner(
scripts=True, javascript=True, comments=True, style=True, inline_style=True
)
cleaner(tree)
_lxml_cleaner(tree)
return [tree]


@@ -174,15 +208,15 @@ def convert_footnotes(tree, loader_context):
selector = CSSSelector(elem_sel)
for elem in selector(tree):
elem.tag = "small"
elem.text = " ({})".format(elem.text)
elem.text = " ({})".format(elem.text.strip())

return [tree]


def convert_iframes(tree, loader_context):
"""Convert iframes to divs with links to its src.
convert_iframes() is called after cleanup_html() so that unwanted iframes can be
convert_iframes() is called after remove_elems() so that unwanted iframes can be
eliminated first.
"""
base_url = loader_context.get("base_url", None) if loader_context else None
@@ -287,7 +321,7 @@ class BaseItemLoader(ItemLoader):
# Defaults
# Unescape twice to get rid of &&xxx; encoding errors.
default_input_processor = MapCompose(
skip_false, str.strip, html.unescape, html.unescape
str.strip, skip_false, html.unescape, html.unescape
)
default_output_processor = TakeFirst()

@@ -299,7 +333,7 @@ class BaseItemLoader(ItemLoader):
author_name_out = Join(", ")

# Optional
path_out = Join(os.sep)
path_out = Identity()


class FeedItemLoader(BaseItemLoader):
@@ -318,6 +352,10 @@ class FeedEntryItemLoader(BaseItemLoader):
replace_regex,
build_tree,
convert_footnotes,
replace_elems,
remove_elems,
change_attribs,
change_tags,
cleanup_html,
convert_iframes,
lxml_cleaner,
@@ -327,4 +365,5 @@ class FeedEntryItemLoader(BaseItemLoader):
)
content_html_out = Compose(Join(), truncate_text)

category_out = Identity()
# Use sorted to keep the output stable.
category_out = Compose(set, sorted)
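The HTML cleanup pipeline is now split into separate steps (replace_elems, remove_elems, change_attribs, change_tags, cleanup_html), and replace_elems accepts a string, a prebuilt element, a callable, or None as the replacement. A hedged sketch of how a spider callback might feed these options through the loader context: the selectors, replacement values, and the markup they target are purely illustrative, and extra keyword arguments simply end up in the loader context as with any Scrapy item loader.

    import lxml.html

    from feeds.loaders import FeedEntryItemLoader


    def unlazy_image(elem):
        # Callable replacement: build a plain <img> from a lazy-loading
        # data-src attribute; replace_elems() swaps it in for the match.
        return lxml.html.fragment_fromstring(
            '<img src="{}">'.format(elem.get("data-src", ""))
        )


    def parse_item(response):
        il = FeedEntryItemLoader(
            response=response,
            remove_elems=[".ad", "script"],        # dropped entirely
            replace_elems={
                "img[data-src]": unlazy_image,     # callable -> returned element
                ".newsletter-box": None,           # None -> element is removed
                "hr.fancy": "<hr>",                # string -> parsed and inserted
            },
            change_tags={"h1": "h2"},              # demote in-article headings
        )
        il.add_value("link", response.url)
        il.add_css("title", "h1::text")
        il.add_css("content_html", ".article__body")
        return il.load_item()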
25 changes: 24 additions & 1 deletion feeds/spidermiddlewares.py
@@ -2,9 +2,13 @@
from copy import copy

from scrapy import Request, signals
from scrapy.exceptions import NotConfigured
from scrapy.utils.misc import load_object
from scrapy.spidermiddlewares.httperror import HttpError
from scrapy.utils.request import request_fingerprint

from feeds.exceptions import DropResponse

logger = logging.getLogger(__name__)


@@ -31,12 +35,18 @@ def process_spider_exception(self, response, exception, spider):


class FeedsHttpCacheMiddleware:
def __init__(self, settings):
if not settings.getbool("HTTPCACHE_ENABLED"):
raise NotConfigured
self.storage = load_object(settings["HTTPCACHE_STORAGE"])(settings)

@classmethod
def from_crawler(cls, crawler):
mw = cls()
mw = cls(crawler.settings)

# Note: this hook is a bit of a hack to intercept redirections
crawler.signals.connect(mw.request_scheduled, signal=signals.request_scheduled)

return mw

def process_spider_output(self, response, result, spider):
@@ -66,3 +76,16 @@ def request_scheduled(self, request, spider):
request.meta["fingerprints"].append(fingerprint)
else:
logger.debug("Skipping fingerprinting uncached request {}".format(request))

def process_spider_exception(self, response, exception, spider):
# Note that due to Scrapy bug #220 this is *not* called if DropResponse is
# raised from a generator.
# See also https://github.com/scrapy/scrapy/issues/220.
if isinstance(exception, DropResponse):
if exception.transient:
lgr = logger.info
else:
lgr = logger.warning
lgr(exception)
self.storage.remove_response(response, spider)
return []
