Merge pull request #126 from Lukas0907/caching
Introduce custom cache storage that handles HTTP errors better.
Lukas0907 committed Jul 19, 2018
2 parents 449d3bf + 72bf053 commit 89bfd1f
Showing 5 changed files with 138 additions and 39 deletions.
41 changes: 32 additions & 9 deletions feeds/cache.py
@@ -1,12 +1,16 @@
 import logging
 import os
 import pickle
+import shutil
 from datetime import datetime
 
 logger = logging.getLogger(__name__)
 
 
-def _read_meta(root):
+IGNORE_HTTP_CODES = [404, 500, 502, 503, 504]
+
+
+def read_meta(root):
     with open(os.path.join(root, "pickled_meta"), "rb") as f:
         return pickle.load(f)

@@ -16,21 +20,40 @@ def cleanup_cache(cache_dir, max_age):
 
     logger.debug("Cleaning cache entries from {} ...".format(cache_dir))
 
-    for root, _dirs, files in os.walk(cache_dir, topdown=False):
+    for cache_entry_path, _dirs, files in os.walk(cache_dir, topdown=False):
         if "pickled_meta" in files:
-            meta = _read_meta(root)
+            meta = read_meta(cache_entry_path)
 
             timestamp = datetime.fromtimestamp(meta["timestamp"])
             if timestamp < max_age:
+                remove_cache_entry(cache_entry_path, meta["response_url"])
+            elif meta["status"] in IGNORE_HTTP_CODES:
+                remove_cache_entry(cache_entry_path, meta["response_url"])
                 logger.debug(
-                    "Removing cache entry for URL {}".format(meta["response_url"])
+                    "Removing parent cache entries for URL {}".format(
+                        meta["response_url"]
+                    )
                 )
-                for name in files:
-                    os.remove(os.path.join(root, name))
-                os.rmdir(root)
-        elif not os.path.samefile(root, cache_dir):
+                spider_root = os.path.dirname(os.path.dirname(cache_entry_path))
+                # Remove parents as well.
+                for fingerprint in meta["parents"]:
+                    path = os.path.join(spider_root, fingerprint[0:2], fingerprint)
+                    remove_cache_entry(path, read_meta(path)["response_url"])
+        elif not os.path.samefile(cache_entry_path, cache_dir):
             # Try to delete parent directory of cache entries.
             try:
-                os.rmdir(root)
+                os.rmdir(cache_entry_path)
             except OSError:
                 # Not empty, don't care.
                 pass
 
     logger.debug("Finished cleaning cache entries.")
+
+
+def remove_cache_entry(cache_entry_path, url):
+    if os.path.exists(cache_entry_path):
+        logger.debug("Removing cache entry for URL {}".format(url))
+        shutil.rmtree(cache_entry_path)
+    else:
+        logger.error("Cannot remove cache entry {} for URL {}".format(
+            cache_entry_path, url))
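
For reference, a minimal sketch of how the new cleanup might be driven (the cache directory and the 14-day cutoff below are illustrative assumptions, not part of this change):

    from datetime import datetime, timedelta

    from feeds.cache import cleanup_cache

    # Hypothetical values; the real ones come from the Feeds configuration.
    cache_dir = "/tmp/feeds/cache"
    max_age = datetime.now() - timedelta(days=14)  # cutoff passed as a datetime

    # Walks the cache bottom-up, drops entries older than the cutoff, drops
    # entries whose stored status is in IGNORE_HTTP_CODES, and also removes
    # the parent entries recorded in their metadata.
    cleanup_cache(cache_dir, max_age)
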
45 changes: 45 additions & 0 deletions feeds/extensions.py
@@ -1,4 +1,11 @@
+import os
+
+import pickle
 from scrapy import signals
+from scrapy.extensions.httpcache import FilesystemCacheStorage
+from scrapy.utils.python import to_bytes
+
+from feeds.cache import IGNORE_HTTP_CODES
 
 
 class SpiderSettings:
@@ -14,3 +21,41 @@ def spider_opened(self, spider):
     @classmethod
     def spider_settings(cls, spider):
         return spider.settings.get("FEEDS_CONFIG").get(spider.name, {})
+
+
+class FeedsCacheStorage(FilesystemCacheStorage):
+    def __init__(self, settings):
+        super().__init__(settings)
+        # gzip is not supported
+        self.use_gzip = False
+        self._open = open
+
+    def retrieve_response(self, spider, request):
+        """Return response if present in cache, or None otherwise."""
+        metadata = self._read_meta(spider, request)
+        if metadata is not None and metadata["status"] in IGNORE_HTTP_CODES:
+            return  # ignore cache entry for error responses
+        # Retrieve response from cache.
+        return super().retrieve_response(spider, request)
+
+    def store_response(self, spider, request, response):
+        """Store the given response in the cache."""
+        # Read the old metadata.
+        old_metadata = self._read_meta(spider, request)
+        # This will overwrite old metadata (if there is one).
+        super().store_response(spider, request, response)
+        # Read the new metadata.
+        metadata = self._read_meta(spider, request)
+        # Add the parents' fingerprints to the metadata and merge the parents from the
+        # old metadata.
+        metadata["parents"] = list(
+            set(request.meta["fingerprints"]).union(
+                old_metadata["parents"] if old_metadata else []
+            )
+        )
+        # Write it back.
+        rpath = self._get_request_path(spider, request)
+        with self._open(os.path.join(rpath, "meta"), "wb") as f:
+            f.write(to_bytes(repr(metadata)))
+        with self._open(os.path.join(rpath, "pickled_meta"), "wb") as f:
+            pickle.dump(metadata, f, protocol=2)
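
In effect, every cache entry directory now records the fingerprints of the requests that led to it. A small sketch of inspecting such an entry on disk (the path is made up; the layout comes from Scrapy's FilesystemCacheStorage):

    import os
    import pickle

    # Hypothetical entry: <HTTPCACHE_DIR>/<spider name>/<fp[0:2]>/<fingerprint>
    rpath = ".scrapy/cache/example/ab/abcdef0123456789"

    with open(os.path.join(rpath, "pickled_meta"), "rb") as f:
        metadata = pickle.load(f)

    # Scrapy's usual fields plus the "parents" list added by store_response().
    print(metadata["status"])   # e.g. 404
    print(metadata["parents"])  # e.g. ["1a2b3c...", "4d5e6f..."]
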
27 changes: 0 additions & 27 deletions feeds/middlewares.py

This file was deleted.

10 changes: 7 additions & 3 deletions feeds/settings.py
@@ -26,13 +26,17 @@
 
 EXTENSIONS = {"feeds.extensions.SpiderSettings": 500}
 
-SPIDER_MIDDLEWARES = {"feeds.middlewares.FeedsHttpErrorMiddleware": 51}
+SPIDER_MIDDLEWARES = {
+    "feeds.spidermiddlewares.FeedsHttpErrorMiddleware": 51,
+    "feeds.spidermiddlewares.FeedsHttpCacheMiddleware": 1000,
+}
 
 HTTPCACHE_ENABLED = False
-HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
+HTTPCACHE_STORAGE = "feeds.extensions.FeedsCacheStorage"
 HTTPCACHE_POLICY = "scrapy.extensions.httpcache.DummyPolicy"
 HTTPCACHE_DIR = "cache"
-HTTPCACHE_IGNORE_HTTP_CODES = [404, 500, 502, 503, 504]
+# Never cache redirects since they are not processed by the FeedsHttpCacheMiddleware.
+HTTPCACHE_IGNORE_HTTP_CODES = [301, 302, 303, 307, 308]
 
 # Default user agent. Can be overriden in feeds.cfg.
 USER_AGENT = "feeds (+https://github.com/nblock/feeds)"
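
The new redirect codes work together with the DummyPolicy configured above: the policy refuses to cache any response whose status is listed in HTTPCACHE_IGNORE_HTTP_CODES. A simplified sketch of that behaviour (not the actual Scrapy source):

    class DummyPolicySketch:
        """Simplified stand-in for scrapy.extensions.httpcache.DummyPolicy."""

        def __init__(self, settings):
            self.ignore_http_codes = [
                int(x) for x in settings.getlist("HTTPCACHE_IGNORE_HTTP_CODES")
            ]

        def should_cache_response(self, response, request):
            # With the setting above, 301/302/303/307/308 responses are never
            # written to the cache, so FeedsCacheStorage only ever stores the
            # final, post-redirect responses.
            return response.status not in self.ignore_http_codes
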
54 changes: 54 additions & 0 deletions feeds/spidermiddlewares.py
@@ -0,0 +1,54 @@
+from copy import copy
+import logging
+
+from scrapy import Request
+from scrapy.spidermiddlewares.httperror import HttpError
+from scrapy.utils.request import request_fingerprint
+
+logger = logging.getLogger(__name__)
+
+
+class FeedsHttpErrorMiddleware:
+    @classmethod
+    def from_crawler(cls, crawler):
+        return cls()
+
+    def process_spider_exception(self, response, exception, spider):
+        if isinstance(exception, HttpError):
+            if response.status in [500, 502, 503, 504]:
+                # These status codes are usually induced by overloaded sites,
+                # updates, short downtimes, etc. and are not that relevant.
+                lgr = logger.info
+            else:
+                lgr = logger.warning
+            lgr(
+                "Ignoring response %(response)r: HTTP status code is not "
+                "handled or not allowed",
+                {"response": response},
+                extra={"spider": spider},
+            )
+            return []
+
+
+class FeedsHttpCacheMiddleware:
+    @classmethod
+    def from_crawler(cls, crawler):
+        return cls()
+
+    def process_spider_output(self, response, result, spider):
+        def _set_fingerprint(response, r):
+            if isinstance(r, Request):
+                try:
+                    r.meta["fingerprints"] = copy(response.request.meta["fingerprints"])
+                except KeyError:
+                    r.meta["fingerprints"] = []
+                if not response.request.meta.get("dont_cache", False):
+                    fingerprint = request_fingerprint(response.request)
+                    r.meta["fingerprints"].append(fingerprint)
+                    logger.debug("Request fingerprints for request {}: {}".format(
+                        r, r.meta["fingerprints"]))
+                else:
+                    logger.debug("Skipping fingerprinting uncached request {}".format(
+                        response.request))
+            return r
+        return (_set_fingerprint(response, r) for r in result or ())
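
To illustrate what _set_fingerprint() does, here is a rough sketch of how fingerprints accumulate along a request chain (URLs and the manual wiring are illustrative only; in a crawl Scrapy calls the middleware itself):

    from scrapy import Request
    from scrapy.http import HtmlResponse
    from scrapy.utils.request import request_fingerprint

    # Response to the start request; it has no recorded ancestors yet.
    start_request = Request("http://example.com/feed", meta={"fingerprints": []})
    listing = HtmlResponse(url=start_request.url, request=start_request, body=b"")

    # A follow-up request yielded by the spider while parsing the listing page.
    item_request = Request("http://example.com/item/1")

    # Equivalent to what the middleware does for each yielded request:
    item_request.meta["fingerprints"] = list(listing.request.meta["fingerprints"])
    item_request.meta["fingerprints"].append(request_fingerprint(listing.request))

    # If http://example.com/item/1 is later cached with an error status,
    # cleanup_cache() can follow these fingerprints and also remove the cached
    # listing page, so both are fetched again on the next run.
    print(item_request.meta["fingerprints"])
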
