Merge pull request #173 from Lukas0907/caching
Allow caching of objects from spider and generation of updated field
Lukas0907 committed Sep 24, 2018
2 parents 9144df0 + f614d04 commit 7fba74d
Showing 14 changed files with 384 additions and 170 deletions.
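This commit adds a small object-cache API on top of the existing HTTP cache: FeedsCache (see feeds/cache.py below) exposes get(), set() and setdefault() and falls back to an in-memory store when the HTTP cache is disabled. Below is a minimal sketch of how a spider might use it to keep a stable timestamp for an "updated" field across crawls; the helper name and key scheme are illustrative, not part of this commit:

# Sketch only: FeedsCache and setdefault() come from feeds/cache.py below; the
# helper, the key scheme and the "updated" handling are invented for
# illustration.
from datetime import datetime, timezone

from feeds.cache import FeedsCache


def updated_for(spider, item_id):
    """Return the same 'updated' timestamp for item_id on every crawl."""
    cache = FeedsCache(spider.crawler.settings)
    # setdefault() stores the default only if nothing is cached for the key
    # yet, so later crawls keep getting the value from the first sighting.
    return cache.setdefault(
        spider, key="updated/{}".format(item_id),
        default_obj=datetime.now(timezone.utc),
    )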
1 change: 1 addition & 0 deletions docs/development.rst
@@ -136,6 +136,7 @@ scraping from there.
* :ref:`spider_openwrt.org`
* :ref:`spider_puls4.com`
* :ref:`spider_python-patterns.guide`
* :ref:`spider_ubup.com`
* :ref:`spider_usenix.org`
* :ref:`spider_verbraucherrecht.at`
* :ref:`spider_wienerlinien.at`
25 changes: 25 additions & 0 deletions docs/spiders/ubup.com.rst
@@ -0,0 +1,25 @@
.. _spider_ubup.com:

ubup.com
--------
Items available for purchase at `ubup <https://www.ubup.com>`_.

Configuration
~~~~~~~~~~~~~
Add ``ubup.com`` to the list of spiders:

.. code-block:: ini

   # List of spiders to run by default, one per line.
   spiders =
     ubup.com

By default, `newest items <https://www.ubup.com/katalog?sortiertnach=neueste>`_
(from the first three pages) will be included. You can provide a list of links
if you want to limit the items to a specific brand or size.

.. code-block:: ini

   [ubup.com]
   links =
     /katalog?sortiertnach=neueste
4 changes: 4 additions & 0 deletions feeds.cfg.dist
@@ -141,3 +141,7 @@ useragent = feeds (+https://github.com/nblock/feeds)
# software
# telecom
# web

#[ubup.com]
#links =
# /katalog?sortiertnach=neueste
256 changes: 195 additions & 61 deletions feeds/cache.py
@@ -1,76 +1,210 @@
import hashlib
import logging
import os
import pickle
import shutil
from collections import defaultdict
from datetime import datetime, timedelta, timezone
from time import time

from scrapy.extensions.httpcache import FilesystemCacheStorage, DummyPolicy
from scrapy.utils.python import to_bytes
from scrapy.utils.request import request_fingerprint
import scrapy

IGNORE_HTTP_CODES = [403, 404] + list(range(500, 600))

logger = logging.getLogger(__name__)


def read_meta(root):
    with open(os.path.join(root, "pickled_meta"), "rb") as f:
        return pickle.load(f)


class FeedsCachePolicy(DummyPolicy):
    def should_cache_response(self, response, request):
        # We cache all responses regardless of HTTP code.
        return True


class FeedsCache:
def __init__(self, settings):
if settings.getbool("HTTPCACHE_ENABLED"):
self.storage = FeedsCacheStorage(settings)
else:
self.storage = FeedsCacheInMemoryStorage()

def get(self, spider, key):
return self.storage.retrieve_object(spider, key)

def set(self, spider, key, obj):
return self.storage.store_object(spider, key, obj)

def setdefault(self, spider, key, default_obj):
obj = self.storage.retrieve_object(spider, key)
if obj is not None:
return obj
self.storage.store_object(spider, key, default_obj)
return default_obj

def cleanup(self):
self.storage.cleanup()


class FeedsCacheStorage(FilesystemCacheStorage):
def __init__(self, settings):
super().__init__(settings)
# gzip is not supported
self.use_gzip = False
self._open = open
self.ignore_http_codes = [
int(x) for x in settings.getlist("HTTPCACHE_IGNORE_HTTP_CODES")
]

def retrieve_response(self, spider, request):
"""Return response if present in cache, or None otherwise."""
metadata = self._read_meta(spider, request)
if metadata is not None and metadata["status"] in self.ignore_http_codes:
# ignore cache entry for error responses
logger.debug("Response for {} not cached".format(request))
return
# Retrieve response from cache.
try:
return super().retrieve_response(spider, request)
finally:
logger.debug("Retrieved response for {} from cache".format(request))

def store_response(self, spider, request, response):
"""Store the given response in the cache."""
# Read the old metadata.
old_metadata = self._read_meta(spider, request)
# This will overwrite old metadata (if there is one).
super().store_response(spider, request, response)
# Read the new metadata.
metadata = self._read_meta(spider, request)
# Add the parents' fingerprints to the metadata and merge the parents from the
# old metadata. The last fingerprint is not included since it's the fingerprint
# of this request.
metadata["parents"] = list(
set(request.meta["fingerprints"][:-1]).union(
old_metadata["parents"] if old_metadata else []
)
)
if request.meta.get("cache_expires") is not None:
metadata["cache_expires"] = request.meta["cache_expires"].total_seconds()
metadata["type"] = "response"
# Write it back.
rpath = self._get_request_path(spider, request)
self._write_meta_to_path(rpath, metadata)

def _get_request_path(self, spider, request):
key = request_fingerprint(request, include_headers=["Cookie"])
return os.path.join(self.cachedir, spider.name, key[0:2], key)

def retrieve_object(self, spider, key):
metadata = self._read_meta(spider, key)
if metadata is None:
return None
path = self._get_key_path(spider, key)
with self._open(os.path.join(path, "object"), "rb") as f:
return pickle.load(f)

def store_object(self, spider, key, obj):
path = self._get_key_path(spider, key)
if not os.path.exists(path):
os.makedirs(path)
metadata = {"timestamp": time(), "type": "object"}
self._write_meta_to_path(path, metadata)
with self._open(os.path.join(path, "object"), "wb") as f:
pickle.dump(obj, f, protocol=2)

def _get_key_path(self, spider, key):
key = hashlib.sha1(to_bytes(key)).hexdigest()
return os.path.join(self.cachedir, spider.name, key[0:2], key)

def item_dropped(self, item, response, exception, spider):
self.remove_cache_entry(
self._get_request_path(spider, response.request), remove_parents=True
)

def _read_meta(self, spider, key):
if isinstance(key, scrapy.Request):
path = self._get_request_path(spider, key)
else:
path = self._get_key_path(spider, key)
return self._read_meta_from_path(path)

def _read_meta_from_path(self, path):
try:
with open(os.path.join(path, "pickled_meta"), "rb") as f:
return pickle.load(f)
except FileNotFoundError:
return None

def _write_meta_to_path(self, path, metadata):
with self._open(os.path.join(path, "meta"), "wb") as f:
f.write(to_bytes(repr(metadata)))
with self._open(os.path.join(path, "pickled_meta"), "wb") as f:
pickle.dump(metadata, f, protocol=2)

    def cleanup(self):
        """Remove expired or ignorable entries from the cache directory.

        An entry is removed if any of the following is true:

        - The cached response has a status code listed in
          HTTPCACHE_IGNORE_HTTP_CODES (e.g. 404).
        - Its own expiration time (``cache_expires``) has passed.
        - It is older than the global HTTPCACHE_EXPIRATION_SECS.
        """

logger.debug("Cleaning cache entries from {} ...".format(self.cachedir))

now = int(datetime.now(timezone.utc).timestamp())
for cache_entry_path, _dirs, files in os.walk(self.cachedir, topdown=False):
if "pickled_meta" in files:
meta = self._read_meta_from_path(cache_entry_path)
entry_expires_after = min(
meta.get("cache_expires", self.expiration_secs),
self.expiration_secs,
)
threshold = meta["timestamp"] + entry_expires_after
if now > threshold:
self.remove_cache_entry(cache_entry_path)
elif (
meta.get("type", "response") == "response"
and meta["status"] in self.ignore_http_codes
):
self.remove_cache_entry(cache_entry_path, remove_parents=True)
elif not os.path.samefile(cache_entry_path, self.cachedir):
# Try to delete parent directory of cache entries.
try:
os.rmdir(cache_entry_path)
except OSError:
# Not empty, don't care.
pass

logger.debug("Finished cleaning cache entries.")

def remove_cache_entry(self, cache_entry_path, remove_parents=False):
meta = self._read_meta_from_path(cache_entry_path)
if meta is None:
return

if remove_parents:
logger.debug(
"Removing parent cache entries for URL {}".format(meta["response_url"])
)
spider_root = os.path.dirname(os.path.dirname(cache_entry_path))
for fingerprint in meta["parents"]:
path = os.path.join(spider_root, fingerprint[0:2], fingerprint)
self.remove_cache_entry(path, remove_parents=False)

        logger.debug("Removing cache entry for URL {}".format(meta["response_url"]))
        shutil.rmtree(cache_entry_path, ignore_errors=True)


# Module-level helpers removed by this commit (superseded by the storage
# methods above):

def cleanup_cache(cache_dir, expires):
    """Removes cache entries in path.

    Entries are removed if one of the conditions is true:
    - Response has a certain status code (e.g. 404).
    - Individual expiration date is reached (compared to now).
    - Timestamp of entry and expires exceeds now.
    """

    if expires < timedelta(0):
        raise ValueError("expires must be a positive timedelta.")

    logger.debug("Cleaning cache entries from {} ...".format(cache_dir))

    now = datetime.now(timezone.utc)
    for cache_entry_path, _dirs, files in os.walk(cache_dir, topdown=False):
        if "pickled_meta" in files:
            meta = read_meta(cache_entry_path)
            try:
                entry_expires = timedelta(seconds=meta["cache_expires"])
            except KeyError:
                entry_expires = expires
            entry_expires = min(entry_expires, expires)
            threshold = (
                datetime.fromtimestamp(meta["timestamp"], tz=timezone.utc)
                + entry_expires
            )
            if now > threshold:
                remove_cache_entry(cache_entry_path)
            elif meta["status"] in IGNORE_HTTP_CODES:
                remove_cache_entry(cache_entry_path, remove_parents=True)
        elif not os.path.samefile(cache_entry_path, cache_dir):
            # Try to delete parent directory of cache entries.
            try:
                os.rmdir(cache_entry_path)
            except OSError:
                # Not empty, don't care.
                pass

    logger.debug("Finished cleaning cache entries.")


def remove_cache_entry(cache_entry_path, remove_parents=False):
    try:
        meta = read_meta(cache_entry_path)
    except FileNotFoundError:
        return

    if remove_parents:
        logger.debug(
            "Removing parent cache entries for URL {}".format(meta["response_url"])
        )
        spider_root = os.path.dirname(os.path.dirname(cache_entry_path))
        for fingerprint in meta["parents"]:
            path = os.path.join(spider_root, fingerprint[0:2], fingerprint)
            remove_cache_entry(path, remove_parents=False)

    logger.debug("Removing cache entry for URL {}".format(meta["response_url"]))
    shutil.rmtree(cache_entry_path, ignore_errors=True)


class FeedsCacheInMemoryStorage:
    def __init__(self):
        self.data = defaultdict(dict)

    def retrieve_object(self, spider, key):
        return self.data[spider].get(key)

    def store_object(self, spider, key, obj):
        self.data[spider][key] = obj

    def cleanup(self):
        pass
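As a recap of the expiry rule in FeedsCacheStorage.cleanup() above: an entry is dropped once the smaller of its own cache_expires and the global expiration has elapsed since its stored timestamp. A standalone sketch of that check (not part of the diff):

# Mirrors the expiry decision in cleanup(); meta is the dict kept in an
# entry's pickled_meta file.
from time import time


def is_expired(meta, expiration_secs):
    entry_expires_after = min(meta.get("cache_expires", expiration_secs), expiration_secs)
    return int(time()) > meta["timestamp"] + entry_expires_after


# An entry stored 10 days ago with a per-request cache_expires of 7 days is
# expired even though the global expiration is 14 days.
day = 24 * 60 * 60
print(is_expired({"timestamp": time() - 10 * day, "cache_expires": 7 * day}, 14 * day))  # True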
11 changes: 4 additions & 7 deletions feeds/cli.py
@@ -1,24 +1,21 @@
import logging
import os
from datetime import timedelta

import click
from scrapy.crawler import CrawlerProcess
from scrapy.utils.log import configure_logging
from scrapy.utils.project import data_path, get_project_settings
from scrapy.utils.project import get_project_settings
from twisted.python import failure

from feeds.cache import cleanup_cache
from feeds.cache import FeedsCache
from feeds.settings import load_feeds_settings

logger = logging.getLogger(__name__)


def run_cleanup_cache(settings):
    # Removed by this commit: the expiration was computed here in days and
    # cleanup_cache() was called on the cache directory directly.
    days = settings.getint("FEEDS_CONFIG_CACHE_EXPIRES")
    if days <= 0:
        raise ValueError("cache_expires must be >= 0.")
    cleanup_cache(data_path(settings.get("HTTPCACHE_DIR")), timedelta(days=days))
    # Added: cleanup is now delegated to FeedsCache, which selects the storage
    # backend itself.
    cache = FeedsCache(settings)
    cache.cleanup()


def spiders_to_crawl(process, argument_spiders):
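run_cleanup_cache() now works the same way for both storage backends. A quick standalone check of the selection logic (using Scrapy's Settings directly rather than the feeds CLI):

# Sketch, not part of the diff: with the HTTP cache disabled, FeedsCache falls
# back to the in-memory storage and cleanup() is a no-op.
from scrapy.settings import Settings

from feeds.cache import FeedsCache, FeedsCacheInMemoryStorage

settings = Settings({"HTTPCACHE_ENABLED": False})
cache = FeedsCache(settings)
assert isinstance(cache.storage, FeedsCacheInMemoryStorage)
cache.cleanup()  # nothing to clean for the in-memory backend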
9 changes: 4 additions & 5 deletions feeds/default_settings.py
@@ -40,12 +40,11 @@
}

HTTPCACHE_ENABLED = True
HTTPCACHE_STORAGE = "feeds.extensions.FeedsCacheStorage"
HTTPCACHE_POLICY = "scrapy.extensions.httpcache.DummyPolicy"
HTTPCACHE_STORAGE = "feeds.cache.FeedsCacheStorage"
HTTPCACHE_POLICY = "feeds.cache.FeedsCachePolicy"
HTTPCACHE_DIR = save_cache_path("feeds")
# We cache everything and delete cache entries (and every parent request) during
# cleanup.
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_EXPIRATION_SECS = FEEDS_CONFIG_CACHE_EXPIRES * 24 * 60 * 60
HTTPCACHE_IGNORE_HTTP_CODES = [403, 404] + list(range(500, 600))

# Default user agent. Can be overridden in feeds.cfg.
USER_AGENT = "feeds (+https://github.com/nblock/feeds)"
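With FeedsCachePolicy every response is cached, the global lifetime comes from FEEDS_CONFIG_CACHE_EXPIRES (days) converted to HTTPCACHE_EXPIRATION_SECS, and a single request can shorten its own lifetime via request.meta["cache_expires"], as handled in feeds/cache.py above. A hedged sketch of the per-request override; the spider, URL and callback are placeholders:

# Illustrative only: cache_expires is a timedelta; store_response() records its
# total seconds in the entry metadata and cleanup() expires the entry after
# min(cache_expires, HTTPCACHE_EXPIRATION_SECS).
from datetime import timedelta

import scrapy


class ExampleSpider(scrapy.Spider):
    name = "example"

    def start_requests(self):
        yield scrapy.Request(
            "https://example.com/feed",
            callback=self.parse,
            meta={"cache_expires": timedelta(hours=12)},
        )

    def parse(self, response):
        pass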
