Merge pull request #143 from Lukas0907/cache-expiration
Allow individual cache expiration for requests
Lukas0907 committed Aug 7, 2018
2 parents f9ddcb4 + b69f045 commit f196905
Showing 8 changed files with 50 additions and 16 deletions.
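The gist of the change: a spider can now attach an individual "cache_expires" timedelta to request.meta, and the cache honours it in addition to the global FEEDS_CONFIG_CACHE_EXPIRES setting; the shorter of the two wins. A minimal sketch of the new usage, assuming a plain Scrapy spider (the spider name, URL and callback are placeholders; only the "cache_expires" meta key comes from this commit):

from datetime import timedelta

import scrapy


class ExampleSpider(scrapy.Spider):
    name = "example"

    def start_requests(self):
        # Cache entries for this request expire after one day, even if the
        # global FEEDS_CONFIG_CACHE_EXPIRES setting would keep them longer.
        yield scrapy.Request(
            "https://example.com/feed",
            meta={"cache_expires": timedelta(days=1)},
        )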
3 changes: 1 addition & 2 deletions docs/spiders/lwn.net.rst
@@ -5,8 +5,7 @@ lwn.net
Newest articles from LWN_ with special treatment of LWN_ Weekly Editions.
Please note that LWN_ requires the cache to be enabled to minimize useless
requests. In case you provide username and password, the session (cookie) is
also cached until the cache entry expires. The session cookie is valid for a
month so to avoid disruptions, set the cache expiry time to less than that.
also cached until the cache entry expires.

Configuration
~~~~~~~~~~~~~
32 changes: 27 additions & 5 deletions feeds/cache.py
@@ -2,7 +2,7 @@
import os
import pickle
import shutil
from datetime import datetime
from datetime import datetime, timedelta, timezone

logger = logging.getLogger(__name__)

@@ -15,16 +15,38 @@ def read_meta(root):
return pickle.load(f)


def cleanup_cache(cache_dir, max_age):
""" Removes cache entries in path that are older than max_age. """
def cleanup_cache(cache_dir, expires):
"""Removes cache entries in path.
Entries are removed if one of the following conditions is true:
- Response has a certain status code (e.g. 404).
- The entry's individual expiration date is reached (compared to now).
- The entry's timestamp plus expires exceeds now.
"""

if expires < timedelta(0):
raise ValueError("expires must be a positive timedelta.")

logger.debug("Cleaning cache entries from {} ...".format(cache_dir))

now = datetime.now(timezone.utc)
for cache_entry_path, _dirs, files in os.walk(cache_dir, topdown=False):
if "pickled_meta" in files:
meta = read_meta(cache_entry_path)
timestamp = datetime.fromtimestamp(meta["timestamp"])
if timestamp < max_age:
logger.debug("Checking cache entry for URL {}".format(meta["response_url"]))
try:
entry_expires = timedelta(seconds=meta["cache_expires"])
except KeyError:
entry_expires = expires
entry_expires = min(entry_expires, expires)
threshold = (
datetime.fromtimestamp(meta["timestamp"], tz=timezone.utc)
+ entry_expires
)
logger.debug(
"Entry expires after {} at {}".format(entry_expires, threshold)
)
if now > threshold:
remove_cache_entry(cache_entry_path)
elif meta["status"] in IGNORE_HTTP_CODES:
remove_cache_entry(cache_entry_path, remove_parents=True)
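Pulled out of the loop above, the expiration decision works like this (a standalone sketch; the metadata dict stands in for a pickled cache entry, and the timestamp and seconds values are made up):

from datetime import datetime, timedelta, timezone

expires = timedelta(days=14)  # global expiry, e.g. FEEDS_CONFIG_CACHE_EXPIRES=14
meta = {"timestamp": 1533600000, "cache_expires": 3600.0}  # example pickled_meta values

# Per-entry expiry, capped by the global value.
entry_expires = min(timedelta(seconds=meta["cache_expires"]), expires)
# An entry is stale once its timestamp plus the effective expiry lies in the past.
threshold = datetime.fromtimestamp(meta["timestamp"], tz=timezone.utc) + entry_expires
stale = datetime.now(timezone.utc) > threshold  # True -> the entry would be removed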
8 changes: 4 additions & 4 deletions feeds/cli.py
@@ -1,6 +1,6 @@
import logging
import os
from datetime import datetime, timedelta
from datetime import timedelta

import click
from scrapy.crawler import CrawlerProcess
@@ -16,9 +16,9 @@

def run_cleanup_cache(settings):
days = settings.getint("FEEDS_CONFIG_CACHE_EXPIRES")
cleanup_cache(
data_path(settings.get("HTTPCACHE_DIR")), datetime.now() - timedelta(days=days)
)
if days <= 0:
raise ValueError("cache_expires must be >= 0.")
cleanup_cache(data_path(settings.get("HTTPCACHE_DIR")), timedelta(days=days))


def spiders_to_crawl(process, argument_spiders):
5 changes: 5 additions & 0 deletions feeds/extensions.py
@@ -39,6 +39,11 @@ def store_response(self, spider, request, response):
old_metadata["parents"] if old_metadata else []
)
)
if (
"cache_expires" in request.meta
and request.meta["cache_expires"] is not None
):
metadata["cache_expires"] = request.meta["cache_expires"].total_seconds()
# Write it back.
rpath = self._get_request_path(spider, request)
with self._open(os.path.join(rpath, "meta"), "wb") as f:
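The extension stores the timedelta as plain seconds via total_seconds(), and cleanup_cache() reconstructs a timedelta from that value. A small sketch of the round trip, with plain dicts standing in for request.meta and the pickled meta file:

from datetime import timedelta

request_meta = {"cache_expires": timedelta(hours=6)}  # set by a spider
stored = {"cache_expires": request_meta["cache_expires"].total_seconds()}  # persisted by the extension
entry_expires = timedelta(seconds=stored["cache_expires"])  # read back in cleanup_cache()
assert entry_expires == request_meta["cache_expires"]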
8 changes: 5 additions & 3 deletions feeds/spiders/derstandard_at.py
@@ -1,3 +1,5 @@
from datetime import timedelta

import scrapy

from feeds.loaders import FeedEntryItemLoader
@@ -16,7 +18,7 @@ class DerStandardAtSpider(FeedsXMLFeedSpider):
_logo = "https://at.staticfiles.at/sites/mainweb/img/icons/dst/dst-228.png"
_titles = {}
# Some ressorts have articles that are regularly updated, e.g. cartoons.
_ressorts_uncached = ["47"]
_cache_expires = {"47": timedelta(minutes=60)}
_max_articles = 10
_ressorts_num_articles = {}

@@ -58,14 +60,14 @@ def parse_node(self, response, node):
self._ressorts_num_articles[response.meta["ressort"]] = num_articles + 1

updated = node.xpath("pubDate/text()").extract_first()
dont_cache = response.meta["ressort"] in self._ressorts_uncached
cache_expires = self._cache_expires.get(response.meta["ressort"])
yield scrapy.Request(
url,
self._parse_article,
meta={
"updated": updated,
"ressort": response.meta["ressort"],
"dont_cache": dont_cache,
"cache_expires": cache_expires,
},
# Cookie handling is disabled, so we have to send this as a header.
headers={"Cookie": "DSGVO_ZUSAGE_V1=true"},
2 changes: 1 addition & 1 deletion feeds/spiders/falter_at.py
@@ -39,7 +39,7 @@ def start_requests(self):
},
meta={
"dont_redirect": True,
"dont_cache": True,
"cache_expires": timedelta(days=1),
"handle_httpstatus_list": [302],
},
callback=self.request_archive,
3 changes: 3 additions & 0 deletions feeds/spiders/lwn_net.py
@@ -1,4 +1,5 @@
import re
from datetime import timedelta

import scrapy
from dateutil.parser import parse as dateutil_parse
@@ -90,6 +91,8 @@ def start_requests(self):
"submit": "Log+in",
},
callback=self._after_login,
# Session cookie is valid for a month. 14 days is a good compromise.
meta={"cache_expires": timedelta(days=14)},
)
else:
# Username, password or section not found in feeds.cfg.
5 changes: 4 additions & 1 deletion feeds/spiders/uebermedien_de.py
@@ -1,5 +1,6 @@
import json
from collections import OrderedDict
from datetime import timedelta
from urllib.parse import parse_qs, urlparse

import scrapy
@@ -29,6 +30,7 @@ def start_requests(self):
+ "redirect_uri=https://uebermedien.de&scope=read&"
+ "response_type=code&refresh_only=false",
callback=self._steady_login,
meta={"cache_expires": timedelta(days=1)},
)
else:
self.logger.info("Login failed: No username or password given")
@@ -41,7 +43,7 @@ def _steady_login(self, response):
formdata={"user[email]": self._username, "user[password]": self._password},
callback=self._request_steady_token,
dont_filter=True,
meta={"handle_httpstatus_list": [301]},
meta={"handle_httpstatus_list": [301], "cache_expires": timedelta(days=1)},
)

def _request_steady_token(self, response):
@@ -65,6 +67,7 @@ def _request_steady_token(self, response):
body=json.dumps(body),
headers={"Accept": "application/json", "Content-Type": "application/json"},
callback=self._set_steady_token,
meta={"cache_expires": timedelta(days=1)},
)

def _set_steady_token(self, response):
