Merge pull request #143 from Lukas0907/cache-expiration
Allow individual cache expiration for requests
Lukas0907 committed Aug 7, 2018
2 parents f9ddcb4 + b69f045 commit f196905
Showing 8 changed files with 50 additions and 16 deletions.
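The gist of the change: a spider can now attach an individual "cache_expires" timedelta to request.meta, and the cache honours it in addition to the global FEEDS_CONFIG_CACHE_EXPIRES setting; the shorter of the two wins. A minimal sketch of the new usage, assuming a plain Scrapy spider (the spider name, URL and callback are placeholders; only the "cache_expires" meta key comes from this commit):

from datetime import timedelta

import scrapy


class ExampleSpider(scrapy.Spider):
    name = "example"

    def start_requests(self):
        # Cache entries for this request expire after one day, even if the
        # global FEEDS_CONFIG_CACHE_EXPIRES setting would keep them longer.
        yield scrapy.Request(
            "https://example.com/feed",
            meta={"cache_expires": timedelta(days=1)},
        )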
3 changes: 1 addition & 2 deletions docs/spiders/lwn.net.rst
@@ -5,8 +5,7 @@ lwn.net
Newest articles from LWN_ with special treatment of LWN_ Weekly Editions.
Please note that LWN_ requires the cache to be enabled to minimize useless
requests. In case you provide username and password, the session (cookie) is
also cached until the cache entry expires. The session cookie is valid for a
month so to avoid disruptions, set the cache expiry time to less than that.
also cached until the cache entry expires.

Configuration
~~~~~~~~~~~~~
32 changes: 27 additions & 5 deletions feeds/cache.py
@@ -2,7 +2,7 @@
import os
import pickle
import shutil
from datetime import datetime
from datetime import datetime, timedelta, timezone

logger = logging.getLogger(__name__)

@@ -15,16 +15,38 @@ def read_meta(root):
return pickle.load(f)


def cleanup_cache(cache_dir, max_age):
""" Removes cache entries in path that are older than max_age. """
def cleanup_cache(cache_dir, expires):
"""Removes cache entries in path.
Entries are removed if one of the following conditions is true:
- Response has a certain status code (e.g. 404).
- The entry's individual expiration date is reached (compared to now).
- The entry's timestamp plus expires exceeds now.
"""

if expires < timedelta(0):
raise ValueError("expires must be a positive timedelta.")

logger.debug("Cleaning cache entries from {} ...".format(cache_dir))

now = datetime.now(timezone.utc)
for cache_entry_path, _dirs, files in os.walk(cache_dir, topdown=False):
if "pickled_meta" in files:
meta = read_meta(cache_entry_path)
timestamp = datetime.fromtimestamp(meta["timestamp"])
if timestamp < max_age:
logger.debug("Checking cache entry for URL {}".format(meta["response_url"]))
try:
entry_expires = timedelta(seconds=meta["cache_expires"])
except KeyError:
entry_expires = expires
entry_expires = min(entry_expires, expires)
threshold = (
datetime.fromtimestamp(meta["timestamp"], tz=timezone.utc)
+ entry_expires
)
logger.debug(
"Entry expires after {} at {}".format(entry_expires, threshold)
)
if now > threshold:
remove_cache_entry(cache_entry_path)
elif meta["status"] in IGNORE_HTTP_CODES:
remove_cache_entry(cache_entry_path, remove_parents=True)
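Pulled out of the loop above, the expiration decision works like this (a standalone sketch; the metadata dict stands in for a pickled cache entry, and the timestamp and seconds values are made up):

from datetime import datetime, timedelta, timezone

expires = timedelta(days=14)  # global expiry, e.g. FEEDS_CONFIG_CACHE_EXPIRES=14
meta = {"timestamp": 1533600000, "cache_expires": 3600.0}  # example pickled_meta values

# Per-entry expiry, capped by the global value.
entry_expires = min(timedelta(seconds=meta["cache_expires"]), expires)
# An entry is stale once its timestamp plus the effective expiry lies in the past.
threshold = datetime.fromtimestamp(meta["timestamp"], tz=timezone.utc) + entry_expires
stale = datetime.now(timezone.utc) > threshold  # True -> the entry would be removed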
8 changes: 4 additions & 4 deletions feeds/cli.py
@@ -1,6 +1,6 @@
import logging
import os
from datetime import datetime, timedelta
from datetime import timedelta

import click
from scrapy.crawler import CrawlerProcess
@@ -16,9 +16,9 @@

def run_cleanup_cache(settings):
days = settings.getint("FEEDS_CONFIG_CACHE_EXPIRES")
cleanup_cache(
data_path(settings.get("HTTPCACHE_DIR")), datetime.now() - timedelta(days=days)
)
if days <= 0:
raise ValueError("cache_expires must be >= 0.")
cleanup_cache(data_path(settings.get("HTTPCACHE_DIR")), timedelta(days=days))


def spiders_to_crawl(process, argument_spiders):
5 changes: 5 additions & 0 deletions feeds/extensions.py
@@ -39,6 +39,11 @@ def store_response(self, spider, request, response):
old_metadata["parents"] if old_metadata else []
)
)
if (
"cache_expires" in request.meta
and request.meta["cache_expires"] is not None
):
metadata["cache_expires"] = request.meta["cache_expires"].total_seconds()
# Write it back.
rpath = self._get_request_path(spider, request)
with self._open(os.path.join(rpath, "meta"), "wb") as f:
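The extension stores the timedelta as plain seconds via total_seconds(), and cleanup_cache() reconstructs a timedelta from that value. A small sketch of the round trip, with plain dicts standing in for request.meta and the pickled meta file:

from datetime import timedelta

request_meta = {"cache_expires": timedelta(hours=6)}  # set by a spider
stored = {"cache_expires": request_meta["cache_expires"].total_seconds()}  # persisted by the extension
entry_expires = timedelta(seconds=stored["cache_expires"])  # read back in cleanup_cache()
assert entry_expires == request_meta["cache_expires"]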
8 changes: 5 additions & 3 deletions feeds/spiders/derstandard_at.py
@@ -1,3 +1,5 @@
from datetime import timedelta

import scrapy

from feeds.loaders import FeedEntryItemLoader
@@ -16,7 +18,7 @@ class DerStandardAtSpider(FeedsXMLFeedSpider):
_logo = "https://at.staticfiles.at/sites/mainweb/img/icons/dst/dst-228.png"
_titles = {}
# Some ressorts have articles that are regularly updated, e.g. cartoons.
_ressorts_uncached = ["47"]
_cache_expires = {"47": timedelta(minutes=60)}
_max_articles = 10
_ressorts_num_articles = {}

@@ -58,14 +60,14 @@ def parse_node(self, response, node):
self._ressorts_num_articles[response.meta["ressort"]] = num_articles + 1

updated = node.xpath("pubDate/text()").extract_first()
dont_cache = response.meta["ressort"] in self._ressorts_uncached
cache_expires = self._cache_expires.get(response.meta["ressort"])
yield scrapy.Request(
url,
self._parse_article,
meta={
"updated": updated,
"ressort": response.meta["ressort"],
"dont_cache": dont_cache,
"cache_expires": cache_expires,
},
# Cookie handling is disabled, so we have to send this as a header.
headers={"Cookie": "DSGVO_ZUSAGE_V1=true"},
2 changes: 1 addition & 1 deletion feeds/spiders/falter_at.py
@@ -39,7 +39,7 @@ def start_requests(self):
},
meta={
"dont_redirect": True,
"dont_cache": True,
"cache_expires": timedelta(days=1),
"handle_httpstatus_list": [302],
},
callback=self.request_archive,
3 changes: 3 additions & 0 deletions feeds/spiders/lwn_net.py
@@ -1,4 +1,5 @@
import re
from datetime import timedelta

import scrapy
from dateutil.parser import parse as dateutil_parse
@@ -90,6 +91,8 @@ def start_requests(self):
"submit": "Log+in",
},
callback=self._after_login,
# Session cookie is valid for a month. 14 days is a good compromise.
meta={"cache_expires": timedelta(days=14)},
)
else:
# Username, password or section not found in feeds.cfg.
5 changes: 4 additions & 1 deletion feeds/spiders/uebermedien_de.py
@@ -1,5 +1,6 @@
import json
from collections import OrderedDict
from datetime import timedelta
from urllib.parse import parse_qs, urlparse

import scrapy
@@ -29,6 +30,7 @@ def start_requests(self):
+ "redirect_uri=https://uebermedien.de&scope=read&"
+ "response_type=code&refresh_only=false",
callback=self._steady_login,
meta={"cache_expires": timedelta(days=1)},
)
else:
self.logger.info("Login failed: No username or password given")
@@ -41,7 +43,7 @@ def _steady_login(self, response):
formdata={"user[email]": self._username, "user[password]": self._password},
callback=self._request_steady_token,
dont_filter=True,
meta={"handle_httpstatus_list": [301]},
meta={"handle_httpstatus_list": [301], "cache_expires": timedelta(days=1)},
)

def _request_steady_token(self, response):
@@ -65,6 +67,7 @@ def _request_steady_token(self, response):
body=json.dumps(body),
headers={"Accept": "application/json", "Content-Type": "application/json"},
callback=self._set_steady_token,
meta={"cache_expires": timedelta(days=1)},
)

def _set_steady_token(self, response):
